Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for SARIF output for secret scanning #920

Merged
merged 6 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog.d/20240624_175518_aurelien.gateau_sarif.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Added

- `ggshield secret scan` commands can now output results in [SARIF format](https://sarifweb.azurewebsites.net/), using the new `--format sarif` option (#869).
4 changes: 1 addition & 3 deletions ggshield/cmd/secret/scan/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@


@click.command()
@click.argument(
"paths", nargs=-1, type=RealPath(exists=True, resolve_path=True), required=True
)
@click.argument("paths", nargs=-1, type=RealPath(exists=True), required=True)
@click.option("--recursive", "-r", is_flag=True, help="Scan directory recursively.")
@click.option("--yes", "-y", is_flag=True, help="Confirm recursive scan.")
@click.option(
Expand Down
15 changes: 12 additions & 3 deletions ggshield/cmd/secret/scan/secret_scan_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,17 @@
exit_zero_option,
get_config_from_context,
json_option,
text_json_sarif_format_option,
)
from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.cmd.utils.output_format import OutputFormat
from ggshield.core.config.user_config import SecretConfig
from ggshield.core.filter import init_exclusion_regexes
from ggshield.utils.click import RealPath
from ggshield.verticals.secret.output import (
SecretJSONOutputHandler,
SecretOutputHandler,
SecretSARIFOutputHandler,
SecretTextOutputHandler,
)

Expand Down Expand Up @@ -122,6 +125,7 @@ def add_secret_scan_common_options() -> Callable[[AnyFunction], AnyFunction]:
def decorator(cmd: AnyFunction) -> AnyFunction:
add_common_options()(cmd)
json_option(cmd)
text_json_sarif_format_option(cmd)
_output_option(cmd)
_show_secrets_option(cmd)
exit_zero_option(cmd)
Expand All @@ -133,13 +137,18 @@ def decorator(cmd: AnyFunction) -> AnyFunction:
return decorator


# Lookup table used to pick the output handler class matching the requested
# output format.
OUTPUT_HANDLER_CLASSES = {
    OutputFormat.TEXT: SecretTextOutputHandler,
    OutputFormat.JSON: SecretJSONOutputHandler,
    OutputFormat.SARIF: SecretSARIFOutputHandler,
}


def create_output_handler(ctx: click.Context) -> SecretOutputHandler:
"""Read objects defined in ctx.obj and create the appropriate OutputHandler
instance"""
ctx_obj = ContextObj.get(ctx)
output_handler_cls = (
SecretJSONOutputHandler if ctx_obj.use_json else SecretTextOutputHandler
)
output_handler_cls = OUTPUT_HANDLER_CLASSES[ctx_obj.output_format]
config = ctx_obj.config
return output_handler_cls(
show_secrets=config.user_config.secret.show_secrets,
Expand Down
44 changes: 42 additions & 2 deletions ggshield/cmd/utils/common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
"""

from pathlib import Path
from typing import Any, Callable, Optional, TypeVar
from typing import Any, Callable, List, Optional, TypeVar

import click

from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.cmd.utils.debug_logs import setup_debug_logs
from ggshield.cmd.utils.output_format import OutputFormat
from ggshield.core.config.user_config import UserConfig


Expand Down Expand Up @@ -187,13 +188,52 @@ def decorator(cmd: AnyFunction) -> AnyFunction:
return decorator


def _set_json_output_format(
    ctx: click.Context, param: click.Parameter, value: Optional[bool]
) -> Optional[bool]:
    """Click callback for the `--json` flag.

    When the flag is set, record JSON as the output format on the context object.
    The received value is always returned unchanged.
    """
    if not value:
        # Flag not set: leave the current output format untouched
        return value

    ContextObj.get(ctx).output_format = OutputFormat.JSON
    return value


json_option = click.option(
"--json",
"json_output",
is_flag=True,
default=None,
help="Use JSON output.",
callback=create_ctx_callback("use_json"),
callback=_set_json_output_format,
)


def _set_output_format(
fnareoh marked this conversation as resolved.
Show resolved Hide resolved
ctx: click.Context, param: click.Parameter, value: Optional[str]
) -> Optional[str]:
if value:
ctx_obj = ContextObj.get(ctx)
ctx_obj.output_format = OutputFormat(value)
return value


def _create_format_option(
    formats: List[OutputFormat],
) -> Callable[[click.decorators.FC], click.decorators.FC]:
    """Build a `--format` click option restricted to the given output formats.

    The accepted choices are the `value` of each member of `formats`. The selected
    value is passed to the `_set_output_format` callback.
    """
    allowed_values = [output_format.value for output_format in formats]
    format_option = click.option(
        "--format",
        type=click.Choice(allowed_values),
        help="Output format.",
        callback=_set_output_format,
    )
    return format_option


# `--format` option for commands which can only produce text or JSON output
text_json_format_option = _create_format_option(
    [OutputFormat.TEXT, OutputFormat.JSON],
)


# `--format` option for commands which can produce text, JSON or SARIF output
text_json_sarif_format_option = _create_format_option(
    [OutputFormat.TEXT, OutputFormat.JSON, OutputFormat.SARIF],
)
fnareoh marked this conversation as resolved.
Show resolved Hide resolved


Expand Down
8 changes: 6 additions & 2 deletions ggshield/cmd/utils/context_obj.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import click
from pygitguardian import GGClient

from ggshield.cmd.utils.output_format import OutputFormat
from ggshield.core.cache import Cache
from ggshield.core.config import Config
from ggshield.core.ui.ggshield_ui import GGShieldUI
Expand Down Expand Up @@ -42,12 +43,15 @@ def __init__(self):
# Set to false by the --no-check-for-updates option
self.check_for_updates = True

# Set by the --json option
self.use_json = False
self.output_format = OutputFormat.TEXT

# Set by the --output option
self.output: Optional[Path] = None

@property
def use_json(self) -> bool:
    """True when the selected output format is JSON."""
    is_json = self.output_format == OutputFormat.JSON
    return is_json

@property
def config(self) -> Config:
assert self._config
Expand Down
9 changes: 9 additions & 0 deletions ggshield/cmd/utils/output_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from enum import Enum


class OutputFormat(Enum):
    """Output formats which commands can produce.

    Each member's value is the lowercase name of the format.
    """

    # Plain text
    TEXT = "text"
    # JSON document
    JSON = "json"
    # SARIF document
    SARIF = "sarif"
3 changes: 2 additions & 1 deletion ggshield/core/scan/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,13 @@ def parser(commit: "Commit") -> Iterable[Scannable]:
def from_patch(
patch: str,
exclusion_regexes: Optional[Set[Pattern[str]]] = None,
sha: str = PATCH_PREFIX,
) -> "Commit":
"""This one is for tests"""
info = CommitInformation.from_patch_header(patch)

def parser(commit: "Commit") -> Iterable[Scannable]:
yield from parse_patch(PATCH_PREFIX, patch, exclusion_regexes)
yield from parse_patch(sha, patch, exclusion_regexes)

return Commit(sha=None, patch_parser=parser, info=info)

Expand Down
3 changes: 2 additions & 1 deletion ggshield/core/scan/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
UnexpectedDirectoryError,
get_filepaths,
is_path_binary,
url_for_path,
)

from .scannable import Scannable
Expand All @@ -22,7 +23,7 @@ def __init__(self, path: Union[str, Path]):

@property
def url(self) -> str:
return f"file://{self._path.absolute().as_posix()}"
return url_for_path(self._path)

@property
def filename(self) -> str:
Expand Down
16 changes: 15 additions & 1 deletion ggshield/utils/files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from enum import Enum, auto
from pathlib import Path, PurePosixPath
from pathlib import Path, PurePath, PurePosixPath
from typing import List, Pattern, Set, Union
from urllib.parse import quote

from ggshield.utils._binary_extensions import BINARY_EXTENSIONS
from ggshield.utils.git_shell import (
Expand Down Expand Up @@ -92,3 +93,16 @@ def is_path_binary(path: Union[str, Path]) -> bool:
ext = Path(path).suffix
# `[1:]` because `ext` starts with a "." but extensions in `BINARY_EXTENSIONS` do not
return ext[1:] in BINARY_EXTENSIONS


def url_for_path(path: PurePath) -> str:
    """Return a URL representation of `path`.

    An absolute path becomes a `file://` URL. A relative path is returned as its
    percent-encoded POSIX form, without any scheme.
    """
    posix_path = path.as_posix()

    if path.is_absolute():
        # Keep ':' unescaped: it is needed to represent the Windows drive in a URL.
        quoted = quote(posix_path, safe="/:")
        # A Windows path looks like "c:/foo/bar": it has no leading slash, so one
        # must be added after the scheme.
        scheme_prefix = "file://" if quoted[0] == "/" else "file:///"
        return f"{scheme_prefix}{quoted}"

    return quote(posix_path)
2 changes: 2 additions & 0 deletions ggshield/verticals/secret/output/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from .secret_gitlab_webui_output_handler import SecretGitLabWebUIOutputHandler
from .secret_json_output_handler import SecretJSONOutputHandler
from .secret_output_handler import SecretOutputHandler
from .secret_sarif_output_handler import SecretSARIFOutputHandler
from .secret_text_output_handler import SecretTextOutputHandler


__all__ = [
"SecretOutputHandler",
"SecretJSONOutputHandler",
"SecretSARIFOutputHandler",
"SecretTextOutputHandler",
"SecretGitLabWebUIOutputHandler",
]
139 changes: 139 additions & 0 deletions ggshield/verticals/secret/output/secret_sarif_output_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import json
from typing import Any, Dict, Iterable, List, cast

from pygitguardian.client import VERSIONS
from pygitguardian.models import PolicyBreak

from ggshield import __version__ as ggshield_version
from ggshield.core.filter import get_ignore_sha
from ggshield.core.match_span import MatchSpan

from ..extended_match import ExtendedMatch
from ..secret_scan_collection import Result, SecretScanCollection
from .secret_output_handler import SecretOutputHandler


SCHEMA_URL = "https://docs.oasis-open.org/sarif/sarif/v2.1.0/errata01/os/schemas/sarif-schema-2.1.0.json"


class SecretSARIFOutputHandler(SecretOutputHandler):
    """Output handler which serializes secret scan results as a SARIF 2.1.0 document."""

    def _process_scan_impl(self, scan: SecretScanCollection) -> str:
        """Return the JSON text of a SARIF document containing a single run.

        The run describes ggshield as the tool driver, the secrets engine as a tool
        extension, and holds one SARIF result per policy break found in `scan`.
        """
        driver = {
            "organization": "GitGuardian",
            "name": "ggshield",
            "informationUri": "https://github.com/GitGuardian/ggshield",
            "version": ggshield_version,
        }
        secret_extension = {
            "name": "secret",
            "version": VERSIONS.secrets_engine_version,
        }
        run = {
            "tool": {
                "driver": driver,
                "extensions": [secret_extension],
            },
            "results": list(_create_sarif_results(scan.get_all_results())),
        }
        sarif_document = {
            "version": "2.1.0",
            "$schema": SCHEMA_URL,
            "runs": [run],
        }
        return json.dumps(sarif_document)


def _create_sarif_results(results: Iterable[Result]) -> Iterable[Dict[str, Any]]:
    """
    Lazily yields the SARIF result dicts for our Result instances: one SARIF result
    dict for each policy break of each Result.
    """
    yield from (
        _create_sarif_result_dict(result.url, policy_break)
        for result in results
        for policy_break in result.scan.policy_breaks
    )


def _create_sarif_result_dict(
    url: str,
    policy_break: PolicyBreak,
) -> Dict[str, Any]:
    """Create the SARIF result dict describing `policy_break`, found in `url`.

    The result has a single location spanning all the matches of the policy break,
    plus one related location per match. The incident URL, when there is one, is
    stored in `hostedViewerUri`.
    """
    matches = cast(List[ExtendedMatch], policy_break.matches)
    break_type = policy_break.break_type

    # Build one link per match. The link target is the index of the match, which is
    # also the id of its entry in "relatedLocations".
    links = [f"[{match.match_type}]({index})" for index, match in enumerate(matches)]
    inline_links = ", ".join(links)
    bullet_links = "\n".join(f"- {link}" for link in links)

    sarif_result: Dict[str, Any] = {
        "ruleId": break_type,
        "level": "error",
        "message": {
            "text": f"Secret detected: {break_type}.\nMatches: {inline_links}",
            "markdown": f"Secret detected: {break_type}\nMatches:\n{bullet_links}",
        },
        "locations": [
            _create_location_dict(url, [match.span for match in matches]),
        ],
        "relatedLocations": [
            _create_related_location_dict(url, index, match)
            for index, match in enumerate(matches)
        ],
        "partialFingerprints": {
            "secret/v1": get_ignore_sha(policy_break),
        },
    }

    incident_url = policy_break.incident_url
    if incident_url:
        sarif_result["hostedViewerUri"] = incident_url
    return sarif_result


def _create_location_dict(
    url: str,
    match_spans: List[MatchSpan],
) -> Dict[str, Any]:
    """Create a SARIF location dict covering all of `match_spans` in `url`.

    The covered span goes from the earliest (line, column) start position to the
    latest (line, column) end position found among `match_spans`.
    """
    first_line, first_column = min(
        (span.line_index_start, span.column_index_start) for span in match_spans
    )
    last_line, last_column = max(
        (span.line_index_end, span.column_index_end) for span in match_spans
    )
    enclosing_span = MatchSpan(
        line_index_start=first_line,
        line_index_end=last_line,
        column_index_start=first_column,
        column_index_end=last_column,
    )

    physical_location = _create_physical_location_dict(url, enclosing_span)
    return {"physicalLocation": physical_location}


def _create_related_location_dict(
    url: str,
    id: int,
    match: ExtendedMatch,
) -> Dict[str, Any]:
    """Create the SARIF related-location dict for `match`, found in `url`.

    `id` is stored as the id of the related location, and the match type is used as
    its message.
    """
    physical_location = _create_physical_location_dict(url, match.span)
    label = {"text": match.match_type}
    return {
        "id": id,
        "physicalLocation": physical_location,
        "message": label,
    }


def _create_physical_location_dict(url: str, match_span: MatchSpan) -> Dict[str, Any]:
    """Create the SARIF physical-location dict for `match_span` in `url`.

    Every line and column index of `match_span` is incremented by one when written
    to the region.
    """
    region = {
        "startLine": match_span.line_index_start + 1,
        "startColumn": match_span.column_index_start + 1,
        "endLine": match_span.line_index_end + 1,
        "endColumn": match_span.column_index_end + 1,
    }
    artifact_location = {"uri": url}
    return {"artifactLocation": artifact_location, "region": region}
2 changes: 1 addition & 1 deletion tests/unit/core/scan/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_file_repr():
"""
if is_windows():
str_path = r"c:\Windows"
expected_url = "file://c:/Windows"
expected_url = "file:///c:/Windows"
else:
str_path = "/usr"
expected_url = "file:///usr"
Expand Down
Loading
Loading