Skip to content

Commit

Permalink
fix: fix git commit parser sometimes returning wrong diffs
Browse files Browse the repository at this point in the history
Problem 1: sometimes the path for a diff is wrong. This is because
`commit_utils.parse_patch()` assumes that the diffs are in the same order
as the filenames in the commit header, which turns out to not always be
true.

Solution: get the path name from the `+++ b/path` or `--- a/path` lines of
the diff header.

Problem 2: turning merge commits into individual commits by calling
`git show` with the `-m` option does not work when fetching only some files
inside the commit (which we do when scanning huge commits): it always
return the changes from the first parent.

Solution: maybe it would be possible to return the changes from the correct
parents, but it felt simpler to stop using the `-m` option.
As a bonus, it simplifies `parse_patch()` because it can now assume that
there is only one commit per patch.
  • Loading branch information
agateau-gg committed Sep 20, 2024
1 parent 807c2ba commit f7ae8e1
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 47 deletions.
3 changes: 3 additions & 0 deletions changelog.d/20240910_170441_aurelien.gateau_fix_git_parser.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Fixed

- The git commit parser have been reworked, fixing cases where commands scanning commits would fail.
2 changes: 1 addition & 1 deletion ggshield/core/scan/commit_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def from_patch_header(header_str: str) -> "CommitInformation":
Output format looks like this:
```
commit: $SHA
commit $SHA
Author: $NAME <$EMAIL>
Date: $DATE
Expand Down
144 changes: 104 additions & 40 deletions ggshield/core/scan/commit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
HEADER_COMMON_ARGS = [
"--raw", # shows a header with the files touched by the commit
"-z", # separate file names in the raw header with \0
"-m", # split multi-parent (aka merge) commits into several one-parent commits
]

# Command line arguments passed to `git show` and `git diff` to get parsable patches
Expand All @@ -34,6 +33,12 @@

_RX_HEADER_FILE_LINE_SEPARATOR = re.compile("[\n\0]:", re.MULTILINE)

# Match the path in a "---a/file_path" or a "+++ b/file_path".
# Note that for some reason, git sometimes append an \t at the end (happens with the
# "I'm unusual!" file in the test suite). We ignore it.
OLD_NAME_RX = re.compile(r"^--- a/(.*?)\t?$", flags=re.MULTILINE)
NEW_NAME_RX = re.compile(r"^\+\+\+ b/(.*?)\t?$", flags=re.MULTILINE)


class PatchParseError(Exception):
"""
Expand Down Expand Up @@ -192,49 +197,108 @@ def parse_patch(
) -> Iterable[Scannable]:
"""
Parse a patch generated with `git show` or `git diff` using PATCH_COMMON_ARGS.
If the patch represents a merge commit, then `patch` actually contains multiple
commits, one per parent, because we call `git show` with the `-m` option to force it
to generate one single-parent commit per parent. This makes later code simpler and
ensures we see *all* the changes.
Returns a list of Scannable.
A patch looks like this:
```
commit $SHA
Author: $NAME <$EMAIL>
Date: $DATE
$SUBJECT
$BODY1
$BODY2
...
$AFFECTED_FILE_LINE\0$DIFF1
$DIFF2
...
```
For a non-merge commit, $DIFFn looks like this:
```
diff --git $A_NAME $B_NAME
$META_INFO
$META_INFO
...
--- $A_NAME
+++ $A_NAME
@@ $FROM $TO @@
$ACTUAL_CHANGES
@@ $FROM $TO @@
$MORE_CHANGES
```
$A_NAME and $B_NAME may be /dev/null in case of creation or removal. When they are
not, they start with "a/" and "b/" respectively.
For a 2-parent merge commit with resolved conflicts, $DIFFn looks like this:
```
diff --cc $NAME
$META_INFO
$META_INFO
...
--- $A_NAME
+++ $B_NAME
@@@ $FROM1 $FROM2 $TO @@@
$ACTUAL_CHANGES
```
Note that:
- The diff line only contains one name, without any "a/" or "b/" prefixes.
- The hunk starts with 3 "@" instead of 2. For a commit with N parents, there are
actually N+1 "@" characters.
"""
if exclusion_regexes is None:
exclusion_regexes = set()

for commit in patch.split("\0commit "):
tokens = commit.split("\0diff ", 1)
if len(tokens) == 1:
# No diff, carry on to next commit
continue
header_str, rest = tokens

try:
header = PatchHeader.from_string(header_str)

diffs = re.split(r"^diff ", rest, flags=re.MULTILINE)
for file_info, diff in zip(header.files, diffs):
if is_path_excluded(file_info.path, exclusion_regexes):
continue

# extract document from diff: we must skip diff extended headers
# (lines like "old mode 100644", "--- a/foo", "+++ b/foo"...)
try:
end_of_headers = diff.index("\n@@")
except ValueError:
# No content
continue
# +1 because we searched for the '\n'
content = diff[end_of_headers + 1 :]

yield CommitScannable(
sha, file_info.path, content, filemode=file_info.mode
)
except Exception as exc:
if sha:
msg = f"Could not parse patch (sha: {sha}): {exc}"
else:
msg = f"Could not parse patch: {exc}"
raise PatchParseError(msg)
tokens = patch.split("\0diff ", 1)
if len(tokens) == 1:
# No diff, we are done
return
header_str, rest = tokens

try:
header = PatchHeader.from_string(header_str)

diffs = re.split(r"^diff ", rest, flags=re.MULTILINE)
for diff in diffs:
# Split diff into header and content
try:
# + 1 because we match the "\n" in "\n@@"
content_start = diff.index("\n@@") + 1
except ValueError:
# No content
continue
diff_header = diff[:content_start]
content = diff[content_start:]

# Find diff path in diff header
match = NEW_NAME_RX.search(diff_header)
if not match:
# Must have been deleted. find the old path in this case
match = OLD_NAME_RX.search(diff_header)
if not match:
raise PatchParseError(

Check warning on line 286 in ggshield/core/scan/commit_utils.py

View check run for this annotation

Codecov / codecov/patch

ggshield/core/scan/commit_utils.py#L286

Added line #L286 was not covered by tests
f"Could not find old path in {repr(diff_header)}"
)
path = Path(match.group(1))
if is_path_excluded(path, exclusion_regexes):
continue

file_info = next(x for x in header.files if x.path == path)

yield CommitScannable(sha, file_info.path, content, filemode=file_info.mode)
except Exception as exc:
if sha:
msg = f"Could not parse patch (sha: {sha}): {exc}"

Check warning on line 298 in ggshield/core/scan/commit_utils.py

View check run for this annotation

Codecov / codecov/patch

ggshield/core/scan/commit_utils.py#L297-L298

Added lines #L297 - L298 were not covered by tests
else:
msg = f"Could not parse patch: {exc}"
raise PatchParseError(msg)

Check warning on line 301 in ggshield/core/scan/commit_utils.py

View check run for this annotation

Codecov / codecov/patch

ggshield/core/scan/commit_utils.py#L300-L301

Added lines #L300 - L301 were not covered by tests


def get_file_sha_in_ref(
Expand Down
58 changes: 52 additions & 6 deletions tests/unit/core/scan/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,23 +343,19 @@ def scenario_type_change(repo: Repository) -> None:
),
(
scenario_merge,
[
("longfile", Filemode.MODIFY),
("longfile", Filemode.MODIFY),
("longfile", Filemode.MODIFY),
],
[], # no conflict -> nothing to scan
),
(
scenario_merge_with_changes,
[
("conflicted", Filemode.MODIFY),
("conflicted", Filemode.MODIFY),
],
),
(
scenario_type_change,
[
("f2", Filemode.NEW),
("f2", Filemode.NEW),
],
),
]
Expand Down Expand Up @@ -392,6 +388,56 @@ def test_from_sha(
]


def test_from_sha_gets_right_content_for_conflicts(tmp_path):
"""
GIVEN a merge commit with a conflict, loaded with Commit.from_sha()
WHEN Commit.get_files() is called
THEN it returns the right content
"""
repo = Repository.create(tmp_path)
scenario_merge_with_changes(repo)

sha = repo.get_top_sha()
commit = Commit.from_sha(sha, cwd=tmp_path)

files = list(commit.get_files())
assert len(files) == 1
content = files[0].content

# Content contains the old line,
assert "--Hello" in content
# the new line from main,
assert "- Hello from main" in content
# the new line from b1,
assert " -Hello from b1" in content
# and the result of the conflict resolution
assert "++Solve conflict" in content


def test_from_sha_files_matches_content(tmp_path):
"""
GIVEN a commit with many files
WHEN Commit.get_files() is called
THEN the reported file names match their expected content
"""
repo = Repository.create(tmp_path)

for idx in range(50):
path = tmp_path / str(idx)
path.parent.mkdir(exist_ok=True)
path.write_text(f"{idx}\n")
repo.add(path)
repo.create_commit()

sha = repo.get_top_sha()
commit = Commit.from_sha(sha, cwd=tmp_path)
files = list(commit.get_files())

for file in files:
last_line = file.content.splitlines()[-1]
assert last_line == f"+{file.path.name}"


def test_from_staged(tmp_path):
"""
GIVEN a new file added with `git add`
Expand Down

0 comments on commit f7ae8e1

Please sign in to comment.