Skip to content

Commit

Permalink
feat: add exclude-entropy-patterns (#192)
Browse files Browse the repository at this point in the history
* feat: add exclude-entropy-patterns

Add the ability to exclude entropy patterns. This allows strings flagged by entropy checks to be excluded using regular expressions, either for all files or only for specific files.

* fix: address code review feedback

* fix: make sure we test for delimiter in pattern

* Make Rules hashable so we can lru_cache them

* Missing return annotation & extra hash safeguard

Co-authored-by: Joey Wilhelm <tarkatronic@gmail.com>
  • Loading branch information
dclayton-godaddy and tarkatronic authored Jun 15, 2021
1 parent 21e5e1d commit 9f615d5
Show file tree
Hide file tree
Showing 8 changed files with 260 additions and 8 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ Options:
excluded unless effectively excluded via the
--include-path-patterns option.

-xe, --exclude-entropy-patterns TEXT
Specify a regular expression which matches
entropy strings to exclude from the scan.
This option can be specified multiple times
to exclude multiple patterns. If not
provided (default), no entropy strings will
be excluded ({path regex}::{pattern regex}).

-e, --exclude-signatures TEXT Specify signatures of matches that you
explicitly want to exclude from the scan,
and mark as okay. These signatures are
Expand Down
22 changes: 22 additions & 0 deletions docs/source/features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,28 @@ thing, there needs to be a way to tell ``tartufo`` to ignore those things, and
not report them out as issues. For this reason, we provide multiple methods for
excluding these items.

Entropy Limiting
++++++++++++++++

Entropy scans can produce a high number of false positives such as git SHAs or md5
digests. To avoid these false positives, enable ``exclude-entropy-patterns``. Exclusions
apply to any strings flagged by entropy checks.

For example, if ``docs/README.md`` contains a git SHA, this would be flagged by entropy.
To exclude this, add ``docs/.*\.md$::^[a-zA-Z0-9]{40}$`` to ``exclude-entropy-patterns``.

.. code-block:: sh

   > tartufo ... --exclude-entropy-patterns "docs/.*\.md$::^[a-zA-Z0-9]{40}$"

.. code-block:: toml

   [tool.tartufo]
   exclude-entropy-patterns = [
       # format: "{file regex}::{entropy pattern}"
       # Use single-quoted (literal) TOML strings so regex backslashes such as \. are kept as-is
       'docs/.*\.md$::^[a-zA-Z0-9]{40}$',  # exclude all git SHAs in the docs directory
   ]

Limiting by Signature
+++++++++++++++++++++

Expand Down
9 changes: 9 additions & 0 deletions tartufo/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,15 @@ def get_command(self, ctx: click.Context, cmd_name: str) -> Optional[click.Comma
are excluded unless effectively excluded via the --include-path-patterns
option.""",
)
@click.option(
"-xe",
"--exclude-entropy-patterns",
multiple=True,
help="""Specify a regular expression which matches entropy strings to
exclude from the scan. This option can be specified multiple times to
exclude multiple patterns. If not provided (default), no entropy strings
will be excluded ({path regex}::{pattern regex}).""",
)
@click.option(
"-e",
"--exclude-signatures",
Expand Down
30 changes: 30 additions & 0 deletions tartufo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,33 @@ def compile_path_rules(patterns: Iterable[str]) -> List[Pattern]:
for pattern in stripped
if pattern and not pattern.startswith("#")
]


def compile_rule(pattern: str) -> Rule:
    """
    Build a Rule from a ``{path_pattern}::{pattern}`` specification.

    Only the first ``::`` acts as the delimiter, so the string pattern itself
    may contain ``::``. When no delimiter is present, the whole input is used
    as the string pattern and the path pattern defaults to ``.*``.

    :param pattern: Rule specification, optionally prefixed with a path regex
    :return: Rule with no name, the compiled pattern and compiled path_pattern
    """
    path_regex, delimiter, string_regex = pattern.partition("::")
    if not delimiter:
        # No "::" present: partition left the entire input in the first slot.
        path_regex, string_regex = ".*", pattern
    return Rule(
        name=None,
        pattern=re.compile(string_regex),
        path_pattern=re.compile(path_regex),
    )


def compile_rules(patterns: Iterable[str]) -> List[Rule]:
    """Compile rule specifications into Rule objects.

    Each entry is stripped of surrounding whitespace first. Entries that are
    empty after stripping, or that start with `#`, are skipped.

    :param patterns: The rule specifications to be compiled
    :return: Compiled Rule objects, in the order the inputs were given
    """
    rules = []
    for raw in patterns:
        spec = raw.strip()
        if not spec or spec.startswith("#"):
            continue
        rules.append(compile_rule(spec))
    return rules
87 changes: 79 additions & 8 deletions tartufo/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ class ScannerBase(abc.ABC):
_issues: Optional[List[Issue]] = None
_included_paths: Optional[List[Pattern]] = None
_excluded_paths: Optional[List[Pattern]] = None
_excluded_entropy: Optional[List[Rule]] = None
_rules_regexes: Optional[Dict[str, Rule]] = None
global_options: types.GlobalOptions
logger: logging.Logger
Expand Down Expand Up @@ -162,6 +163,23 @@ def included_paths(self) -> List[Pattern]:
)
return self._included_paths

@property
def excluded_entropy(self) -> List[Rule]:
    """Get the rules used to exclude strings flagged by entropy checks.

    The rules are compiled from ``global_options.exclude_entropy_patterns``
    on first access and cached on the instance afterwards.

    :return: Compiled exclusion rules; empty when no patterns are configured
    """
    if self._excluded_entropy is None:
        self.logger.info("Initializing excluded entropy patterns")
        configured = self.global_options.exclude_entropy_patterns or ()
        # dict.fromkeys de-duplicates while keeping the configured order, so
        # the compiled rules (and the debug log below) are deterministic.
        patterns = list(dict.fromkeys(configured))
        self._excluded_entropy = (
            config.compile_rules(patterns) if patterns else []
        )
        self.logger.debug(
            "Excluded entropy was initialized as: %s", self._excluded_entropy
        )
    return self._excluded_entropy

@property
def excluded_paths(self) -> List[Pattern]:
"""Get a list of regexes used to match paths to exclude from the scan.
Expand Down Expand Up @@ -247,6 +265,36 @@ def signature_is_excluded(self, blob: str, file_path: str) -> bool:
in self.global_options.exclude_signatures
)

@staticmethod
@lru_cache(maxsize=None)
def rule_matches(rule: Rule, string: str, path: str) -> bool:
    """
    Check whether a string found at a path is covered by a rule.

    Both regexes are applied with ``match``, i.e. anchored at the start of
    the value being tested.

    :param rule: Rule supplying the pattern and the optional path_pattern
    :param string: Candidate string to test against the rule pattern
    :param path: File path to test against the rule path_pattern
    :return: True when the string matches and, if the rule has a
      path_pattern, the path matches as well; False otherwise.
    """
    # A rule without a pattern never matches anything.
    if not rule.pattern or rule.pattern.match(string) is None:
        return False
    # No path restriction: the string match alone decides.
    if not rule.path_pattern:
        return True
    return rule.path_pattern.match(path) is not None

def entropy_string_is_excluded(self, string: str, path: str) -> bool:
    """Find whether a string flagged by entropy checks has been excluded in configuration.

    :param string: String to check against each rule's pattern
    :param path: Path to check against each rule's path pattern
    :return: True if any excluded-entropy rule matches, False otherwise
    """
    # any() over an empty rule list is already False, so no separate
    # emptiness check is needed. Going through self (rather than naming the
    # base class) lets subclasses override rule_matches.
    return any(
        self.rule_matches(rule, string, path) for rule in self.excluded_entropy
    )

@lru_cache(maxsize=None)
def calculate_entropy(self, data: str, char_set: str) -> float:
"""Calculate the Shannon entropy for a piece of data.
Expand Down Expand Up @@ -311,18 +359,41 @@ def scan_entropy(self, chunk: types.Chunk) -> List[Issue]:
hex_strings = util.get_strings_of_set(word, HEX_CHARS)

for string in b64_strings:
if not self.signature_is_excluded(string, chunk.file_path):
b64_entropy = self.calculate_entropy(string, BASE64_CHARS)
if b64_entropy > 4.5:
issues.append(Issue(types.IssueType.Entropy, string, chunk))
issues += self.evaluate_entropy_string(
chunk, string, BASE64_CHARS, 4.5
)

for string in hex_strings:
if not self.signature_is_excluded(string, chunk.file_path):
hex_entropy = self.calculate_entropy(string, HEX_CHARS)
if hex_entropy > 3:
issues.append(Issue(types.IssueType.Entropy, string, chunk))
issues += self.evaluate_entropy_string(chunk, string, HEX_CHARS, 3)

return issues

def evaluate_entropy_string(
    self,
    chunk: types.Chunk,
    string: str,
    chars: str,
    min_entropy_score: float,
) -> List[Issue]:
    """
    Check entropy string using entropy characters and score.

    A string is flagged only when its signature is not excluded, its entropy
    score is strictly greater than ``min_entropy_score``, and it is not
    covered by an excluded-entropy rule.

    :param chunk: The chunk of data the string was found in
    :param string: String to check
    :param chars: Characters to calculate score
    :param min_entropy_score: Score the string must exceed to be flagged
    :return: A one-element list with the flagged issue, or an empty list
    """
    if self.signature_is_excluded(string, chunk.file_path):
        return []
    if self.calculate_entropy(string, chars) <= min_entropy_score:
        return []
    # The exclusion rules are consulted last, only for strings that would
    # otherwise be reported.
    if self.entropy_string_is_excluded(string, chunk.file_path):
        self.logger.debug("entropy string %s was excluded", string)
        return []
    return [Issue(types.IssueType.Entropy, string, chunk)]

def scan_regex(self, chunk: types.Chunk) -> List[Issue]:
"""Scan a chunk of data for matches against the configured regexes.
Expand Down
7 changes: 7 additions & 0 deletions tartufo/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class GlobalOptions:
"include_path_patterns",
"exclude_paths",
"exclude_path_patterns",
"exclude_entropy_patterns",
"exclude_signatures",
"output_dir",
"git_rules_repo",
Expand All @@ -35,6 +36,7 @@ class GlobalOptions:
include_path_patterns: Tuple[str, ...]
exclude_paths: Optional[TextIO]
exclude_path_patterns: Tuple[str, ...]
exclude_entropy_patterns: Tuple[str, ...]
exclude_signatures: Tuple[str, ...]
output_dir: Optional[str]
git_rules_repo: Optional[str]
Expand Down Expand Up @@ -75,6 +77,11 @@ class Rule:
pattern: Pattern
path_pattern: Optional[Pattern]

def __hash__(self) -> int:
    """Hash the rule by the source text of its regexes.

    The key is the pattern text, extended with ``::`` and the path pattern
    text when a path pattern is set.
    """
    key = self.pattern.pattern
    if self.path_pattern:
        key = f"{key}::{self.path_pattern.pattern}"
    return hash(key)


class LogLevel(enum.IntEnum):
ERROR = 0
Expand Down
51 changes: 51 additions & 0 deletions tests/test_base_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,21 @@ def setUp(self) -> None:
)
self.scanner = TestScanner(self.options)

def test_entropy_string_is_excluded(self):
    """A string and path that both match the configured rule are excluded."""
    self.options.exclude_entropy_patterns = [r"docs/.*\.md::f.*"]
    self.assertEqual(
        True, self.scanner.entropy_string_is_excluded("foo", "docs/README.md")
    )

def test_entropy_string_is_not_excluded(self):
    """A string that does not match the rule pattern is not excluded."""
    self.options.exclude_entropy_patterns = [r"foo\..*::f.*"]
    self.assertEqual(
        False, self.scanner.entropy_string_is_excluded("bar", "foo.py")
    )

def test_entropy_string_is_not_excluded_given_different_path(self):
    """A matching string is not excluded when the path does not match."""
    self.options.exclude_entropy_patterns = [r"foo\..*::f.*"]
    self.assertEqual(
        False, self.scanner.entropy_string_is_excluded("foo", "bar.py")
    )

def test_calculate_base64_entropy_calculation(self):
random_string = (
"ZWVTjPQSdhwRgl204Hc51YCsritMIzn8B=/p9UyeX7xu6KkAGqfm3FJ+oObLDNEva"
Expand Down Expand Up @@ -440,6 +455,42 @@ def test_issues_are_created_for_high_entropy_hex_strings(
self.assertEqual(issues[0].issue_type, types.IssueType.Entropy)
self.assertEqual(issues[0].matched_string, "foo")

# mock.patch decorators are applied bottom-up, so the bottom-most patch
# (get_strings_of_set) supplies the first mock argument after self.
@mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded")
@mock.patch("tartufo.scanner.ScannerBase.entropy_string_is_excluded")
@mock.patch("tartufo.util.get_strings_of_set")
def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_excluded(
    self,
    mock_strings: mock.MagicMock,
    mock_entropy: mock.MagicMock,
    mock_signature: mock.MagicMock,
    mock_calculate: mock.MagicMock,
):
    """scan_entropy reports no issue when the flagged string is excluded by an entropy rule."""
    # Successive get_strings_of_set calls return these values in order; only
    # the second call yields a candidate string (presumably the hex lookup
    # for the first word -- confirm against scan_entropy).
    mock_strings.side_effect = ([], ["foo"], [], [], [], [])
    # The entropy exclusion check reports the string as excluded.
    mock_entropy.return_value = True
    # The signature is not excluded, so evaluation proceeds to the score.
    mock_signature.return_value = False
    # Mocked entropy score for the candidate string.
    mock_calculate.return_value = 9.0
    issues = self.scanner.scan_entropy(self.chunk)
    self.assertEqual(len(issues), 0)

# mock.patch decorators are applied bottom-up, so the bottom-most patch
# (get_strings_of_set) supplies the first mock argument after self.
@mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded")
@mock.patch("tartufo.scanner.ScannerBase.entropy_string_is_excluded")
@mock.patch("tartufo.util.get_strings_of_set")
def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_excluded(
    self,
    mock_strings: mock.MagicMock,
    mock_entropy: mock.MagicMock,
    mock_signature: mock.MagicMock,
    mock_calculate: mock.MagicMock,
):
    """scan_entropy reports no issue when the candidate string is excluded by an entropy rule."""
    # NOTE(review): the test name says "low entropy", but calculate_entropy
    # is mocked to 9.0 below, so this looks like the high-entropy-but-excluded
    # case for base64 strings -- consider renaming.
    # Successive get_strings_of_set calls return these values in order; only
    # the first call yields a candidate string (presumably the base64 lookup
    # for the first word -- confirm against scan_entropy).
    mock_strings.side_effect = (["foo"], [], [], [], [], [])
    # The entropy exclusion check reports the string as excluded.
    mock_entropy.return_value = True
    # The signature is not excluded, so evaluation proceeds to the score.
    mock_signature.return_value = False
    # Mocked entropy score for the candidate string.
    mock_calculate.return_value = 9.0
    issues = self.scanner.scan_entropy(self.chunk)
    self.assertEqual(len(issues), 0)

@mock.patch("tartufo.scanner.ScannerBase.calculate_entropy")
@mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded")
@mock.patch("tartufo.util.get_strings_of_set")
Expand Down
54 changes: 54 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,5 +290,59 @@ def test_whitespace_lines_are_ignored(self):
)


class CompileRulesTests(unittest.TestCase):
    """Tests for ``config.compile_rules``."""

    def test_commented_lines_are_ignored(self):
        """Entries starting with ``#`` produce no rule."""
        expected = [Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r".*"))]
        rules = config.compile_rules(["# Poetry lock file", r"^[a-zA-Z0-9]{26}$"])
        self.assertEqual(rules, expected)

    def test_whitespace_lines_are_ignored(self):
        """Empty and whitespace-only entries produce no rule."""
        specs = [
            "# Poetry lock file",
            r"poetry\.lock::^[a-zA-Z0-9]{40}$",
            "",
            "\t\n",
            "# NPM files",
            r"^[a-zA-Z0-9]{26}$",
        ]
        lock_file_rule = Rule(
            None, re.compile(r"^[a-zA-Z0-9]{40}$"), re.compile(r"poetry\.lock")
        )
        any_path_rule = Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r".*"))
        rules = config.compile_rules(specs)
        self.assertEqual(rules, [lock_file_rule, any_path_rule])

    def test_path_is_used(self):
        """The text before the first ``::`` becomes the path pattern."""
        specs = [
            r"src/.*::^[a-zA-Z0-9]{26}$",
            r"^[a-zA-Z0-9]test$",
            r"src/.*::^[a-zA-Z0-9]{26}::test$",
        ]
        expected = [
            Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r"src/.*")),
            Rule(None, re.compile(r"^[a-zA-Z0-9]test$"), re.compile(r".*")),
            Rule(None, re.compile(r"^[a-zA-Z0-9]{26}::test$"), re.compile(r"src/.*")),
        ]
        rules = config.compile_rules(specs)
        self.assertEqual(rules, expected)

    def test_match_can_contain_delimiter(self):
        """Only the first ``::`` splits; later ones stay in the pattern."""
        expected = [Rule(None, re.compile(r"^[a-zA-Z0-9]::test$"), re.compile(r".*"))]
        rules = config.compile_rules([r".*::^[a-zA-Z0-9]::test$"])
        self.assertEqual(rules, expected)


# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 9f615d5

Please sign in to comment.