diff --git a/README.md b/README.md index 08bb5fc5..ae1b0058 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,14 @@ Options: excluded unless effectively excluded via the --include-path-patterns option. + -xe, --exclude-entropy-patterns TEXT + Specify a regular expression which matches + entropy strings to exclude from the scan. + This option can be specified multiple times + to exclude multiple patterns. If not + provided (default), no entropy strings will + be excluded ({path regex}::{pattern regex}). + -e, --exclude-signatures TEXT Specify signatures of matches that you explicitly want to exclude from the scan, and mark as okay. These signatures are diff --git a/docs/source/features.rst b/docs/source/features.rst index 37713f5d..184623d1 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -227,6 +227,28 @@ thing, there needs to be a way to tell ``tartufo`` to ignore those things, and not report them out as issues. For this reason, we provide multiple methods for excluding these items. +Entropy Limiting +++++++++++++++++ + +Entropy scans can produce a high number of false positives such as git SHAs or md5 +digests. To avoid these false positives, enable ``exclude-entropy-patterns``. Exclusions +apply to any strings flagged by entropy checks. + +For example, if ``docs/README.md`` contains a git SHA, this would be flagged by entropy. +To exclude this, add ``docs/.*\.md$::^[a-zA-Z0-9]{40}$`` to ``exclude-entropy-patterns``. + +.. code-block:: sh + + > tartufo ... --exclude-entropy-patterns "docs/.*\.md$::^[a-zA-Z0-9]{40}$" + +.. 
code-block:: toml + + [tool.tartufo] + exclude-entropy-patterns = [ + # format: "{file regex}::{entropy pattern}" + 'docs/.*\.md$::^[a-zA-Z0-9]{40}$', # exclude all git SHAs in the docs directory + ] + Limiting by Signature +++++++++++++++++++++ diff --git a/tartufo/cli.py b/tartufo/cli.py index 47a019d0..03d6f3b7 100644 --- a/tartufo/cli.py +++ b/tartufo/cli.py @@ -123,6 +123,15 @@ def get_command(self, ctx: click.Context, cmd_name: str) -> Optional[click.Comma are excluded unless effectively excluded via the --include-path-patterns option.""", ) +@click.option( + "-xe", + "--exclude-entropy-patterns", + multiple=True, + help="""Specify a regular expression which matches entropy strings to + exclude from the scan. This option can be specified multiple times to + exclude multiple patterns. If not provided (default), no entropy strings + will be excluded ({path regex}::{pattern regex}).""", +) @click.option( "-e", "--exclude-signatures", diff --git a/tartufo/config.py b/tartufo/config.py index 153ae9f2..a2931e2d 100644 --- a/tartufo/config.py +++ b/tartufo/config.py @@ -243,3 +243,33 @@ def compile_path_rules(patterns: Iterable[str]) -> List[Pattern]: for pattern in stripped if pattern and not pattern.startswith("#") ] + + +def compile_rule(pattern: str) -> Rule: + """ + Compile pattern string to Rule. + + :param pattern: Rule pattern with {path_pattern}::{pattern} + :return: Rule object with pattern and path_pattern + """ + try: + path, pattern = pattern.split("::", 1) + except ValueError: # Raised when the split separator is not found + path = ".*" + return Rule(name=None, pattern=re.compile(pattern), path_pattern=re.compile(path)) + + +def compile_rules(patterns: Iterable[str]) -> List[Rule]: + """Take a list of regex strings with paths and compile them into a List of Rule. + + Any line starting with `#` will be ignored. 
+ + :param patterns: The list of patterns to be compiled + :return: List of Rule objects + """ + stripped = (p.strip() for p in patterns) + return [ + compile_rule(pattern) + for pattern in stripped + if pattern and not pattern.startswith("#") + ] diff --git a/tartufo/scanner.py b/tartufo/scanner.py index bb814c95..6e782817 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -117,6 +117,7 @@ class ScannerBase(abc.ABC): _issues: Optional[List[Issue]] = None _included_paths: Optional[List[Pattern]] = None _excluded_paths: Optional[List[Pattern]] = None + _excluded_entropy: Optional[List[Rule]] = None _rules_regexes: Optional[Dict[str, Rule]] = None global_options: types.GlobalOptions logger: logging.Logger @@ -162,6 +163,23 @@ def included_paths(self) -> List[Pattern]: ) return self._included_paths + @property + def excluded_entropy(self) -> List[Rule]: + """Get a list of rules used to exclude entropy strings from the scan. + + :rtype: List[Rule] + """ + if self._excluded_entropy is None: + self.logger.info("Initializing excluded entropy patterns") + patterns = list(self.global_options.exclude_entropy_patterns or ()) + self._excluded_entropy = ( + config.compile_rules(set(patterns)) if patterns else [] + ) + self.logger.debug( + "Excluded entropy was initialized as: %s", self._excluded_entropy + ) + return self._excluded_entropy + @property def excluded_paths(self) -> List[Pattern]: """Get a list of regexes used to match paths to exclude from the scan. @@ -247,6 +265,36 @@ def signature_is_excluded(self, blob: str, file_path: str) -> bool: in self.global_options.exclude_signatures ) + @staticmethod + @lru_cache(maxsize=None) + def rule_matches(rule: Rule, string: str, path: str) -> bool: + """ + Match string and path against rule. + + :param rule: Rule to perform match + :param string: string to match against rule pattern + :param path: path to match against rule path_pattern + :return: True if string and path matched, False otherwise. 
+ """ + match = False + if rule.pattern: + match = rule.pattern.match(string) is not None + if rule.path_pattern: + match = match and rule.path_pattern.match(path) is not None + return match + + def entropy_string_is_excluded(self, string: str, path: str) -> bool: + """Find whether an entropy string has been excluded in configuration. + + :param string: String to check against rule pattern + :param path: Path to check against rule path pattern + :return: True if excluded, False otherwise + """ + + return bool(self.excluded_entropy) and any( + ScannerBase.rule_matches(p, string, path) for p in self.excluded_entropy + ) + @lru_cache(maxsize=None) def calculate_entropy(self, data: str, char_set: str) -> float: """Calculate the Shannon entropy for a piece of data. @@ -311,18 +359,41 @@ def scan_entropy(self, chunk: types.Chunk) -> List[Issue]: hex_strings = util.get_strings_of_set(word, HEX_CHARS) for string in b64_strings: - if not self.signature_is_excluded(string, chunk.file_path): - b64_entropy = self.calculate_entropy(string, BASE64_CHARS) - if b64_entropy > 4.5: - issues.append(Issue(types.IssueType.Entropy, string, chunk)) + issues += self.evaluate_entropy_string( + chunk, string, BASE64_CHARS, 4.5 + ) for string in hex_strings: - if not self.signature_is_excluded(string, chunk.file_path): - hex_entropy = self.calculate_entropy(string, HEX_CHARS) - if hex_entropy > 3: - issues.append(Issue(types.IssueType.Entropy, string, chunk)) + issues += self.evaluate_entropy_string(chunk, string, HEX_CHARS, 3) + return issues + def evaluate_entropy_string( + self, + chunk: types.Chunk, + string: str, + chars: str, + min_entropy_score: float, + ) -> List[Issue]: + """ + Check entropy string using entropy characters and score. 
+ + :param chunk: The chunk of data to check + :param issues: Issue list to append any strings flagged + :param string: String to check + :param chars: Characters to calculate score + :param min_entropy_score: Minimum entropy score to flag + :return: List of issues flagged + """ + if not self.signature_is_excluded(string, chunk.file_path): + entropy_score = self.calculate_entropy(string, chars) + if entropy_score > min_entropy_score: + if self.entropy_string_is_excluded(string, chunk.file_path): + self.logger.debug("entropy string %s was excluded", string) + else: + return [Issue(types.IssueType.Entropy, string, chunk)] + return [] + def scan_regex(self, chunk: types.Chunk) -> List[Issue]: """Scan a chunk of data for matches against the configured regexes. diff --git a/tartufo/types.py b/tartufo/types.py index 005faed2..98c52252 100644 --- a/tartufo/types.py +++ b/tartufo/types.py @@ -16,6 +16,7 @@ class GlobalOptions: "include_path_patterns", "exclude_paths", "exclude_path_patterns", + "exclude_entropy_patterns", "exclude_signatures", "output_dir", "git_rules_repo", @@ -35,6 +36,7 @@ class GlobalOptions: include_path_patterns: Tuple[str, ...] exclude_paths: Optional[TextIO] exclude_path_patterns: Tuple[str, ...] + exclude_entropy_patterns: Tuple[str, ...] exclude_signatures: Tuple[str, ...] 
output_dir: Optional[str] git_rules_repo: Optional[str] @@ -75,6 +77,11 @@ class Rule: pattern: Pattern path_pattern: Optional[Pattern] + def __hash__(self) -> int: + if self.path_pattern: + return hash(f"{self.pattern.pattern}::{self.path_pattern.pattern}") + return hash(self.pattern.pattern) + class LogLevel(enum.IntEnum): ERROR = 0 diff --git a/tests/test_base_scanner.py b/tests/test_base_scanner.py index a994e082..13f3bed8 100644 --- a/tests/test_base_scanner.py +++ b/tests/test_base_scanner.py @@ -342,6 +342,21 @@ def setUp(self) -> None: ) self.scanner = TestScanner(self.options) + def test_entropy_string_is_excluded(self): + self.options.exclude_entropy_patterns = [r"docs/.*\.md::f.*"] + excluded = self.scanner.entropy_string_is_excluded("foo", "docs/README.md") + self.assertEqual(True, excluded) + + def test_entropy_string_is_not_excluded(self): + self.options.exclude_entropy_patterns = [r"foo\..*::f.*"] + excluded = self.scanner.entropy_string_is_excluded("bar", "foo.py") + self.assertEqual(False, excluded) + + def test_entropy_string_is_not_excluded_given_different_path(self): + self.options.exclude_entropy_patterns = [r"foo\..*::f.*"] + excluded = self.scanner.entropy_string_is_excluded("foo", "bar.py") + self.assertEqual(False, excluded) + def test_calculate_base64_entropy_calculation(self): random_string = ( "ZWVTjPQSdhwRgl204Hc51YCsritMIzn8B=/p9UyeX7xu6KkAGqfm3FJ+oObLDNEva" @@ -440,6 +455,42 @@ def test_issues_are_created_for_high_entropy_hex_strings( self.assertEqual(issues[0].issue_type, types.IssueType.Entropy) self.assertEqual(issues[0].matched_string, "foo") + @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") + @mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded") + @mock.patch("tartufo.scanner.ScannerBase.entropy_string_is_excluded") + @mock.patch("tartufo.util.get_strings_of_set") + def test_issues_are_not_created_for_high_entropy_hex_strings_given_entropy_is_excluded( + self, + mock_strings: mock.MagicMock, + 
mock_entropy: mock.MagicMock, + mock_signature: mock.MagicMock, + mock_calculate: mock.MagicMock, + ): + mock_strings.side_effect = ([], ["foo"], [], [], [], []) + mock_entropy.return_value = True + mock_signature.return_value = False + mock_calculate.return_value = 9.0 + issues = self.scanner.scan_entropy(self.chunk) + self.assertEqual(len(issues), 0) + + @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") + @mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded") + @mock.patch("tartufo.scanner.ScannerBase.entropy_string_is_excluded") + @mock.patch("tartufo.util.get_strings_of_set") + def test_issues_are_not_created_for_low_entropy_b64_strings_given_entropy_is_excluded( + self, + mock_strings: mock.MagicMock, + mock_entropy: mock.MagicMock, + mock_signature: mock.MagicMock, + mock_calculate: mock.MagicMock, + ): + mock_strings.side_effect = (["foo"], [], [], [], [], []) + mock_entropy.return_value = True + mock_signature.return_value = False + mock_calculate.return_value = 9.0 + issues = self.scanner.scan_entropy(self.chunk) + self.assertEqual(len(issues), 0) + @mock.patch("tartufo.scanner.ScannerBase.calculate_entropy") @mock.patch("tartufo.scanner.ScannerBase.signature_is_excluded") @mock.patch("tartufo.util.get_strings_of_set") diff --git a/tests/test_config.py b/tests/test_config.py index b621c7a1..23889613 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -290,5 +290,59 @@ def test_whitespace_lines_are_ignored(self): ) +class CompileRulesTests(unittest.TestCase): + def test_commented_lines_are_ignored(self): + rules = config.compile_rules(["# Poetry lock file", r"^[a-zA-Z0-9]{26}$"]) + self.assertEqual( + rules, [Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r".*"))] + ) + + def test_whitespace_lines_are_ignored(self): + rules = config.compile_rules( + [ + "# Poetry lock file", + r"poetry\.lock::^[a-zA-Z0-9]{40}$", + "", + "\t\n", + "# NPM files", + r"^[a-zA-Z0-9]{26}$", + ] + ) + self.assertEqual( + rules, + [ + Rule( 
+ None, re.compile(r"^[a-zA-Z0-9]{40}$"), re.compile(r"poetry\.lock") + ), + Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r".*")), + ], + ) + + def test_path_is_used(self): + rules = config.compile_rules( + [ + r"src/.*::^[a-zA-Z0-9]{26}$", + r"^[a-zA-Z0-9]test$", + r"src/.*::^[a-zA-Z0-9]{26}::test$", + ] + ) + self.assertEqual( + rules, + [ + Rule(None, re.compile(r"^[a-zA-Z0-9]{26}$"), re.compile(r"src/.*")), + Rule(None, re.compile(r"^[a-zA-Z0-9]test$"), re.compile(r".*")), + Rule( + None, re.compile(r"^[a-zA-Z0-9]{26}::test$"), re.compile(r"src/.*") + ), + ], + ) + + def test_match_can_contain_delimiter(self): + rules = config.compile_rules([r".*::^[a-zA-Z0-9]::test$"]) + self.assertEqual( + rules, [Rule(None, re.compile(r"^[a-zA-Z0-9]::test$"), re.compile(r".*"))] + ) + + if __name__ == "__main__": unittest.main()