From 620c721662c3ddd8d8ca8838861b9c4ba3ea66e7 Mon Sep 17 00:00:00 2001 From: Joe Date: Thu, 4 Jan 2024 23:26:08 -0800 Subject: [PATCH] fix(english-preset): don't include skip-non-alphabetic transformer For #23, #46. BREAKING CHANGE: Using the default English preset, Obscenity will no longer strip non-alphabetic characters from the input text before matching. This addresses a class of egregious false negatives in previous versions (see #23), but introduces a regression where cases such as 'f u c k' (with the space) will no longer be detected by default. We expect to provide a more comprehensive fix in the next minor release. If desired, it remains possible to revert to the previous behavior by providing a custom set of transformers to the matcher. --- README.md | 3 +-- src/preset/english.ts | 4 ++-- src/transformer/skip-non-alphabetic/index.ts | 10 +++++++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7472154..f7f2c27 100644 --- a/README.md +++ b/README.md @@ -109,8 +109,7 @@ With the English preset, Obscenity (correctly) finds matches in all of the follo - **fk** you - **ffuk** you - i like **a$$es** -- **ʃ𝐟ʃὗƈ k** ỹоứ -- **f .... !!! uuuuuuuuu ccc k** +- ʃ𝐟ʃὗƈk ỹоứ ...and it **does not match** on the following: diff --git a/src/preset/english.ts b/src/preset/english.ts index e80e060..8b6bfe7 100644 --- a/src/preset/english.ts +++ b/src/preset/english.ts @@ -4,7 +4,6 @@ import { pattern } from '../pattern/Pattern'; import { collapseDuplicatesTransformer } from '../transformer/collapse-duplicates'; import { resolveConfusablesTransformer } from '../transformer/resolve-confusables'; import { resolveLeetSpeakTransformer } from '../transformer/resolve-leetspeak'; -import { skipNonAlphabeticTransformer } from '../transformer/skip-non-alphabetic'; import { toAsciiLowerCaseTransformer } from '../transformer/to-ascii-lowercase'; /** @@ -15,7 +14,8 @@ export const englishRecommendedBlacklistMatcherTransformers = [ resolveConfusablesTransformer(), resolveLeetSpeakTransformer(), toAsciiLowerCaseTransformer(), - skipNonAlphabeticTransformer(), + // See #23 and #46. + // skipNonAlphabeticTransformer(), collapseDuplicatesTransformer({ defaultThreshold: 1, customThresholds: new Map([ diff --git a/src/transformer/skip-non-alphabetic/index.ts b/src/transformer/skip-non-alphabetic/index.ts index e967588..054e00f 100644 --- a/src/transformer/skip-non-alphabetic/index.ts +++ b/src/transformer/skip-non-alphabetic/index.ts @@ -7,10 +7,18 @@ import { createSimpleTransformer } from '../Transformers'; * comprised of alphabetic characters (the pattern `hello` does not match * `h.e.l.l.o` by default, but does with this transformer). * + * **Warning** + * + * This transformation is not part of the default set of transformations, as + * there are some known rough edges with false negatives; see + * [#23](https://github.com/jo3-l/obscenity/issues/23) and + * [#46](https://github.com/jo3-l/obscenity/issues/46) on the GitHub issue + * tracker. + * * **Application order** * * It is recommended that this transformer be applied near the end of the - * transformer chain. + * transformer chain, if at all. * * @example * ```typescript