diff --git a/.changeset/rich-ways-exercise.md b/.changeset/rich-ways-exercise.md new file mode 100644 index 000000000..7cfd852d5 --- /dev/null +++ b/.changeset/rich-ways-exercise.md @@ -0,0 +1,5 @@ +--- +"eslint-plugin-regexp": minor +--- + +Improve `regexp/sort-alternatives` rule to add support for string alternatives and v flag diff --git a/docs/rules/sort-alternatives.md b/docs/rules/sort-alternatives.md index a039e84ab..414451564 100644 --- a/docs/rules/sort-alternatives.md +++ b/docs/rules/sort-alternatives.md @@ -19,7 +19,8 @@ This rule will sort alternatives to improve readability and maintainability. The primary target of this rule are lists of words and/or numbers. These lists are somewhat common, and sorting them makes it easy for readers to check whether a particular word or number is included. -This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern. +This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.\ +However, character classes containing strings are ensured to match the longest string, so they can always be sorted. @@ -29,11 +30,13 @@ This rule will only sort alternatives if reordering the alternatives doesn't aff /* ✓ GOOD */ var foo = /\b(1|2|3)\b/; var foo = /\b(alpha|beta|gamma)\b/; +var foo = /[\q{blue|green|red}]/v; /* ✗ BAD */ var foo = /\b(2|1|3)\b/; var foo = /__(?:Foo|Bar)__/; var foo = /\((?:TM|R|C)\)/; +var foo = /[\q{red|green|blue}]/v; ``` diff --git a/lib/rules/sort-alternatives.ts b/lib/rules/sort-alternatives.ts index ec35b0908..31b42acd8 100644 --- a/lib/rules/sort-alternatives.ts +++ b/lib/rules/sort-alternatives.ts @@ -4,9 +4,11 @@ import type { Character, CharacterClass, CharacterSet, + ClassStringDisjunction, Element, - Node, + ExpressionCharacterClass, Pattern, + StringAlternative, } from "@eslint-community/regexpp/ast" import type { RegExpContext } from "../utils" import { @@ -30,12 +32,14 @@ import { hasSomeDescendant, canReorder, getLongestPrefix, - toCharSet, getConsumedChars, + toUnicodeSet, + hasStrings, } from "regexp-ast-analysis" import type { CharSet, Word, ReadonlyWord } from "refa" import { NFA, JS, transform } from "refa" import { getParser } from "../utils/regexp-ast" +import { getLexicographicallySmallestInConcatenation } from "../utils/lexicographically-smallest" interface AllowedChars { allowed: CharSet @@ -45,7 +49,10 @@ const cache = new Map>() function getAllowedChars(flags: ReadonlyFlags) { assertValidFlags(flags) - const cacheKey = (flags.ignoreCase ? "i" : "") + (flags.unicode ? "u" : "") + const cacheKey = + (flags.ignoreCase ? "i" : "") + + (flags.unicode ? "u" : "") + + (flags.unicodeSets ? "v" : "") let result = cache.get(cacheKey) if (result === undefined) { result = { @@ -86,7 +93,8 @@ function containsOnlyLiterals( d.type === "Backreference" || d.type === "CharacterSet" || (d.type === "Quantifier" && d.max === Infinity) || - (d.type === "CharacterClass" && d.negate) + (d.type === "CharacterClass" && d.negate) || + (d.type === "ExpressionCharacterClass" && d.negate) ) }, (d) => d.type !== "Assertion", @@ -156,29 +164,45 @@ function approximateLexicographicallySmallest( return getLexicographicallySmallestFromCharSets(prefix) } +function getLexicographicallySmallestFromAlternative( + alternative: Alternative, + parser: JS.Parser, + flags: ReadonlyFlags, +): Word | undefined +function getLexicographicallySmallestFromAlternative( + alternative: StringAlternative, + parser: JS.Parser, + flags: ReadonlyFlags, +): Word /** * If defined, this will return the lexicographically smallest string accepted * by the given alternative (ignoring assertions). */ function getLexicographicallySmallestFromAlternative( - alternative: Alternative, + alternative: Alternative | StringAlternative, parser: JS.Parser, flags: ReadonlyFlags, ): Word | undefined { - const { elements } = alternative - if (isOnlyCharacters(elements)) { + if ( + alternative.type === "StringAlternative" || + hasOnlyCharacters(alternative, flags) + ) { // fast path to avoid converting simple alternatives into NFAs const smallest: Word = [] - for (const e of elements) { - // FIXME: TS Error - // @ts-expect-error -- FIXME - const cs = toCharSet(e, flags) + for (const e of alternative.elements) { + const cs = toUnicodeSet(e, flags).chars if (cs.isEmpty) return undefined smallest.push(cs.ranges[0].min) } return smallest } + if (isOnlyCharacterElements(alternative.elements)) { + return getLexicographicallySmallestInConcatenation( + alternative.elements.map((e) => toUnicodeSet(e, flags)), + ) + } + try { const result = parser.parseElement(alternative, { assertions: "unknown", @@ -212,15 +236,45 @@ function getLexicographicallySmallestFromAlternative( } } -/** Returns whether the given array of nodes contains only characters. */ -function isOnlyCharacters( - nodes: readonly Node[], -): nodes is readonly (Character | CharacterClass | CharacterSet)[] { +/** + * Returns whether the given array of nodes contains only characters. + * But note that if the pattern has the v flag, the character class may contain strings. + */ +function isOnlyCharacterElements( + nodes: Element[], +): nodes is ( + | Character + | CharacterClass + | CharacterSet + | ExpressionCharacterClass +)[] { return nodes.every( (e) => e.type === "Character" || e.type === "CharacterClass" || - e.type === "CharacterSet", + e.type === "CharacterSet" || + e.type === "ExpressionCharacterClass", + ) +} + +/** + * Returns whether the given alternative has contains only characters. + * The v flag in the pattern does not contains the string. + */ +function hasOnlyCharacters( + alternative: Alternative, + flags: ReadonlyFlags, +): alternative is Alternative & { + elements: readonly ( + | Character + | CharacterClass + | CharacterSet + | ExpressionCharacterClass + )[] +} { + return ( + isOnlyCharacterElements(alternative.elements) && + alternative.elements.every((e) => !hasStrings(e, flags)) ) } @@ -433,6 +487,28 @@ function sortAlternatives( }) } +/** + * Sorts the given string alternatives. + * + * Sorting is done by comparing the lexicographically smallest strings (LSS). + * + * For more information on why we use LSS-based comparison and how it works, + * see https://github.com/ota-meshi/eslint-plugin-regexp/pull/423. + */ +function sortStringAlternatives( + alternatives: StringAlternative[], + parser: JS.Parser, + flags: ReadonlyFlags, +): void { + alternatives.sort((a, b) => { + const lssDiff = compareWords( + getLexicographicallySmallestFromAlternative(a, parser, flags), + getLexicographicallySmallestFromAlternative(b, parser, flags), + ) + return lssDiff + }) +} + /** * Returns whether the given string is a valid integer. * @param str @@ -446,7 +522,9 @@ function isIntegerString(str: string): boolean { * This tries to sort the given alternatives by assuming that all alternatives * are a number. */ -function trySortNumberAlternatives(alternatives: Alternative[]): void { +function trySortNumberAlternatives( + alternatives: (Alternative | StringAlternative)[], +): void { const runs = getRuns(alternatives, (a) => isIntegerString(a.raw)) for (const { startIndex, elements } of runs) { elements.sort((a, b) => { @@ -528,7 +606,7 @@ export default createRule("sort-alternatives", { fixable: "code", schema: [], messages: { - sort: "The alternatives of this group can be sorted without affecting the regex.", + sort: "The {{alternatives}} can be sorted without affecting the regex.", }, type: "suggestion", // "problem", }, @@ -551,7 +629,6 @@ export default createRule("sort-alternatives", { let chars = possibleCharsCache.get(a) if (chars === undefined) { chars = getConsumedChars(a, flags).chars - possibleCharsCache.set(a, chars) } return chars } @@ -590,14 +667,19 @@ export default createRule("sort-alternatives", { } } - enforceSorted(run) + enforceSorted(run, "alternatives of this group") } /** * Creates a report if the sorted alternatives are different from * the unsorted ones. */ - function enforceSorted(run: Run): void { + function enforceSorted( + run: Run, + alternatives: + | "alternatives of this group" + | "string alternatives", + ): void { const sorted = run.elements const parent = sorted[0].parent const unsorted = parent.alternatives.slice( @@ -619,6 +701,7 @@ export default createRule("sort-alternatives", { node, loc, messageId: "sort", + data: { alternatives }, fix: fixReplaceNode(parent, () => { const prefix = parent.raw.slice( 0, @@ -682,10 +765,29 @@ export default createRule("sort-alternatives", { } } + /** The handler for ClassStringDisjunction */ + function onClassStringDisjunction( + parent: ClassStringDisjunction, + ): void { + if (parent.alternatives.length < 2) { + return + } + + const alternatives = [...parent.alternatives] + sortStringAlternatives(alternatives, parser, flags) + trySortNumberAlternatives(alternatives) + const run: Run = { + startIndex: 0, + elements: [...alternatives], + } + enforceSorted(run, "string alternatives") + } + return { onGroupEnter: onParent, onPatternEnter: onParent, onCapturingGroupEnter: onParent, + onClassStringDisjunctionEnter: onClassStringDisjunction, } } diff --git a/lib/utils/lexicographically-smallest.ts b/lib/utils/lexicographically-smallest.ts new file mode 100644 index 000000000..8d47bedb2 --- /dev/null +++ b/lib/utils/lexicographically-smallest.ts @@ -0,0 +1,89 @@ +import type { Word } from "refa" +import type { JS } from "refa" + +function findMin( + array: readonly T[], + compare: (a: T, b: T) => number, +): T | undefined { + if (array.length === 0) { + return undefined + } + + let min = array[0] + for (let i = 1; i < array.length; i++) { + const item = array[i] + if (compare(item, min) < 0) { + min = item + } + } + return min +} + +function compareWords(a: Word, b: Word): number { + const l = Math.min(a.length, b.length) + for (let i = 0; i < l; i++) { + const diff = a[i] - b[i] + if (diff !== 0) { + return diff + } + } + return a.length - b.length +} + +/** + * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty. + */ +export function getLexicographicallySmallest( + set: JS.UnicodeSet, +): Word | undefined { + if (set.accept.isEmpty) { + return set.chars.isEmpty ? undefined : [set.chars.ranges[0].min] + } + + const words = set.accept.wordSets.map( + (w): Word => w.map((c) => c.ranges[0].min), + ) + return findMin(words, compareWords) +} + +/** + * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty. + */ +export function getLexicographicallySmallestInConcatenation( + elements: readonly JS.UnicodeSet[], +): Word | undefined { + if (elements.length === 1) { + return getLexicographicallySmallest(elements[0]) + } + + let smallest: Word = [] + for (let i = elements.length - 1; i >= 0; i--) { + const set = elements[i] + if (set.isEmpty) { + return undefined + } else if (set.accept.isEmpty) { + smallest.unshift(set.chars.ranges[0].min) + } else { + let words = [ + ...(set.chars.isEmpty ? [] : [[set.chars]]), + ...set.accept.wordSets, + ].map((w): Word => w.map((c) => c.ranges[0].min)) + // we only have to consider the lexicographically smallest words with unique length + const seenLengths = new Set() + words = words.sort(compareWords).filter((w) => { + if (seenLengths.has(w.length)) { + return false + } + seenLengths.add(w.length) + return true + }) + + smallest = findMin( + // eslint-disable-next-line no-loop-func -- x + words.map((w): Word => [...w, ...smallest]), + compareWords, + )! + } + } + return smallest +} diff --git a/tests/lib/rules/sort-alternatives.ts b/tests/lib/rules/sort-alternatives.ts index b64588433..6850809db 100644 --- a/tests/lib/rules/sort-alternatives.ts +++ b/tests/lib/rules/sort-alternatives.ts @@ -3,7 +3,7 @@ import rule from "../../../lib/rules/sort-alternatives" const tester = new RuleTester({ parserOptions: { - ecmaVersion: 2020, + ecmaVersion: "latest", sourceType: "module", }, }) @@ -20,6 +20,7 @@ tester.run("sort-alternatives", rule as any, { String.raw`/\b(x?Rec|RequestOptionsPage)\b/`, String.raw`/\b([ft]|false|true)\b/`, String.raw`/\b(a+b|a+c)\b/`, + String.raw` /[\q{blue|green|red}]/v`, ], invalid: [ { @@ -140,5 +141,55 @@ tester.run("sort-alternatives", rule as any, { "The alternatives of this group can be sorted without affecting the regex.", ], }, + { + code: String.raw`/[\q{red|green|blue}]/v`, + output: String.raw`/[\q{blue|green|red}]/v`, + errors: [ + "The string alternatives can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/(?:c|[\q{red|green|blue}]|a)/v`, + output: String.raw`/(?:a|[\q{red|green|blue}]|c)/v`, + errors: [ + "The alternatives of this group can be sorted without affecting the regex.", + "The string alternatives can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/[\q{ac|ab|aa}]/v`, + output: String.raw`/[\q{aa|ab|ac}]/v`, + errors: [ + "The string alternatives can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/(?:b|[a-b])/v`, + output: String.raw`/(?:[a-b]|b)/v`, + errors: [ + "The alternatives of this group can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/\b(?:a[\q{bd}]|abc)\b/v`, + output: String.raw`/\b(?:abc|a[\q{bd}])\b/v`, + errors: [ + "The alternatives of this group can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/\b(?:abb|[\q{a|aba}]bb)\b/v`, + output: String.raw`/\b(?:[\q{a|aba}]bb|abb)\b/v`, + errors: [ + "The alternatives of this group can be sorted without affecting the regex.", + ], + }, + { + code: String.raw`/\b(?:c|b_|[\q{b|da}]_|b_2)\b/v`, + output: String.raw`/\b(?:b_|[\q{b|da}]_|b_2|c)\b/v`, + errors: [ + "The alternatives of this group can be sorted without affecting the regex.", + ], + }, ], })