diff --git a/.changeset/rich-ways-exercise.md b/.changeset/rich-ways-exercise.md
new file mode 100644
index 000000000..7cfd852d5
--- /dev/null
+++ b/.changeset/rich-ways-exercise.md
@@ -0,0 +1,5 @@
+---
+"eslint-plugin-regexp": minor
+---
+
+Improve `regexp/sort-alternatives` rule to add support for string alternatives and v flag
diff --git a/docs/rules/sort-alternatives.md b/docs/rules/sort-alternatives.md
index a039e84ab..414451564 100644
--- a/docs/rules/sort-alternatives.md
+++ b/docs/rules/sort-alternatives.md
@@ -19,7 +19,8 @@ This rule will sort alternatives to improve readability and maintainability.
The primary target of this rule are lists of words and/or numbers. These lists are somewhat common, and sorting them makes it easy for readers to check whether a particular word or number is included.
-This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.
+This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.\
+However, character classes containing strings are ensured to match the longest string, so they can always be sorted.
@@ -29,11 +30,13 @@ This rule will only sort alternatives if reordering the alternatives doesn't aff
/* ✓ GOOD */
var foo = /\b(1|2|3)\b/;
var foo = /\b(alpha|beta|gamma)\b/;
+var foo = /[\q{blue|green|red}]/v;
/* ✗ BAD */
var foo = /\b(2|1|3)\b/;
var foo = /__(?:Foo|Bar)__/;
var foo = /\((?:TM|R|C)\)/;
+var foo = /[\q{red|green|blue}]/v;
```
diff --git a/lib/rules/sort-alternatives.ts b/lib/rules/sort-alternatives.ts
index ec35b0908..31b42acd8 100644
--- a/lib/rules/sort-alternatives.ts
+++ b/lib/rules/sort-alternatives.ts
@@ -4,9 +4,11 @@ import type {
Character,
CharacterClass,
CharacterSet,
+ ClassStringDisjunction,
Element,
- Node,
+ ExpressionCharacterClass,
Pattern,
+ StringAlternative,
} from "@eslint-community/regexpp/ast"
import type { RegExpContext } from "../utils"
import {
@@ -30,12 +32,14 @@ import {
hasSomeDescendant,
canReorder,
getLongestPrefix,
- toCharSet,
getConsumedChars,
+ toUnicodeSet,
+ hasStrings,
} from "regexp-ast-analysis"
import type { CharSet, Word, ReadonlyWord } from "refa"
import { NFA, JS, transform } from "refa"
import { getParser } from "../utils/regexp-ast"
+import { getLexicographicallySmallestInConcatenation } from "../utils/lexicographically-smallest"
interface AllowedChars {
allowed: CharSet
@@ -45,7 +49,10 @@ const cache = new Map>()
function getAllowedChars(flags: ReadonlyFlags) {
assertValidFlags(flags)
- const cacheKey = (flags.ignoreCase ? "i" : "") + (flags.unicode ? "u" : "")
+ const cacheKey =
+ (flags.ignoreCase ? "i" : "") +
+ (flags.unicode ? "u" : "") +
+ (flags.unicodeSets ? "v" : "")
let result = cache.get(cacheKey)
if (result === undefined) {
result = {
@@ -86,7 +93,8 @@ function containsOnlyLiterals(
d.type === "Backreference" ||
d.type === "CharacterSet" ||
(d.type === "Quantifier" && d.max === Infinity) ||
- (d.type === "CharacterClass" && d.negate)
+ (d.type === "CharacterClass" && d.negate) ||
+ (d.type === "ExpressionCharacterClass" && d.negate)
)
},
(d) => d.type !== "Assertion",
@@ -156,29 +164,45 @@ function approximateLexicographicallySmallest(
return getLexicographicallySmallestFromCharSets(prefix)
}
+function getLexicographicallySmallestFromAlternative(
+ alternative: Alternative,
+ parser: JS.Parser,
+ flags: ReadonlyFlags,
+): Word | undefined
+function getLexicographicallySmallestFromAlternative(
+ alternative: StringAlternative,
+ parser: JS.Parser,
+ flags: ReadonlyFlags,
+): Word
/**
* If defined, this will return the lexicographically smallest string accepted
* by the given alternative (ignoring assertions).
*/
function getLexicographicallySmallestFromAlternative(
- alternative: Alternative,
+ alternative: Alternative | StringAlternative,
parser: JS.Parser,
flags: ReadonlyFlags,
): Word | undefined {
- const { elements } = alternative
- if (isOnlyCharacters(elements)) {
+ if (
+ alternative.type === "StringAlternative" ||
+ hasOnlyCharacters(alternative, flags)
+ ) {
// fast path to avoid converting simple alternatives into NFAs
const smallest: Word = []
- for (const e of elements) {
- // FIXME: TS Error
- // @ts-expect-error -- FIXME
- const cs = toCharSet(e, flags)
+ for (const e of alternative.elements) {
+ const cs = toUnicodeSet(e, flags).chars
if (cs.isEmpty) return undefined
smallest.push(cs.ranges[0].min)
}
return smallest
}
+ if (isOnlyCharacterElements(alternative.elements)) {
+ return getLexicographicallySmallestInConcatenation(
+ alternative.elements.map((e) => toUnicodeSet(e, flags)),
+ )
+ }
+
try {
const result = parser.parseElement(alternative, {
assertions: "unknown",
@@ -212,15 +236,45 @@ function getLexicographicallySmallestFromAlternative(
}
}
-/** Returns whether the given array of nodes contains only characters. */
-function isOnlyCharacters(
- nodes: readonly Node[],
-): nodes is readonly (Character | CharacterClass | CharacterSet)[] {
+/**
+ * Returns whether the given array of nodes contains only characters.
+ * But note that if the pattern has the v flag, the character class may contain strings.
+ */
+function isOnlyCharacterElements(
+ nodes: Element[],
+): nodes is (
+ | Character
+ | CharacterClass
+ | CharacterSet
+ | ExpressionCharacterClass
+)[] {
return nodes.every(
(e) =>
e.type === "Character" ||
e.type === "CharacterClass" ||
- e.type === "CharacterSet",
+ e.type === "CharacterSet" ||
+ e.type === "ExpressionCharacterClass",
+ )
+}
+
+/**
+ * Returns whether the given alternative has contains only characters.
+ * The v flag in the pattern does not contains the string.
+ */
+function hasOnlyCharacters(
+ alternative: Alternative,
+ flags: ReadonlyFlags,
+): alternative is Alternative & {
+ elements: readonly (
+ | Character
+ | CharacterClass
+ | CharacterSet
+ | ExpressionCharacterClass
+ )[]
+} {
+ return (
+ isOnlyCharacterElements(alternative.elements) &&
+ alternative.elements.every((e) => !hasStrings(e, flags))
)
}
@@ -433,6 +487,28 @@ function sortAlternatives(
})
}
+/**
+ * Sorts the given string alternatives.
+ *
+ * Sorting is done by comparing the lexicographically smallest strings (LSS).
+ *
+ * For more information on why we use LSS-based comparison and how it works,
+ * see https://github.com/ota-meshi/eslint-plugin-regexp/pull/423.
+ */
+function sortStringAlternatives(
+ alternatives: StringAlternative[],
+ parser: JS.Parser,
+ flags: ReadonlyFlags,
+): void {
+ alternatives.sort((a, b) => {
+ const lssDiff = compareWords(
+ getLexicographicallySmallestFromAlternative(a, parser, flags),
+ getLexicographicallySmallestFromAlternative(b, parser, flags),
+ )
+ return lssDiff
+ })
+}
+
/**
* Returns whether the given string is a valid integer.
* @param str
@@ -446,7 +522,9 @@ function isIntegerString(str: string): boolean {
* This tries to sort the given alternatives by assuming that all alternatives
* are a number.
*/
-function trySortNumberAlternatives(alternatives: Alternative[]): void {
+function trySortNumberAlternatives(
+ alternatives: (Alternative | StringAlternative)[],
+): void {
const runs = getRuns(alternatives, (a) => isIntegerString(a.raw))
for (const { startIndex, elements } of runs) {
elements.sort((a, b) => {
@@ -528,7 +606,7 @@ export default createRule("sort-alternatives", {
fixable: "code",
schema: [],
messages: {
- sort: "The alternatives of this group can be sorted without affecting the regex.",
+ sort: "The {{alternatives}} can be sorted without affecting the regex.",
},
type: "suggestion", // "problem",
},
@@ -551,7 +629,6 @@ export default createRule("sort-alternatives", {
let chars = possibleCharsCache.get(a)
if (chars === undefined) {
chars = getConsumedChars(a, flags).chars
- possibleCharsCache.set(a, chars)
}
return chars
}
@@ -590,14 +667,19 @@ export default createRule("sort-alternatives", {
}
}
- enforceSorted(run)
+ enforceSorted(run, "alternatives of this group")
}
/**
* Creates a report if the sorted alternatives are different from
* the unsorted ones.
*/
- function enforceSorted(run: Run): void {
+ function enforceSorted(
+ run: Run,
+ alternatives:
+ | "alternatives of this group"
+ | "string alternatives",
+ ): void {
const sorted = run.elements
const parent = sorted[0].parent
const unsorted = parent.alternatives.slice(
@@ -619,6 +701,7 @@ export default createRule("sort-alternatives", {
node,
loc,
messageId: "sort",
+ data: { alternatives },
fix: fixReplaceNode(parent, () => {
const prefix = parent.raw.slice(
0,
@@ -682,10 +765,29 @@ export default createRule("sort-alternatives", {
}
}
+ /** The handler for ClassStringDisjunction */
+ function onClassStringDisjunction(
+ parent: ClassStringDisjunction,
+ ): void {
+ if (parent.alternatives.length < 2) {
+ return
+ }
+
+ const alternatives = [...parent.alternatives]
+ sortStringAlternatives(alternatives, parser, flags)
+ trySortNumberAlternatives(alternatives)
+ const run: Run = {
+ startIndex: 0,
+ elements: [...alternatives],
+ }
+ enforceSorted(run, "string alternatives")
+ }
+
return {
onGroupEnter: onParent,
onPatternEnter: onParent,
onCapturingGroupEnter: onParent,
+ onClassStringDisjunctionEnter: onClassStringDisjunction,
}
}
diff --git a/lib/utils/lexicographically-smallest.ts b/lib/utils/lexicographically-smallest.ts
new file mode 100644
index 000000000..8d47bedb2
--- /dev/null
+++ b/lib/utils/lexicographically-smallest.ts
@@ -0,0 +1,89 @@
+import type { Word } from "refa"
+import type { JS } from "refa"
+
+function findMin(
+ array: readonly T[],
+ compare: (a: T, b: T) => number,
+): T | undefined {
+ if (array.length === 0) {
+ return undefined
+ }
+
+ let min = array[0]
+ for (let i = 1; i < array.length; i++) {
+ const item = array[i]
+ if (compare(item, min) < 0) {
+ min = item
+ }
+ }
+ return min
+}
+
+function compareWords(a: Word, b: Word): number {
+ const l = Math.min(a.length, b.length)
+ for (let i = 0; i < l; i++) {
+ const diff = a[i] - b[i]
+ if (diff !== 0) {
+ return diff
+ }
+ }
+ return a.length - b.length
+}
+
+/**
+ * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
+ */
+export function getLexicographicallySmallest(
+ set: JS.UnicodeSet,
+): Word | undefined {
+ if (set.accept.isEmpty) {
+ return set.chars.isEmpty ? undefined : [set.chars.ranges[0].min]
+ }
+
+ const words = set.accept.wordSets.map(
+ (w): Word => w.map((c) => c.ranges[0].min),
+ )
+ return findMin(words, compareWords)
+}
+
+/**
+ * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
+ */
+export function getLexicographicallySmallestInConcatenation(
+ elements: readonly JS.UnicodeSet[],
+): Word | undefined {
+ if (elements.length === 1) {
+ return getLexicographicallySmallest(elements[0])
+ }
+
+ let smallest: Word = []
+ for (let i = elements.length - 1; i >= 0; i--) {
+ const set = elements[i]
+ if (set.isEmpty) {
+ return undefined
+ } else if (set.accept.isEmpty) {
+ smallest.unshift(set.chars.ranges[0].min)
+ } else {
+ let words = [
+ ...(set.chars.isEmpty ? [] : [[set.chars]]),
+ ...set.accept.wordSets,
+ ].map((w): Word => w.map((c) => c.ranges[0].min))
+ // we only have to consider the lexicographically smallest words with unique length
+ const seenLengths = new Set()
+ words = words.sort(compareWords).filter((w) => {
+ if (seenLengths.has(w.length)) {
+ return false
+ }
+ seenLengths.add(w.length)
+ return true
+ })
+
+ smallest = findMin(
+ // eslint-disable-next-line no-loop-func -- x
+ words.map((w): Word => [...w, ...smallest]),
+ compareWords,
+ )!
+ }
+ }
+ return smallest
+}
diff --git a/tests/lib/rules/sort-alternatives.ts b/tests/lib/rules/sort-alternatives.ts
index b64588433..6850809db 100644
--- a/tests/lib/rules/sort-alternatives.ts
+++ b/tests/lib/rules/sort-alternatives.ts
@@ -3,7 +3,7 @@ import rule from "../../../lib/rules/sort-alternatives"
const tester = new RuleTester({
parserOptions: {
- ecmaVersion: 2020,
+ ecmaVersion: "latest",
sourceType: "module",
},
})
@@ -20,6 +20,7 @@ tester.run("sort-alternatives", rule as any, {
String.raw`/\b(x?Rec|RequestOptionsPage)\b/`,
String.raw`/\b([ft]|false|true)\b/`,
String.raw`/\b(a+b|a+c)\b/`,
+ String.raw` /[\q{blue|green|red}]/v`,
],
invalid: [
{
@@ -140,5 +141,55 @@ tester.run("sort-alternatives", rule as any, {
"The alternatives of this group can be sorted without affecting the regex.",
],
},
+ {
+ code: String.raw`/[\q{red|green|blue}]/v`,
+ output: String.raw`/[\q{blue|green|red}]/v`,
+ errors: [
+ "The string alternatives can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/(?:c|[\q{red|green|blue}]|a)/v`,
+ output: String.raw`/(?:a|[\q{red|green|blue}]|c)/v`,
+ errors: [
+ "The alternatives of this group can be sorted without affecting the regex.",
+ "The string alternatives can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/[\q{ac|ab|aa}]/v`,
+ output: String.raw`/[\q{aa|ab|ac}]/v`,
+ errors: [
+ "The string alternatives can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/(?:b|[a-b])/v`,
+ output: String.raw`/(?:[a-b]|b)/v`,
+ errors: [
+ "The alternatives of this group can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/\b(?:a[\q{bd}]|abc)\b/v`,
+ output: String.raw`/\b(?:abc|a[\q{bd}])\b/v`,
+ errors: [
+ "The alternatives of this group can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/\b(?:abb|[\q{a|aba}]bb)\b/v`,
+ output: String.raw`/\b(?:[\q{a|aba}]bb|abb)\b/v`,
+ errors: [
+ "The alternatives of this group can be sorted without affecting the regex.",
+ ],
+ },
+ {
+ code: String.raw`/\b(?:c|b_|[\q{b|da}]_|b_2)\b/v`,
+ output: String.raw`/\b(?:b_|[\q{b|da}]_|b_2|c)\b/v`,
+ errors: [
+ "The alternatives of this group can be sorted without affecting the regex.",
+ ],
+ },
],
})