ota-meshi · RunDevelopment · Sep 21, 2023 · Sep 9, 2023 · Sep 9, 2023 · Sep 9, 2023
diff --git a/.changeset/rich-ways-exercise.md b/.changeset/rich-ways-exercise.md
@@ -0,0 +1,5 @@
+---
+"eslint-plugin-regexp": minor
+---
+
+Improve `regexp/sort-alternatives` rule to add support for string alternatives and v flag
diff --git a/docs/rules/sort-alternatives.md b/docs/rules/sort-alternatives.md
@@ -19,7 +19,8 @@ This rule will sort alternatives to improve readability and maintainability.
 
 The primary target of this rule are lists of words and/or numbers. These lists are somewhat common, and sorting them makes it easy for readers to check whether a particular word or number is included.
 
-This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.
+This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.\
+However, character classes containing strings are ensured to match the longest string, so they can always be sorted.
 
 <eslint-code-block fix>
 
@@ -29,11 +30,13 @@ This rule will only sort alternatives if reordering the alternatives doesn't aff
 /* ✓ GOOD */
 var foo = /\b(1|2|3)\b/;
 var foo = /\b(alpha|beta|gamma)\b/;
+var foo = /[\q{blue|green|red}]/v;
 
 /* ✗ BAD */
 var foo = /\b(2|1|3)\b/;
 var foo = /__(?:Foo|Bar)__/;
 var foo = /\((?:TM|R|C)\)/;
+var foo = /[\q{red|green|blue}]/v;
 ```
 
 </eslint-code-block>

diff --git a/lib/rules/sort-alternatives.ts b/lib/rules/sort-alternatives.ts
@@ -4,9 +4,11 @@ import type {
     Character,
     CharacterClass,
     CharacterSet,
+    ClassStringDisjunction,
     Element,
-    Node,
+    ExpressionCharacterClass,
     Pattern,
+    StringAlternative,
 } from "@eslint-community/regexpp/ast"
 import type { RegExpContext } from "../utils"
 import {
@@ -30,12 +32,14 @@ import {
     hasSomeDescendant,
     canReorder,
     getLongestPrefix,
-    toCharSet,
     getConsumedChars,
+    toUnicodeSet,
+    hasStrings,
 } from "regexp-ast-analysis"
 import type { CharSet, Word, ReadonlyWord } from "refa"
 import { NFA, JS, transform } from "refa"
 import { getParser } from "../utils/regexp-ast"
+import { getLexicographicallySmallestInConcatenation } from "../utils/lexicographically-smallest"
 
 interface AllowedChars {
     allowed: CharSet
@@ -45,7 +49,10 @@ const cache = new Map<string, Readonly<AllowedChars>>()
 
 function getAllowedChars(flags: ReadonlyFlags) {
     assertValidFlags(flags)
-    const cacheKey = (flags.ignoreCase ? "i" : "") + (flags.unicode ? "u" : "")
+    const cacheKey =
+        (flags.ignoreCase ? "i" : "") +
+        (flags.unicode ? "u" : "") +
+        (flags.unicodeSets ? "v" : "")
     let result = cache.get(cacheKey)
     if (result === undefined) {
         result = {
@@ -86,7 +93,8 @@ function containsOnlyLiterals(
                 d.type === "Backreference" ||
                 d.type === "CharacterSet" ||
                 (d.type === "Quantifier" && d.max === Infinity) ||
-                (d.type === "CharacterClass" && d.negate)
+                (d.type === "CharacterClass" && d.negate) ||
+                (d.type === "ExpressionCharacterClass" && d.negate)
             )
         },
         (d) => d.type !== "Assertion",
@@ -156,29 +164,45 @@ function approximateLexicographicallySmallest(
     return getLexicographicallySmallestFromCharSets(prefix)
 }
 
+function getLexicographicallySmallestFromAlternative(
+    alternative: Alternative,
+    parser: JS.Parser,
+    flags: ReadonlyFlags,
+): Word | undefined
+function getLexicographicallySmallestFromAlternative(
+    alternative: StringAlternative,
+    parser: JS.Parser,
+    flags: ReadonlyFlags,
+): Word
 /**
  * If defined, this will return the lexicographically smallest string accepted
  * by the given alternative (ignoring assertions).
  */
 function getLexicographicallySmallestFromAlternative(
-    alternative: Alternative,
+    alternative: Alternative | StringAlternative,
     parser: JS.Parser,
     flags: ReadonlyFlags,
 ): Word | undefined {
-    const { elements } = alternative
-    if (isOnlyCharacters(elements)) {
+    if (
+        alternative.type === "StringAlternative" ||
+        hasOnlyCharacters(alternative, flags)
+    ) {
         // fast path to avoid converting simple alternatives into NFAs
         const smallest: Word = []
-        for (const e of elements) {
-            // FIXME: TS Error
-            // @ts-expect-error -- FIXME
-            const cs = toCharSet(e, flags)
+        for (const e of alternative.elements) {
+            const cs = toUnicodeSet(e, flags).chars
             if (cs.isEmpty) return undefined
             smallest.push(cs.ranges[0].min)
         }
         return smallest
     }
 
+    if (isOnlyCharacterElements(alternative.elements)) {
+        return getLexicographicallySmallestInConcatenation(
+            alternative.elements.map((e) => toUnicodeSet(e, flags)),
+        )
+    }
+
     try {
         const result = parser.parseElement(alternative, {
             assertions: "unknown",
@@ -212,15 +236,45 @@ function getLexicographicallySmallestFromAlternative(
     }
 }
 
-/** Returns whether the given array of nodes contains only characters. */
-function isOnlyCharacters(
-    nodes: readonly Node[],
-): nodes is readonly (Character | CharacterClass | CharacterSet)[] {
+/**
+ * Returns whether the given array of nodes contains only characters.
+ * But note that if the pattern has the v flag, the character class may contain strings.
+ */
+function isOnlyCharacterElements(
+    nodes: Element[],
+): nodes is (
+    | Character
+    | CharacterClass
+    | CharacterSet
+    | ExpressionCharacterClass
+)[] {
     return nodes.every(
         (e) =>
             e.type === "Character" ||
             e.type === "CharacterClass" ||
-            e.type === "CharacterSet",
+            e.type === "CharacterSet" ||
+            e.type === "ExpressionCharacterClass",
+    )
+}
+
+/**
+ * Returns whether the given alternative has contains only characters.
+ * The v flag in the pattern does not contains the string.
+ */
+function hasOnlyCharacters(
+    alternative: Alternative,
+    flags: ReadonlyFlags,
+): alternative is Alternative & {
+    elements: readonly (
+        | Character
+        | CharacterClass
+        | CharacterSet
+        | ExpressionCharacterClass
+    )[]
+} {
+    return (
+        isOnlyCharacterElements(alternative.elements) &&
+        alternative.elements.every((e) => !hasStrings(e, flags))
     )
 }
 
@@ -433,6 +487,28 @@ function sortAlternatives(
     })
 }
 
+/**
+ * Sorts the given string alternatives.
+ *
+ * Sorting is done by comparing the lexicographically smallest strings (LSS).
+ *
+ * For more information on why we use LSS-based comparison and how it works,
+ * see https://github.com/ota-meshi/eslint-plugin-regexp/pull/423.
+ */
+function sortStringAlternatives(
+    alternatives: StringAlternative[],
+    parser: JS.Parser,
+    flags: ReadonlyFlags,
+): void {
+    alternatives.sort((a, b) => {
+        const lssDiff = compareWords(
+            getLexicographicallySmallestFromAlternative(a, parser, flags),
+            getLexicographicallySmallestFromAlternative(b, parser, flags),
+        )
+        return lssDiff
+    })
+}
+
 /**
  * Returns whether the given string is a valid integer.
  * @param str
@@ -446,7 +522,9 @@ function isIntegerString(str: string): boolean {
  * This tries to sort the given alternatives by assuming that all alternatives
  * are a number.
  */
-function trySortNumberAlternatives(alternatives: Alternative[]): void {
+function trySortNumberAlternatives(
+    alternatives: (Alternative | StringAlternative)[],
+): void {
     const runs = getRuns(alternatives, (a) => isIntegerString(a.raw))
     for (const { startIndex, elements } of runs) {
         elements.sort((a, b) => {
@@ -528,7 +606,7 @@ export default createRule("sort-alternatives", {
         fixable: "code",
         schema: [],
         messages: {
-            sort: "The alternatives of this group can be sorted without affecting the regex.",
+            sort: "The {{alternatives}} can be sorted without affecting the regex.",
         },
         type: "suggestion", // "problem",
     },
@@ -551,7 +629,6 @@ export default createRule("sort-alternatives", {
                 let chars = possibleCharsCache.get(a)
                 if (chars === undefined) {
                     chars = getConsumedChars(a, flags).chars
-                    possibleCharsCache.set(a, chars)
                 }
                 return chars
             }
@@ -590,14 +667,19 @@ export default createRule("sort-alternatives", {
                     }
                 }
 
-                enforceSorted(run)
+                enforceSorted(run, "alternatives of this group")
             }
 
             /**
              * Creates a report if the sorted alternatives are different from
              * the unsorted ones.
              */
-            function enforceSorted(run: Run<Alternative>): void {
+            function enforceSorted(
+                run: Run<Alternative | StringAlternative>,
+                alternatives:
+                    | "alternatives of this group"
+                    | "string alternatives",
+            ): void {
                 const sorted = run.elements
                 const parent = sorted[0].parent
                 const unsorted = parent.alternatives.slice(
@@ -619,6 +701,7 @@ export default createRule("sort-alternatives", {
                     node,
                     loc,
                     messageId: "sort",
+                    data: { alternatives },
                     fix: fixReplaceNode(parent, () => {
                         const prefix = parent.raw.slice(
                             0,
@@ -682,10 +765,29 @@ export default createRule("sort-alternatives", {
                 }
             }
 
+            /** The handler for ClassStringDisjunction */
+            function onClassStringDisjunction(
+                parent: ClassStringDisjunction,
+            ): void {
+                if (parent.alternatives.length < 2) {
+                    return
+                }
+
+                const alternatives = [...parent.alternatives]
+                sortStringAlternatives(alternatives, parser, flags)
+                trySortNumberAlternatives(alternatives)
+                const run: Run<StringAlternative> = {
+                    startIndex: 0,
+                    elements: [...alternatives],
+                }
+                enforceSorted(run, "string alternatives")
+            }
+
             return {
                 onGroupEnter: onParent,
                 onPatternEnter: onParent,
                 onCapturingGroupEnter: onParent,
+                onClassStringDisjunctionEnter: onClassStringDisjunction,
             }
         }
 

diff --git a/lib/utils/lexicographically-smallest.ts b/lib/utils/lexicographically-smallest.ts
@@ -0,0 +1,89 @@
+import type { Word } from "refa"
+import type { JS } from "refa"
+
+function findMin<T>(
+    array: readonly T[],
+    compare: (a: T, b: T) => number,
+): T | undefined {
+    if (array.length === 0) {
+        return undefined
+    }
+
+    let min = array[0]
+    for (let i = 1; i < array.length; i++) {
+        const item = array[i]
+        if (compare(item, min) < 0) {
+            min = item
+        }
+    }
+    return min
+}
+
+function compareWords(a: Word, b: Word): number {
+    const l = Math.min(a.length, b.length)
+    for (let i = 0; i < l; i++) {
+        const diff = a[i] - b[i]
+        if (diff !== 0) {
+            return diff
+        }
+    }
+    return a.length - b.length
+}
+
+/**
+ * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
+ */
+export function getLexicographicallySmallest(
+    set: JS.UnicodeSet,
+): Word | undefined {
+    if (set.accept.isEmpty) {
+        return set.chars.isEmpty ? undefined : [set.chars.ranges[0].min]
+    }
+
+    const words = set.accept.wordSets.map(
+        (w): Word => w.map((c) => c.ranges[0].min),
+    )
+    return findMin(words, compareWords)
+}
+
+/**
+ * Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
+ */
+export function getLexicographicallySmallestInConcatenation(
+    elements: readonly JS.UnicodeSet[],
+): Word | undefined {
+    if (elements.length === 1) {
+        return getLexicographicallySmallest(elements[0])
+    }
+
+    let smallest: Word = []
+    for (let i = elements.length - 1; i >= 0; i--) {
+        const set = elements[i]
+        if (set.isEmpty) {
+            return undefined
+        } else if (set.accept.isEmpty) {
+            smallest.unshift(set.chars.ranges[0].min)
+        } else {
+            let words = [
+                ...(set.chars.isEmpty ? [] : [[set.chars]]),
+                ...set.accept.wordSets,
+            ].map((w): Word => w.map((c) => c.ranges[0].min))
+            // we only have to consider the lexicographically smallest words with unique length
+            const seenLengths = new Set<number>()
+            words = words.sort(compareWords).filter((w) => {
+                if (seenLengths.has(w.length)) {
+                    return false
+                }
+                seenLengths.add(w.length)
+                return true
+            })
+
+            smallest = findMin(
+                // eslint-disable-next-line no-loop-func -- x
+                words.map((w): Word => [...w, ...smallest]),
+                compareWords,
+            )!
+        }
+    }
+    return smallest
+}