Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve regexp/sort-alternatives rule to add support for string alternatives and v flag #587

Merged
merged 10 commits into from
Sep 21, 2023
5 changes: 5 additions & 0 deletions .changeset/rich-ways-exercise.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"eslint-plugin-regexp": minor
---

Improve `regexp/sort-alternatives` rule to add support for string alternatives and v flag
5 changes: 4 additions & 1 deletion docs/rules/sort-alternatives.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ This rule will sort alternatives to improve readability and maintainability.

The primary target of this rule are lists of words and/or numbers. These lists are somewhat common, and sorting them makes it easy for readers to check whether a particular word or number is included.

This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.
This rule will only sort alternatives if reordering the alternatives doesn't affect the pattern.\
However, character classes containing strings are ensured to match the longest string, so they can always be sorted.

<eslint-code-block fix>

Expand All @@ -29,11 +30,13 @@ This rule will only sort alternatives if reordering the alternatives doesn't aff
/* ✓ GOOD */
var foo = /\b(1|2|3)\b/;
var foo = /\b(alpha|beta|gamma)\b/;
var foo = /[\q{blue|green|red}]/v;

/* ✗ BAD */
var foo = /\b(2|1|3)\b/;
var foo = /__(?:Foo|Bar)__/;
var foo = /\((?:TM|R|C)\)/;
var foo = /[\q{red|green|blue}]/v;
```

</eslint-code-block>
Expand Down
144 changes: 123 additions & 21 deletions lib/rules/sort-alternatives.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ import type {
Character,
CharacterClass,
CharacterSet,
ClassStringDisjunction,
Element,
Node,
ExpressionCharacterClass,
Pattern,
StringAlternative,
} from "@eslint-community/regexpp/ast"
import type { RegExpContext } from "../utils"
import {
Expand All @@ -30,12 +32,14 @@ import {
hasSomeDescendant,
canReorder,
getLongestPrefix,
toCharSet,
getConsumedChars,
toUnicodeSet,
hasStrings,
} from "regexp-ast-analysis"
import type { CharSet, Word, ReadonlyWord } from "refa"
import { NFA, JS, transform } from "refa"
import { getParser } from "../utils/regexp-ast"
import { getLexicographicallySmallestInConcatenation } from "../utils/lexicographically-smallest"

interface AllowedChars {
allowed: CharSet
Expand All @@ -45,7 +49,10 @@ const cache = new Map<string, Readonly<AllowedChars>>()

function getAllowedChars(flags: ReadonlyFlags) {
assertValidFlags(flags)
const cacheKey = (flags.ignoreCase ? "i" : "") + (flags.unicode ? "u" : "")
const cacheKey =
(flags.ignoreCase ? "i" : "") +
(flags.unicode ? "u" : "") +
(flags.unicodeSets ? "v" : "")
let result = cache.get(cacheKey)
if (result === undefined) {
result = {
Expand Down Expand Up @@ -86,7 +93,8 @@ function containsOnlyLiterals(
d.type === "Backreference" ||
d.type === "CharacterSet" ||
(d.type === "Quantifier" && d.max === Infinity) ||
(d.type === "CharacterClass" && d.negate)
(d.type === "CharacterClass" && d.negate) ||
(d.type === "ExpressionCharacterClass" && d.negate)
)
},
(d) => d.type !== "Assertion",
Expand Down Expand Up @@ -156,29 +164,45 @@ function approximateLexicographicallySmallest(
return getLexicographicallySmallestFromCharSets(prefix)
}

function getLexicographicallySmallestFromAlternative(
alternative: Alternative,
parser: JS.Parser,
flags: ReadonlyFlags,
): Word | undefined
function getLexicographicallySmallestFromAlternative(
alternative: StringAlternative,
parser: JS.Parser,
flags: ReadonlyFlags,
): Word
/**
* If defined, this will return the lexicographically smallest string accepted
* by the given alternative (ignoring assertions).
*/
function getLexicographicallySmallestFromAlternative(
alternative: Alternative,
alternative: Alternative | StringAlternative,
parser: JS.Parser,
flags: ReadonlyFlags,
): Word | undefined {
const { elements } = alternative
if (isOnlyCharacters(elements)) {
if (
alternative.type === "StringAlternative" ||
hasOnlyCharacters(alternative, flags)
) {
// fast path to avoid converting simple alternatives into NFAs
const smallest: Word = []
for (const e of elements) {
// FIXME: TS Error
// @ts-expect-error -- FIXME
const cs = toCharSet(e, flags)
for (const e of alternative.elements) {
const cs = toUnicodeSet(e, flags).chars
if (cs.isEmpty) return undefined
smallest.push(cs.ranges[0].min)
}
return smallest
}

if (isOnlyCharacterElements(alternative.elements)) {
return getLexicographicallySmallestInConcatenation(
alternative.elements.map((e) => toUnicodeSet(e, flags)),
)
}

try {
const result = parser.parseElement(alternative, {
assertions: "unknown",
Expand Down Expand Up @@ -212,15 +236,45 @@ function getLexicographicallySmallestFromAlternative(
}
}

/** Returns whether the given array of nodes contains only characters. */
function isOnlyCharacters(
nodes: readonly Node[],
): nodes is readonly (Character | CharacterClass | CharacterSet)[] {
/**
* Returns whether the given array of nodes contains only characters.
* But note that if the pattern has the v flag, the character class may contain strings.
*/
function isOnlyCharacterElements(
nodes: Element[],
): nodes is (
| Character
| CharacterClass
| CharacterSet
| ExpressionCharacterClass
)[] {
return nodes.every(
(e) =>
e.type === "Character" ||
e.type === "CharacterClass" ||
e.type === "CharacterSet",
e.type === "CharacterSet" ||
e.type === "ExpressionCharacterClass",
)
}

/**
* Returns whether the given alternative has contains only characters.
* The v flag in the pattern does not contains the string.
*/
function hasOnlyCharacters(
alternative: Alternative,
flags: ReadonlyFlags,
): alternative is Alternative & {
elements: readonly (
| Character
| CharacterClass
| CharacterSet
| ExpressionCharacterClass
)[]
} {
return (
isOnlyCharacterElements(alternative.elements) &&
alternative.elements.every((e) => !hasStrings(e, flags))
)
}

Expand Down Expand Up @@ -433,6 +487,28 @@ function sortAlternatives(
})
}

/**
* Sorts the given string alternatives.
*
* Sorting is done by comparing the lexicographically smallest strings (LSS).
*
* For more information on why we use LSS-based comparison and how it works,
* see https://github.com/ota-meshi/eslint-plugin-regexp/pull/423.
*/
function sortStringAlternatives(
alternatives: StringAlternative[],
parser: JS.Parser,
flags: ReadonlyFlags,
): void {
alternatives.sort((a, b) => {
const lssDiff = compareWords(
getLexicographicallySmallestFromAlternative(a, parser, flags),
getLexicographicallySmallestFromAlternative(b, parser, flags),
)
return lssDiff
})
}

/**
* Returns whether the given string is a valid integer.
* @param str
Expand All @@ -446,7 +522,9 @@ function isIntegerString(str: string): boolean {
* This tries to sort the given alternatives by assuming that all alternatives
* are a number.
*/
function trySortNumberAlternatives(alternatives: Alternative[]): void {
function trySortNumberAlternatives(
alternatives: (Alternative | StringAlternative)[],
): void {
const runs = getRuns(alternatives, (a) => isIntegerString(a.raw))
for (const { startIndex, elements } of runs) {
elements.sort((a, b) => {
Expand Down Expand Up @@ -528,7 +606,7 @@ export default createRule("sort-alternatives", {
fixable: "code",
schema: [],
messages: {
sort: "The alternatives of this group can be sorted without affecting the regex.",
sort: "The {{alternatives}} can be sorted without affecting the regex.",
},
type: "suggestion", // "problem",
},
Expand All @@ -551,7 +629,6 @@ export default createRule("sort-alternatives", {
let chars = possibleCharsCache.get(a)
if (chars === undefined) {
chars = getConsumedChars(a, flags).chars
possibleCharsCache.set(a, chars)
}
return chars
}
Expand Down Expand Up @@ -590,14 +667,19 @@ export default createRule("sort-alternatives", {
}
}

enforceSorted(run)
enforceSorted(run, "alternatives of this group")
}

/**
* Creates a report if the sorted alternatives are different from
* the unsorted ones.
*/
function enforceSorted(run: Run<Alternative>): void {
function enforceSorted(
run: Run<Alternative | StringAlternative>,
alternatives:
| "alternatives of this group"
| "string alternatives",
): void {
const sorted = run.elements
const parent = sorted[0].parent
const unsorted = parent.alternatives.slice(
Expand All @@ -619,6 +701,7 @@ export default createRule("sort-alternatives", {
node,
loc,
messageId: "sort",
data: { alternatives },
fix: fixReplaceNode(parent, () => {
const prefix = parent.raw.slice(
0,
Expand Down Expand Up @@ -682,10 +765,29 @@ export default createRule("sort-alternatives", {
}
}

/** The handler for ClassStringDisjunction */
function onClassStringDisjunction(
parent: ClassStringDisjunction,
): void {
if (parent.alternatives.length < 2) {
return
}

const alternatives = [...parent.alternatives]
sortStringAlternatives(alternatives, parser, flags)
trySortNumberAlternatives(alternatives)
const run: Run<StringAlternative> = {
startIndex: 0,
elements: [...alternatives],
}
enforceSorted(run, "string alternatives")
}

return {
onGroupEnter: onParent,
onPatternEnter: onParent,
onCapturingGroupEnter: onParent,
onClassStringDisjunctionEnter: onClassStringDisjunction,
}
}

Expand Down
89 changes: 89 additions & 0 deletions lib/utils/lexicographically-smallest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import type { Word } from "refa"
import type { JS } from "refa"

function findMin<T>(
array: readonly T[],
compare: (a: T, b: T) => number,
): T | undefined {
if (array.length === 0) {
return undefined
}

let min = array[0]
for (let i = 1; i < array.length; i++) {
const item = array[i]
if (compare(item, min) < 0) {
min = item
}
}
return min
}

function compareWords(a: Word, b: Word): number {
const l = Math.min(a.length, b.length)
for (let i = 0; i < l; i++) {
const diff = a[i] - b[i]
if (diff !== 0) {
return diff
}
}
return a.length - b.length
}

/**
* Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
*/
export function getLexicographicallySmallest(
set: JS.UnicodeSet,
): Word | undefined {
if (set.accept.isEmpty) {
return set.chars.isEmpty ? undefined : [set.chars.ranges[0].min]
}

const words = set.accept.wordSets.map(
(w): Word => w.map((c) => c.ranges[0].min),
)
return findMin(words, compareWords)
}

/**
* Returns the lexicographically smallest word in the given set or `undefined` if the set is empty.
*/
export function getLexicographicallySmallestInConcatenation(
elements: readonly JS.UnicodeSet[],
): Word | undefined {
if (elements.length === 1) {
return getLexicographicallySmallest(elements[0])
}

let smallest: Word = []
for (let i = elements.length - 1; i >= 0; i--) {
const set = elements[i]
if (set.isEmpty) {
return undefined
} else if (set.accept.isEmpty) {
smallest.unshift(set.chars.ranges[0].min)
} else {
let words = [
...(set.chars.isEmpty ? [] : [[set.chars]]),
...set.accept.wordSets,
].map((w): Word => w.map((c) => c.ranges[0].min))
// we only have to consider the lexicographically smallest words with unique length
const seenLengths = new Set<number>()
words = words.sort(compareWords).filter((w) => {
if (seenLengths.has(w.length)) {
return false
}
seenLengths.add(w.length)
return true
})

smallest = findMin(
// eslint-disable-next-line no-loop-func -- x
words.map((w): Word => [...w, ...smallest]),
compareWords,
)!
}
}
return smallest
}
Loading
Loading