Implemented limited detection of unmatchable token types. #593

Merged (4 commits, Nov 9, 2017)
60 changes: 60 additions & 0 deletions docs/resolving_lexer_errors.md
@@ -2,6 +2,7 @@

* [No LINE_BREAKS Error.](#LINE_BREAKS)
* [Unexpected RegExp Anchor Error.](#ANCHORS)
* [Token Can Never Be Matched.](#UNREACHABLE)


### <a name="LINE_BREAKS"></a> No LINE_BREAKS Error.
@@ -97,5 +98,64 @@ const semVer = createToken({
})
```



### <a name="UNREACHABLE"></a> Token can never be matched.

This error means that a Token type can never be successfully matched, because a
**previous** Token type in the lexer definition will **always** be matched instead.
This happens because Chevrotain's default behavior is to attempt to match
tokens **in the order** in which they appear in the lexer definition.

For example:

```javascript
const ForKeyword = createToken({
name: "ForKeyword",
pattern: /for/
})

const Identifier = createToken({
name: "Identifier",
pattern: /[a-zA-Z]+/
})

// Will throw Token <ForKeyword> can never be matched...
// Because the input "for" is also a valid identifier
// and matching an identifier will be attempted first.
const myLexer = new chevrotain.Lexer([Identifier, ForKeyword])
```

* Note that this validation is limited to simple patterns such as keywords.
Detecting the more general case, in which any pattern is a strict subset of a preceding pattern,
would require much more in-depth RegExp analysis capabilities.
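For instance, the plain-JavaScript sketch below (an illustration of the subset test, not Chevrotain's actual implementation) shows why only meta-character-free patterns are checked: the test simply executes the preceding pattern against the keyword's text, so a shadowed keyword whose own pattern contains meta characters would go undetected.

```javascript
// Sketch of the subset test: a token is unreachable when a preceding
// pattern already matches that token's text starting at offset 0.
const identifierPattern = /[a-zA-Z]+/

function isShadowed(precedingPattern, keywordText) {
    const match = precedingPattern.exec(keywordText)
    return match !== null && match.index === 0
}

isShadowed(identifierPattern, "for") // true -> "for" can never be matched

// A keyword pattern containing meta characters, e.g. /for(each)?/, is
// skipped by the validation even though it can be shadowed just the same.
```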

To resolve this, simply re-arrange the order of Token types in the lexer
definition so that the more specific Token types are listed first.

```javascript
// Identifier is now listed as the last Token type.
const myLexer = new chevrotain.Lexer([ForKeyword, Identifier])
```

Note that the solution provided above creates a new problem:
any identifier **starting with** "for" will now be lexed as **two separate** tokens,
a ForKeyword followed by an Identifier. For example:

```javascript
const myLexer = new chevrotain.Lexer([ForKeyword, Identifier])

// [
// {image:"for"}
// {image:"ward"}
// ]
const tokensResult = myLexer.tokenize("forward")
```

To resolve this second problem, see how to prefer the **longest match**,
as demonstrated in the [keywords vs identifiers example][keywords_idents].
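The idea behind preferring the longest match can be sketched in plain JavaScript (this is only an illustration of the principle, not Chevrotain's actual mechanism): attempt both alternatives at the current offset and keep whichever one consumed more characters.

```javascript
// Minimal longest-match sketch: the keyword wins only when the
// identifier alternative cannot consume more characters than it.
function tokenizeOne(text) {
    const keywordMatch = /^for/.exec(text)
    const identMatch = /^[a-zA-Z]+/.exec(text)
    if (
        keywordMatch &&
        (!identMatch || keywordMatch[0].length >= identMatch[0].length)
    ) {
        return { type: "ForKeyword", image: keywordMatch[0] }
    }
    return identMatch ? { type: "Identifier", image: identMatch[0] } : null
}

tokenizeOne("for") // { type: "ForKeyword", image: "for" }
tokenizeOne("forward") // { type: "Identifier", image: "forward" }
```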


[position_tracking]: http://sap.github.io/chevrotain/documentation/0_34_0/interfaces/_chevrotain_d_.ilexerconfig.html#positiontracking
[line_terminator_docs]: http://sap.github.io/chevrotain/documentation/0_34_0/interfaces/_chevrotain_d_.ilexerconfig.html#lineTerminatorsPattern
[keywords_idents]: https://github.com/SAP/Chevrotain/blob/master/examples/lexer/keywords_vs_identifiers/keywords_vs_identifiers.js
14 changes: 8 additions & 6 deletions examples/grammars/css/css.js
@@ -96,11 +96,6 @@
name: "Func",
pattern: MAKE_PATTERN("{{ident}}\\(")
})
// Ident must be before Minus
var Ident = createToken({
name: "Ident",
pattern: MAKE_PATTERN("{{ident}}")
})

var Cdo = createToken({ name: "Cdo", pattern: /<!--/ })
// Cdc must be before Minus
@@ -121,7 +116,6 @@
var Equals = createToken({ name: "Equals", pattern: /=/ })
var Star = createToken({ name: "Star", pattern: /\*/ })
var Plus = createToken({ name: "Plus", pattern: /\+/ })
var Minus = createToken({ name: "Minus", pattern: /-/ })
var GreaterThan = createToken({ name: "GreaterThan", pattern: />/ })
var Slash = createToken({ name: "Slash", pattern: /\// })

@@ -257,6 +251,14 @@
pattern: MAKE_PATTERN("{{num}}")
})

// Ident must be before Minus
var Ident = createToken({
name: "Ident",
pattern: MAKE_PATTERN("{{ident}}")
})

var Minus = createToken({ name: "Minus", pattern: /-/ })

var CssLexer = new Lexer(cssTokens)

// ----------------- parser -----------------
96 changes: 95 additions & 1 deletion src/scan/lexer.ts
@@ -27,7 +27,8 @@ import {
map,
reduce,
reject,
mapValues
mapValues,
cloneArr
} from "../utils/utils"
import { flatten } from "../utils/utils"

@@ -252,6 +253,8 @@ export function validatePatterns(
findModesThatDoNotExist(validTokenClasses, validModesNames)
)

errors = errors.concat(findUnreachablePatterns(validTokenClasses))

return errors
}

@@ -537,6 +540,97 @@ export function findModesThatDoNotExist(
return errors
}

export function findUnreachablePatterns(
tokenClasses: TokenConstructor[]
): ILexerDefinitionError[] {
const errors = []

const canBeTested = reduce(
tokenClasses,
(result, tokClass, idx) => {
const pattern = tokClass.PATTERN

if (pattern === Lexer.NA) {
return result
}

// a more comprehensive validation for all forms of regExps would require
// deeper regExp analysis capabilities
if (isString(pattern)) {
result.push({ str: pattern, idx, tokenType: tokClass })
} else if (isRegExp(pattern) && noMetaChar(pattern)) {
result.push({ str: pattern.source, idx, tokenType: tokClass })
}
return result
},
[]
)

forEach(tokenClasses, (tokClass, testIdx) => {
forEach(canBeTested, ({ str, idx, tokenType }) => {
if (testIdx < idx && testTokenClass(str, tokClass.PATTERN)) {
let msg =
`Token: ->${tokenName(
tokenType
)}<- can never be matched.\n` +
`Because it appears AFTER the token ->${tokenName(
tokClass
)}<-` +
` in the lexer's definition.\n` +
`See https://github.com/SAP/chevrotain/blob/master/docs/resolving_lexer_errors.md#UNREACHABLE`
errors.push({
message: msg,
type: LexerDefinitionErrorType.UNREACHABLE_PATTERN,
tokenClasses: [tokClass, tokenType]
})
}
})
})

return errors
}

function testTokenClass(str: string, pattern: any): boolean {
if (isRegExp(pattern)) {
const regExpArray = pattern.exec(str)
return regExpArray !== null && regExpArray.index === 0
} else if (isFunction(pattern)) {
// maintain the API of custom patterns
return pattern(str, 0, [], {})
} else if (has(pattern, "exec")) {
// maintain the API of custom patterns
return pattern.exec(str, 0, [], {})
} else if (typeof pattern === "string") {
return pattern === str
} else {
/* istanbul ignore next */
throw Error("non exhaustive match")
}
}

function noMetaChar(regExp: RegExp): boolean {
//https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp
const metaChars = [
".",
"\\",
"[",
"]",
"|",
"^",
"$",
"(",
")",
"?",
"*",
"+",
"{"
]
return (
find(metaChars, char => regExp.source.indexOf(char) !== -1) ===
undefined
)
}

export function addStartOfInput(pattern: RegExp): RegExp {
let flags = pattern.ignoreCase ? "i" : ""
// always wrapping in a none capturing group preceded by '^' to make sure matching can only work on start of input.
3 changes: 2 additions & 1 deletion src/scan/lexer_public.ts
@@ -68,7 +68,8 @@ export enum LexerDefinitionErrorType {
LEXER_DEFINITION_CANNOT_CONTAIN_UNDEFINED,
SOI_ANCHOR_FOUND,
EMPTY_MATCH_PATTERN,
NO_LINE_BREAKS_FLAGS
NO_LINE_BREAKS_FLAGS,
UNREACHABLE_PATTERN
}

export interface ILexerDefinitionError {
25 changes: 25 additions & 0 deletions test/scan/lexer_spec.ts
@@ -25,6 +25,7 @@ import {
findInvalidPatterns,
findMissingPatterns,
findStartOfInputAnchor,
findUnreachablePatterns,
findUnsupportedFlags,
SUPPORT_STICKY
} from "../../src/scan/lexer"
@@ -326,6 +327,30 @@ function defineLexerSpecs(
expect(errors[0].message).to.contain("InvalidToken")
})

it("will detect unreachable patterns", () => {
const ClassKeyword = createToken({
name: "ClassKeyword",
pattern: /class/
})

const Identifier = createToken({
name: "Identifier",
pattern: /\w+/
})

let tokenClasses = [Identifier, ClassKeyword]
let errors = findUnreachablePatterns(tokenClasses)
expect(errors.length).to.equal(1)
expect(errors[0].tokenClasses).to.deep.equal([
Identifier,
ClassKeyword
])
expect(errors[0].type).to.equal(
LexerDefinitionErrorType.UNREACHABLE_PATTERN
)
expect(errors[0].message).to.contain("can never be matched")
})

it("won't detect negation as using unsupported start of input anchor", () => {
let negationPattern = createToken({
name: "negationPattern",