Core: Fixed greedy targeting bug (#1932)
The `target` parameter defines the token which causes rematching. Rematching is done by recursively calling `matchGrammar`, where `target` is the name of the token currently being matched. The rematching runs until either a match is found or the target token is reached, whichever comes first.

The bug is that the target token was identified by its name alone. A name does not uniquely identify a token in a grammar, because a grammar entry can be an array, in which case several patterns share the same name.

This commit fixes the bug by identifying the target by both the token name and the pattern's index within that name's array (`token + ',' + j`).
RunDevelopment committed Aug 29, 2019
1 parent 3509f3e commit e864d51
Showing 7 changed files with 114 additions and 30 deletions.
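
To make the ambiguity described in the commit message concrete, here is a minimal sketch in plain JavaScript. It is not Prism code; the grammar shape simply mirrors the test added in tests/core/greedy.js below. A bare token name cannot distinguish between two patterns that share that name, so the fix identifies a pattern by the name plus its index in the array:

// Minimal sketch, not Prism internals: a grammar entry may be an array of patterns.
var grammar = {
	'comment': [
		/\/\/.*/,                                           // index 0: line comment
		{ pattern: /\/\*[\s\S]*?(?:\*\/|$)/, greedy: true } // index 1: greedy block comment
	]
};

// Old rematch target: just the token name, so 'comment' could mean either pattern above.
// New rematch target: name + ',' + index, which is unique per pattern.
function patternId(token, j) {
	return token + ',' + j;
}

grammar['comment'].forEach(function (pattern, j) {
	console.log(patternId('comment', j)); // logs "comment,0", then "comment,1"
});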
14 changes: 7 additions & 7 deletions components/prism-core.js
@@ -279,18 +279,18 @@ var _ = {

 	matchGrammar: function (text, strarr, grammar, index, startPos, oneshot, target) {
 		for (var token in grammar) {
-			if(!grammar.hasOwnProperty(token) || !grammar[token]) {
+			if (!grammar.hasOwnProperty(token) || !grammar[token]) {
 				continue;
 			}

-			if (token == target) {
-				return;
-			}
-
 			var patterns = grammar[token];
-			patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];
+			patterns = Array.isArray(patterns) ? patterns : [patterns];

 			for (var j = 0; j < patterns.length; ++j) {
+				if (target && target == token + ',' + j) {
+					return;
+				}
+
 				var pattern = patterns[j],
 					inside = pattern.inside,
 					lookbehind = !!pattern.lookbehind,
@@ -394,7 +394,7 @@ var _ = {
 					Array.prototype.splice.apply(strarr, args);

 					if (delNum != 1)
-						_.matchGrammar(text, strarr, grammar, i, pos, true, token);
+						_.matchGrammar(text, strarr, grammar, i, pos, true, token + ',' + j);

 					if (oneshot)
 						break;
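
The effect of moving the target check into the pattern loop can be seen in a small, hedged model of the skipping logic above. The generator below is an illustration written for this page, not code from the commit; only the `token + ',' + j` check mirrors the diff:

// Simplified model of the loop-skipping logic in the hunk above.
// A rematch must retry every pattern that comes before the one that triggered it,
// then stop exactly at that pattern.
function* patternsToRetry(grammar, target) {
	for (var token in grammar) {
		var patterns = Array.isArray(grammar[token]) ? grammar[token] : [grammar[token]];
		for (var j = 0; j < patterns.length; ++j) {
			if (target && target == token + ',' + j) {
				return; // reached the pattern that caused the rematch -> stop here
			}
			yield token + ',' + j; // this pattern gets retried
		}
	}
}

var grammar = { 'comment': [/\/\/.*/, { pattern: /\/\*[\s\S]*?(?:\*\/|$)/, greedy: true }] };

// A rematch targeted at the greedy block-comment pattern ("comment,1")
// still retries the earlier line-comment regex:
console.log([...patternsToRetry(grammar, 'comment,1')]); // [ 'comment,0' ]

// Under the old name-only check, reaching the token named 'comment' ended the
// rematch immediately, so the line-comment regex was never retried.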
2 changes: 1 addition & 1 deletion components/prism-core.min.js

This is a generated (minified) file; its diff is not rendered.

3 changes: 2 additions & 1 deletion package.json
@@ -6,11 +6,12 @@
   "style": "themes/prism.css",
   "scripts": {
     "test:aliases": "mocha tests/aliases-test.js",
+    "test:core": "mocha tests/core/**/*.js",
     "test:languages": "mocha tests/run.js",
     "test:plugins": "mocha tests/plugins/**/*.js",
     "test:regex": "mocha tests/regex-tests.js",
     "test:runner": "mocha tests/testrunner-tests.js",
-    "test": "npm run test:runner && npm run test:languages && npm run test:plugins && npm run test:aliases && npm run test:regex"
+    "test": "npm run test:runner && npm run test:core && npm run test:languages && npm run test:plugins && npm run test:aliases && npm run test:regex"
   },
   "repository": {
     "type": "git",
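With the added script, the new core tests can be run on their own via `npm run test:core` (mocha over `tests/core/**/*.js`), and the aggregate `npm test` command now runs them as part of the full suite.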
14 changes: 7 additions & 7 deletions prism.js
@@ -284,18 +284,18 @@ var _ = {

 	matchGrammar: function (text, strarr, grammar, index, startPos, oneshot, target) {
 		for (var token in grammar) {
-			if(!grammar.hasOwnProperty(token) || !grammar[token]) {
+			if (!grammar.hasOwnProperty(token) || !grammar[token]) {
 				continue;
 			}

-			if (token == target) {
-				return;
-			}
-
 			var patterns = grammar[token];
-			patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];
+			patterns = Array.isArray(patterns) ? patterns : [patterns];

 			for (var j = 0; j < patterns.length; ++j) {
+				if (target && target == token + ',' + j) {
+					return;
+				}
+
 				var pattern = patterns[j],
 					inside = pattern.inside,
 					lookbehind = !!pattern.lookbehind,
@@ -399,7 +399,7 @@ var _ = {
 					Array.prototype.splice.apply(strarr, args);

 					if (delNum != 1)
-						_.matchGrammar(text, strarr, grammar, i, pos, true, token);
+						_.matchGrammar(text, strarr, grammar, i, pos, true, token + ',' + j);

 					if (oneshot)
 						break;
69 changes: 69 additions & 0 deletions tests/core/greedy.js
@@ -0,0 +1,69 @@
"use strict";

const { assert } = require('chai');
const PrismLoader = require('../helper/prism-loader');
const TestCase = require('../helper/test-case');


function testTokens({ grammar, code, expected }) {
	const Prism = PrismLoader.createEmptyPrism();
	Prism.languages.test = grammar;

	const simpleTokens = TestCase.simpleTokenize(Prism, code, 'test');

	assert.deepStrictEqual(simpleTokens, expected);
}

describe('Greedy matching', function () {

	it('should correctly handle tokens with the same name', function () {
		testTokens({
			grammar: {
				'comment': [
					/\/\/.*/,
					{
						pattern: /\/\*[\s\S]*?(?:\*\/|$)/,
						greedy: true
					}
				]
			},
			code: '// /*\n/* comment */',
			expected: [
				["comment", "// /*"],
				["comment", "/* comment */"]
			]
		});
	});

	// https://github.com/PrismJS/prism/issues/1492
	/*
	it('should correctly rematch tokens', function () {
		testTokens({
			grammar: {
				'a': {
					pattern: /'[^'\r\n]*'/,
				},
				'b': {
					pattern: /"[^"\r\n]*"/,
					greedy: true,
				},
				'c': {
					pattern: /<[^>\r\n]*>/,
					greedy: true,
				}
			},
			code: `<'> '' ''\n<"> "" ""`,
			expected: [
				["c", "<'>"],
				["a", "''"],
				["a", "''"],
				["c", "<\">"],
				["b", "\"\""],
				["b", "\"\""],
			]
		});
	});
	*/
});

40 changes: 27 additions & 13 deletions tests/helper/test-case.js
@@ -60,19 +60,9 @@ module.exports = {
 		}

 		const Prism = PrismLoader.createInstance(usedLanguages.languages);
-		// the first language is the main language to highlight
-		const mainLanguageGrammar = Prism.languages[usedLanguages.mainLanguage];
-		const env = {
-			code: testCase.testSource,
-			grammar: mainLanguageGrammar,
-			language: usedLanguages.mainLanguage
-		};
-		Prism.hooks.run('before-tokenize', env);
-		env.tokens = Prism.tokenize(env.code, env.grammar);
-		Prism.hooks.run('after-tokenize', env);
-		const compiledTokenStream = env.tokens;
-
-		const simplifiedTokenStream = TokenStreamTransformer.simplify(compiledTokenStream);
+		// the first language is the main language to highlight
+		const simplifiedTokenStream = this.simpleTokenize(Prism, testCase.testSource, usedLanguages.mainLanguage);

 		const tzd = JSON.stringify(simplifiedTokenStream);
 		const exp = JSON.stringify(testCase.expectedTokenStream);
@@ -93,7 +83,31 @@
 				"Expected Token Stream: \n" + exp +
 				"\n-----------------------------------------\n" + diff;

-		const result = assert.deepEqual(simplifiedTokenStream, testCase.expectedTokenStream, testCase.comment + message);
+		assert.deepEqual(simplifiedTokenStream, testCase.expectedTokenStream, testCase.comment + message);
 	},

+	/**
+	 * Returns the simplified token stream of the given code highlighted with `language`.
+	 *
+	 * The `before-tokenize` and `after-tokenize` hooks will also be executed.
+	 *
+	 * @param {any} Prism The Prism instance which will tokenize `code`.
+	 * @param {string} code The code to tokenize.
+	 * @param {string} language The language id.
+	 * @returns {Array<string|Array<string|any[]>>}
+	 */
+	simpleTokenize(Prism, code, language) {
+		const env = {
+			code,
+			grammar: Prism.languages[language],
+			language
+		};
+
+		Prism.hooks.run('before-tokenize', env);
+		env.tokens = Prism.tokenize(env.code, env.grammar);
+		Prism.hooks.run('after-tokenize', env);
+
+		return TokenStreamTransformer.simplify(env.tokens);
+	},
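
For orientation, here is a hedged usage sketch of the new `simpleTokenize` helper outside the existing runner. The grammar, code, and expected output are illustrative assumptions made for this example; the real consumer added by this commit is tests/core/greedy.js above.

// Assumed to live under tests/core/ so the relative requires resolve like in greedy.js.
const PrismLoader = require('../helper/prism-loader');
const TestCase = require('../helper/test-case');

const Prism = PrismLoader.createEmptyPrism();
// Hypothetical grammar registered only for this example.
Prism.languages.example = { 'keyword': /\b(?:if|else)\b/ };

// Runs the before-tokenize/after-tokenize hooks, tokenizes, and simplifies the result.
const tokens = TestCase.simpleTokenize(Prism, 'if (x) {} else {}', 'example');
console.log(tokens);
// Expected shape (exact handling of plain strings depends on the simplifier):
// [ ["keyword", "if"], " (x) {} ", ["keyword", "else"], " {}" ]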
2 changes: 1 addition & 1 deletion tests/helper/token-stream-transformer.js
@@ -6,7 +6,7 @@ module.exports = {
 	 * @typedef TokenStreamItem
 	 * @property {string} type
 	 * @property {string | TokenStreamItem | Array<string|TokenStreamItem>} content
-	*/
+	 */

 	/**
 	 * Simplifies the token stream to ease the matching with the expected token stream.

(Whitespace-only change to the indentation of the typedef comment's closing `*/`.)
