Skip to content

Commit

Permalink
Fix corner cases for the greedy flag
Browse files Browse the repository at this point in the history
Refactoring the tokenize() method into two separate methods, tokenize()
and matchPattern(), makes it possible to recursively clean up any broken
tokens left behind by the greedy feature.

This should fix issue #1075.
  • Loading branch information
zeitgeist87 committed Feb 8, 2017
1 parent 565a2cc commit f1d30ac
Show file tree
Hide file tree
Showing 3 changed files with 207 additions and 191 deletions.
198 changes: 103 additions & 95 deletions components/prism-core.js
Original file line number Diff line number Diff line change
Expand Up @@ -253,131 +253,139 @@ var _ = _self.Prism = {
return Token.stringify(_.util.encode(tokens), language);
},

tokenize: function(text, grammar, language) {
var Token = _.Token;
matchPattern: function (text, strarr, grammar, token) {
var Token = _.Token,
patterns = grammar[token];

patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];

for (var j = 0; j < patterns.length; ++j) {
var pattern = patterns[j],
inside = pattern.inside,
lookbehind = !!pattern.lookbehind,
greedy = !!pattern.greedy,
lookbehindLength = 0,
alias = pattern.alias;

if (greedy && !pattern.pattern.global) {
// Without the global flag, lastIndex won't work
var flags = pattern.pattern.toString().match(/[imuy]*$/)[0];
pattern.pattern = RegExp(pattern.pattern.source, flags + "g");
}

var strarr = [text];
pattern = pattern.pattern || pattern;

var rest = grammar.rest;
// Don’t cache length as it changes during the loop
for (var i=0, pos = 0; i<strarr.length; pos += strarr[i].length, ++i) {

if (rest) {
for (var token in rest) {
grammar[token] = rest[token];
}

delete grammar.rest;
}
var str = strarr[i];

tokenloop: for (var token in grammar) {
if(!grammar.hasOwnProperty(token) || !grammar[token]) {
continue;
}
if (strarr.length > text.length) {
// Something went terribly wrong, ABORT, ABORT!
return;
}

var patterns = grammar[token];
patterns = (_.util.type(patterns) === "Array") ? patterns : [patterns];

for (var j = 0; j < patterns.length; ++j) {
var pattern = patterns[j],
inside = pattern.inside,
lookbehind = !!pattern.lookbehind,
greedy = !!pattern.greedy,
lookbehindLength = 0,
alias = pattern.alias;

if (greedy && !pattern.pattern.global) {
// Without the global flag, lastIndex won't work
var flags = pattern.pattern.toString().match(/[imuy]*$/)[0];
pattern.pattern = RegExp(pattern.pattern.source, flags + "g");
if (str instanceof Token) {
continue;
}

pattern = pattern.pattern || pattern;
pattern.lastIndex = 0;

// Don’t cache length as it changes during the loop
for (var i=0, pos = 0; i<strarr.length; pos += strarr[i].length, ++i) {
var match = pattern.exec(str),
delNum = 1, greedyCleanup = null;

var str = strarr[i];
// Greedy patterns can override/remove up to two previously matched tokens
if (!match && greedy && i != strarr.length - 1) {
pattern.lastIndex = pos;
match = pattern.exec(text);
if (!match) {
break;
}

if (strarr.length > text.length) {
// Something went terribly wrong, ABORT, ABORT!
break tokenloop;
var from = match.index + (lookbehind ? match[1].length : 0),
to = match.index + match[0].length,
k = i,
p = pos;

for (var len = strarr.length; k < len && p < to; ++k) {
p += strarr[k].length;
// Move the index i to the element in strarr that is closest to from
if (from >= p) {
++i;
pos = p;
}
}

if (str instanceof Token) {
/*
* If strarr[i] is a Token, then the match starts inside another Token, which is invalid
* If strarr[k - 1] is greedy we are in conflict with another greedy pattern
*/
if (strarr[i] instanceof Token || strarr[k - 1].greedy) {
continue;
}

pattern.lastIndex = 0;
// Number of tokens to delete and replace with the new match
delNum = k - i;
greedyCleanup = strarr[k - 1].type || null;
str = text.slice(pos, p);
match.index -= pos;
}

var match = pattern.exec(str),
delNum = 1;
if (!match) {
continue;
}

// Greedy patterns can override/remove up to two previously matched tokens
if (!match && greedy && i != strarr.length - 1) {
pattern.lastIndex = pos;
match = pattern.exec(text);
if (!match) {
break;
}
if(lookbehind) {
lookbehindLength = match[1].length;
}

var from = match.index + (lookbehind ? match[1].length : 0),
to = match.index + match[0].length,
k = i,
p = pos;

for (var len = strarr.length; k < len && p < to; ++k) {
p += strarr[k].length;
// Move the index i to the element in strarr that is closest to from
if (from >= p) {
++i;
pos = p;
}
}
var from = match.index + lookbehindLength,
match = match[0].slice(lookbehindLength),
to = from + match.length,
before = str.slice(0, from),
after = str.slice(to);

/*
* If strarr[i] is a Token, then the match starts inside another Token, which is invalid
* If strarr[k - 1] is greedy we are in conflict with another greedy pattern
*/
if (strarr[i] instanceof Token || strarr[k - 1].greedy) {
continue;
}
var args = [i, delNum];

// Number of tokens to delete and replace with the new match
delNum = k - i;
str = text.slice(pos, p);
match.index -= pos;
}
if (before) {
args.push(before);
}

if (!match) {
continue;
}
var wrapped = new Token(token, inside? _.tokenize(match, inside) : match, alias, match, greedy);

if(lookbehind) {
lookbehindLength = match[1].length;
}
args.push(wrapped);

var from = match.index + lookbehindLength,
match = match[0].slice(lookbehindLength),
to = from + match.length,
before = str.slice(0, from),
after = str.slice(to);
if (after) {
args.push(after);
}

var args = [i, delNum];
Array.prototype.splice.apply(strarr, args);

if (before) {
args.push(before);
}
if (greedyCleanup)
_.matchPattern(text, strarr, grammar, greedyCleanup);
}
}
},

var wrapped = new Token(token, inside? _.tokenize(match, inside) : match, alias, match, greedy);
tokenize: function(text, grammar, language) {
var strarr = [text];

args.push(wrapped);
var rest = grammar.rest;

if (after) {
args.push(after);
}
if (rest) {
for (var token in rest) {
grammar[token] = rest[token];
}

Array.prototype.splice.apply(strarr, args);
}
delete grammar.rest;
}

for (var token in grammar) {
if(!grammar.hasOwnProperty(token) || !grammar[token]) {
continue;
}

_.matchPattern(text, strarr, grammar, token);
}

return strarr;
Expand Down
2 changes: 1 addition & 1 deletion components/prism-core.min.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit f1d30ac

Please sign in to comment.