Skip to content

Commit

Permalink
Auto merge of #354 - robinst:fix-panics-in-extended-mode-by-being-mor…
Browse files Browse the repository at this point in the history
…e-strict, r=BurntSushi

Fix panics with whitespace in extended mode by being more strict

Instead of ignoring space in all the bump/peek methods (as proposed in
pull request #349), have an explicit `ignore_space` method that can be
used in places where space/comments should be allowed.

This makes parsing a bit stricter than before as well.
  • Loading branch information
bors committed May 20, 2017
2 parents 68bc958 + 6f32d2f commit c15170a
Showing 1 changed file with 131 additions and 43 deletions.
174 changes: 131 additions & 43 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ impl Parser {
// Starts at the beginning of the input and consumes until either the end
// of input or an error.
fn parse_expr(mut self) -> Result<Expr> {
while !self.eof() {
loop {
self.ignore_space();
if self.eof() {
break;
}
let build_expr = match self.cur() {
'\\' => try!(self.parse_escape()),
'|' => { let e = try!(self.alternate()); self.bump(); e }
Expand Down Expand Up @@ -177,7 +181,7 @@ impl Parser {
return Err(self.err(ErrorKind::UnexpectedEscapeEof));
}
let c = self.cur();
if is_punct(c) {
if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) {
let c = self.bump();
return Ok(try!(self.lit(c)));
}
Expand Down Expand Up @@ -234,6 +238,7 @@ impl Parser {
let chari = self.chari;
let mut name: CaptureName = None;
self.bump();
self.ignore_space();
if self.bump_if("?P<") {
let n = try!(self.parse_group_name());
if self.names.iter().any(|n2| n2 == &n) {
Expand Down Expand Up @@ -370,13 +375,16 @@ impl Parser {
return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
}
self.bump();
let min = try!(self.parse_decimal(|c| c != ',' && c != '}'));
self.ignore_space();
let min = try!(self.parse_decimal());
let mut max_opt = Some(min);
self.ignore_space();
if self.bump_if(',') {
self.ignore_space();
if self.peek_is('}') {
max_opt = None;
} else {
let max = try!(self.parse_decimal(|c| c != '}'));
let max = try!(self.parse_decimal());
if min > max {
// e.g., a{2,1}
return Err(self.err(ErrorKind::InvalidRepeatRange {
Expand All @@ -387,6 +395,7 @@ impl Parser {
max_opt = Some(max);
}
}
self.ignore_space();
if !self.bump_if('}') {
Err(self.err(ErrorKind::UnclosedRepeat))
} else {
Expand Down Expand Up @@ -423,8 +432,8 @@ impl Parser {
//
// Start: `1`
// End: `,` (where `until == ','`)
fn parse_decimal<B: Bumpable>(&mut self, until: B) -> Result<u32> {
match self.bump_get(until) {
fn parse_decimal(&mut self) -> Result<u32> {
match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) {
// e.g., a{}
None => Err(self.err(ErrorKind::MissingBase10)),
Some(n) => {
Expand Down Expand Up @@ -472,6 +481,7 @@ impl Parser {
// Start: `{`
// End: `b`
fn parse_hex(&mut self) -> Result<Build> {
self.ignore_space();
if self.bump_if('{') {
self.parse_hex_many_digits()
} else {
Expand All @@ -486,9 +496,11 @@ impl Parser {
fn parse_hex_many_digits(&mut self) -> Result<Build> {
use std::char;

let s = self.bump_get(|c| c != '}').unwrap_or("".into());
self.ignore_space();
let s = self.bump_get(is_ascii_word).unwrap_or("".into());
let n = try!(u32::from_str_radix(&s, 16)
.map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
self.ignore_space();
if !self.bump_if('}') {
// e.g., a\x{d
return Err(self.err(ErrorKind::UnclosedHex));
Expand Down Expand Up @@ -530,12 +542,16 @@ impl Parser {
// End: `+`
fn parse_class(&mut self) -> Result<Build> {
self.bump();
self.ignore_space();
let negated = self.bump_if('^');
self.ignore_space();
let mut class = CharClass::empty();
while self.bump_if('-') {
self.ignore_space();
class.ranges.push(ClassRange::one('-'));
}
loop {
self.ignore_space();
if self.eof() {
// e.g., [a
return Err(self.err(ErrorKind::UnexpectedClassEof));
Expand Down Expand Up @@ -631,11 +647,13 @@ impl Parser {
// End: `]`
fn parse_class_range(&mut self, class: &mut CharClass, start: char)
-> Result<()> {
self.ignore_space();
if !self.bump_if('-') {
// Not a range, so just push a singleton range.
class.ranges.push(ClassRange::one(start));
return Ok(());
}
self.ignore_space();
if self.eof() {
// e.g., [a-
return Err(self.err(ErrorKind::UnexpectedClassEof));
Expand Down Expand Up @@ -730,9 +748,12 @@ impl Parser {
//
// `negate` is true when the class name is used with `\P`.
fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
self.ignore_space();
let name =
if self.bump_if('{') {
let n = self.bump_get(|c| c != '}').unwrap_or("".into());
self.ignore_space();
let n = self.bump_get(is_ascii_word).unwrap_or("".into());
self.ignore_space();
if n.is_empty() || !self.bump_if('}') {
// e.g., \p{Greek
return Err(self.err(ErrorKind::UnclosedUnicodeName));
Expand Down Expand Up @@ -796,7 +817,31 @@ impl Parser {
// Auxiliary helper methods.
impl Parser {
fn chars(&self) -> Chars {
Chars::new(&self.chars[self.chari..], self.flags.ignore_space)
Chars::new(&self.chars[self.chari..])
}

fn ignore_space(&mut self) {
if !self.flags.ignore_space {
return;
}
while !self.eof() {
match self.cur() {
'#' => {
self.bump();
while !self.eof() {
match self.bump() {
'\n' => break,
_ => continue,
}
}
},
c => if !c.is_whitespace() {
return;
} else {
self.bump();
}
}
}
}

fn bump(&mut self) -> char {
Expand Down Expand Up @@ -924,48 +969,22 @@ impl Parser {
struct Chars<'a> {
chars: &'a [char],
cur: usize,
ignore_space: bool,
}

impl<'a> Iterator for Chars<'a> {
type Item = char;
fn next(&mut self) -> Option<char> {
if !self.ignore_space {
let x = self.c();
self.advance();
return x;
}
while let Some(c) = self.c() {
self.advance();
match c {
'\\' => return match self.c() {
Some('#') => {self.advance(); Some('#')}
_ => Some('\\')
},
'#' => loop {
match self.c() {
Some(c) => {
self.advance();
if c == '\n' {
break;
}
},
None => return None
}
},
_ => if !c.is_whitespace() {return Some(c);}
}
}
None
let x = self.c();
self.advance();
return x;
}
}

impl<'a> Chars<'a> {
fn new(chars: &[char], ignore_space: bool) -> Chars {
fn new(chars: &[char]) -> Chars {
Chars {
chars: chars,
cur: 0,
ignore_space: ignore_space,
}
}

Expand Down Expand Up @@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool {
|| (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

fn is_ascii_word(c: char) -> bool {
match c {
'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true,
_ => false,
}
}

/// Returns true if the give character has significance in a regex.
pub fn is_punct(c: char) -> bool {
match c {
Expand Down Expand Up @@ -2368,10 +2394,49 @@ mod tests {
}

#[test]
fn ignore_space_escape() {
assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD)));
assert_eq!(p(r"(?x)\
D"), Expr::Class(class(PERLD).negate()));
fn ignore_space_escape_octal() {
assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![
lit('\n'),
lit('3'),
]));
}

#[test]
fn ignore_space_escape_hex() {
assert_eq!(p(r"(?x)\x { 53 }"), lit('S'));
assert_eq!(p(r"(?x)\x # comment
{ # comment
53 # comment
} # comment"), lit('S'));
}

#[test]
fn ignore_space_escape_hex2() {
assert_eq!(p(r"(?x)\x 53"), lit('S'));
assert_eq!(p(r"(?x)\x # comment
53 # comment"), lit('S'));
}

#[test]
fn ignore_space_escape_unicode_name() {
assert_eq!(p(r"(?x)\p # comment
{ # comment
Yi # comment
} # comment"), Expr::Class(class(YI)));
}

#[test]
fn ignore_space_repeat_counted() {
assert_eq!(p("(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
}"), Expr::Repeat {
e: b(lit('a')),
r: Repeater::Range { min: 5, max: Some(10) },
greedy: true,
});
}

#[test]
Expand Down Expand Up @@ -2424,6 +2489,14 @@ mod tests {
]));
}

#[test]
fn ignore_space_escape_space() {
assert_eq!(p(r"(?x)a\ # hi there"), Expr::Concat(vec![
lit('a'),
lit(' '),
]));
}

// Test every single possible error case.

macro_rules! test_err {
Expand Down Expand Up @@ -2815,4 +2888,19 @@ mod tests {
test_err!("(?P<a>.)(?P<a>.)", 14,
ErrorKind::DuplicateCaptureName("a".into()));
}

#[test]
fn error_ignore_space_escape_hex() {
test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex);
}

#[test]
fn error_ignore_space_escape_hex2() {
test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into()));
}

#[test]
fn error_ignore_space_escape_unicode_name() {
test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName);
}
}

0 comments on commit c15170a

Please sign in to comment.