From 6f32d2f6831a5093942c89c122aaab413b90f48d Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Fri, 7 Apr 2017 18:58:54 +1000 Subject: [PATCH] Fix panics with whitespace in extended mode by being more strict Instead of ignoring space in all the bump/peek methods (as proposed in pull request #349), have an explicit `ignore_space` method that can be used in places where space/comments should be allowed. This makes parsing a bit stricter than before as well. --- regex-syntax/src/parser.rs | 174 ++++++++++++++++++++++++++++--------- 1 file changed, 131 insertions(+), 43 deletions(-) diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index c2aca269bc..c0c131ea2e 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -104,7 +104,11 @@ impl Parser { // Starts at the beginning of the input and consumes until either the end // of input or an error. fn parse_expr(mut self) -> Result { - while !self.eof() { + loop { + self.ignore_space(); + if self.eof() { + break; + } let build_expr = match self.cur() { '\\' => try!(self.parse_escape()), '|' => { let e = try!(self.alternate()); self.bump(); e } @@ -177,7 +181,7 @@ impl Parser { return Err(self.err(ErrorKind::UnexpectedEscapeEof)); } let c = self.cur(); - if is_punct(c) { + if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) { let c = self.bump(); return Ok(try!(self.lit(c))); } @@ -234,6 +238,7 @@ impl Parser { let chari = self.chari; let mut name: CaptureName = None; self.bump(); + self.ignore_space(); if self.bump_if("?P<") { let n = try!(self.parse_group_name()); if self.names.iter().any(|n2| n2 == &n) { @@ -370,13 +375,16 @@ impl Parser { return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e))); } self.bump(); - let min = try!(self.parse_decimal(|c| c != ',' && c != '}')); + self.ignore_space(); + let min = try!(self.parse_decimal()); let mut max_opt = Some(min); + self.ignore_space(); if self.bump_if(',') { + self.ignore_space(); if self.peek_is('}') { max_opt = None; } else { - let max = try!(self.parse_decimal(|c| c != '}')); + let max = try!(self.parse_decimal()); if min > max { // e.g., a{2,1} return Err(self.err(ErrorKind::InvalidRepeatRange { @@ -387,6 +395,7 @@ impl Parser { max_opt = Some(max); } } + self.ignore_space(); if !self.bump_if('}') { Err(self.err(ErrorKind::UnclosedRepeat)) } else { @@ -423,8 +432,8 @@ impl Parser { // // Start: `1` // End: `,` (where `until == ','`) - fn parse_decimal(&mut self, until: B) -> Result { - match self.bump_get(until) { + fn parse_decimal(&mut self) -> Result { + match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) { // e.g., a{} None => Err(self.err(ErrorKind::MissingBase10)), Some(n) => { @@ -472,6 +481,7 @@ impl Parser { // Start: `{` // End: `b` fn parse_hex(&mut self) -> Result { + self.ignore_space(); if self.bump_if('{') { self.parse_hex_many_digits() } else { @@ -486,9 +496,11 @@ impl Parser { fn parse_hex_many_digits(&mut self) -> Result { use std::char; - let s = self.bump_get(|c| c != '}').unwrap_or("".into()); + self.ignore_space(); + let s = self.bump_get(is_ascii_word).unwrap_or("".into()); let n = try!(u32::from_str_radix(&s, 16) .map_err(|_| self.err(ErrorKind::InvalidBase16(s)))); + self.ignore_space(); if !self.bump_if('}') { // e.g., a\x{d return Err(self.err(ErrorKind::UnclosedHex)); @@ -530,12 +542,16 @@ impl Parser { // End: `+` fn parse_class(&mut self) -> Result { self.bump(); + self.ignore_space(); let negated = self.bump_if('^'); + self.ignore_space(); let mut class = CharClass::empty(); while self.bump_if('-') { + self.ignore_space(); class.ranges.push(ClassRange::one('-')); } loop { + self.ignore_space(); if self.eof() { // e.g., [a return Err(self.err(ErrorKind::UnexpectedClassEof)); @@ -631,11 +647,13 @@ impl Parser { // End: `]` fn parse_class_range(&mut self, class: &mut CharClass, start: char) -> Result<()> { + self.ignore_space(); if !self.bump_if('-') { // Not a range, so just push a singleton range. class.ranges.push(ClassRange::one(start)); return Ok(()); } + self.ignore_space(); if self.eof() { // e.g., [a- return Err(self.err(ErrorKind::UnexpectedClassEof)); @@ -730,9 +748,12 @@ impl Parser { // // `negate` is true when the class name is used with `\P`. fn parse_unicode_class(&mut self, neg: bool) -> Result { + self.ignore_space(); let name = if self.bump_if('{') { - let n = self.bump_get(|c| c != '}').unwrap_or("".into()); + self.ignore_space(); + let n = self.bump_get(is_ascii_word).unwrap_or("".into()); + self.ignore_space(); if n.is_empty() || !self.bump_if('}') { // e.g., \p{Greek return Err(self.err(ErrorKind::UnclosedUnicodeName)); @@ -796,7 +817,31 @@ impl Parser { // Auxiliary helper methods. impl Parser { fn chars(&self) -> Chars { - Chars::new(&self.chars[self.chari..], self.flags.ignore_space) + Chars::new(&self.chars[self.chari..]) + } + + fn ignore_space(&mut self) { + if !self.flags.ignore_space { + return; + } + while !self.eof() { + match self.cur() { + '#' => { + self.bump(); + while !self.eof() { + match self.bump() { + '\n' => break, + _ => continue, + } + } + }, + c => if !c.is_whitespace() { + return; + } else { + self.bump(); + } + } + } } fn bump(&mut self) -> char { @@ -924,48 +969,22 @@ impl Parser { struct Chars<'a> { chars: &'a [char], cur: usize, - ignore_space: bool, } impl<'a> Iterator for Chars<'a> { type Item = char; fn next(&mut self) -> Option { - if !self.ignore_space { - let x = self.c(); - self.advance(); - return x; - } - while let Some(c) = self.c() { - self.advance(); - match c { - '\\' => return match self.c() { - Some('#') => {self.advance(); Some('#')} - _ => Some('\\') - }, - '#' => loop { - match self.c() { - Some(c) => { - self.advance(); - if c == '\n' { - break; - } - }, - None => return None - } - }, - _ => if !c.is_whitespace() {return Some(c);} - } - } - None + let x = self.c(); + self.advance(); + return x; } } impl<'a> Chars<'a> { - fn new(chars: &[char], ignore_space: bool) -> Chars { + fn new(chars: &[char]) -> Chars { Chars { chars: chars, cur: 0, - ignore_space: ignore_space, } } @@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool { || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') } +fn is_ascii_word(c: char) -> bool { + match c { + 'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true, + _ => false, + } +} + /// Returns true if the give character has significance in a regex. pub fn is_punct(c: char) -> bool { match c { @@ -2368,10 +2394,49 @@ mod tests { } #[test] - fn ignore_space_escape() { - assert_eq!(p(r"(?x)\ d"), Expr::Class(class(PERLD))); - assert_eq!(p(r"(?x)\ - D"), Expr::Class(class(PERLD).negate())); + fn ignore_space_escape_octal() { + assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![ + lit('\n'), + lit('3'), + ])); + } + + #[test] + fn ignore_space_escape_hex() { + assert_eq!(p(r"(?x)\x { 53 }"), lit('S')); + assert_eq!(p(r"(?x)\x # comment +{ # comment + 53 # comment +} # comment"), lit('S')); + } + + #[test] + fn ignore_space_escape_hex2() { + assert_eq!(p(r"(?x)\x 53"), lit('S')); + assert_eq!(p(r"(?x)\x # comment + 53 # comment"), lit('S')); + } + + #[test] + fn ignore_space_escape_unicode_name() { + assert_eq!(p(r"(?x)\p # comment +{ # comment + Yi # comment +} # comment"), Expr::Class(class(YI))); + } + + #[test] + fn ignore_space_repeat_counted() { + assert_eq!(p("(?x)a # comment +{ # comment + 5 # comment + , # comment + 10 # comment +}"), Expr::Repeat { + e: b(lit('a')), + r: Repeater::Range { min: 5, max: Some(10) }, + greedy: true, + }); } #[test] @@ -2424,6 +2489,14 @@ mod tests { ])); } + #[test] + fn ignore_space_escape_space() { + assert_eq!(p(r"(?x)a\ # hi there"), Expr::Concat(vec![ + lit('a'), + lit(' '), + ])); + } + // Test every single possible error case. macro_rules! test_err { @@ -2815,4 +2888,19 @@ mod tests { test_err!("(?P.)(?P.)", 14, ErrorKind::DuplicateCaptureName("a".into())); } + + #[test] + fn error_ignore_space_escape_hex() { + test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex); + } + + #[test] + fn error_ignore_space_escape_hex2() { + test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into())); + } + + #[test] + fn error_ignore_space_escape_unicode_name() { + test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName); + } }