From c9c38a187cbe4d09dc0a06f5f333feab28e67024 Mon Sep 17 00:00:00 2001 From: lucab <98086+lucab@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:53:13 +0000 Subject: [PATCH] perf(parser): support peeking over bytes (#4304) Closes https://github.com/oxc-project/oxc/issues/3291 --- crates/oxc_ast/src/ast_impl/literal.rs | 18 ++++++ crates/oxc_parser/src/lexer/byte_handlers.rs | 10 +-- crates/oxc_parser/src/lexer/identifier.rs | 8 +-- crates/oxc_parser/src/lexer/jsx.rs | 8 +-- crates/oxc_parser/src/lexer/kind.rs | 6 +- crates/oxc_parser/src/lexer/mod.rs | 18 +++++- crates/oxc_parser/src/lexer/numeric.rs | 68 +++++++++----------- crates/oxc_parser/src/lexer/punctuation.rs | 6 +- crates/oxc_parser/src/lexer/regex.rs | 8 ++- crates/oxc_parser/src/lexer/source.rs | 13 ++++ crates/oxc_parser/src/lexer/unicode.rs | 29 ++++----- 11 files changed, 116 insertions(+), 76 deletions(-) diff --git a/crates/oxc_ast/src/ast_impl/literal.rs b/crates/oxc_ast/src/ast_impl/literal.rs index 108b9db81b5a5..aa474075f73fc 100644 --- a/crates/oxc_ast/src/ast_impl/literal.rs +++ b/crates/oxc_ast/src/ast_impl/literal.rs @@ -108,6 +108,24 @@ impl TryFrom for RegExpFlags { } } +impl TryFrom for RegExpFlags { + type Error = u8; + + fn try_from(value: u8) -> Result { + match value { + b'g' => Ok(Self::G), + b'i' => Ok(Self::I), + b'm' => Ok(Self::M), + b's' => Ok(Self::S), + b'u' => Ok(Self::U), + b'y' => Ok(Self::Y), + b'd' => Ok(Self::D), + b'v' => Ok(Self::V), + _ => Err(value), + } + } +} + impl fmt::Display for RegExpFlags { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.contains(Self::G) { diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index 4e2439131ed5a..182cbb5a759c5 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) { // / ascii_byte_handler!(SLH(lexer) { lexer.consume_char(); - match lexer.peek() { - Some('/') => { + match lexer.peek_byte() { + Some(b'/') => { lexer.consume_char(); lexer.skip_single_line_comment() } - Some('*') => { + Some(b'*') => { lexer.consume_char(); lexer.skip_multi_line_comment() } @@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) { } else { Kind::Question2 } - } else if lexer.peek() == Some('.') { + } else if lexer.peek_byte() == Some(b'.') { // parse `?.1` as `?` `.1` - if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) { + if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) { Kind::Question } else { lexer.consume_char(); diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index 4e21279354758..6b464e68b09c6 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -98,7 +98,7 @@ impl<'a> Lexer<'a> { /// Any number of characters can have already been consumed from `self.source` prior to it. /// `self.source` should be positioned at start of Unicode character. fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str { - let c = self.peek().unwrap(); + let c = self.peek_char().unwrap(); if is_identifier_part_unicode(c) { self.consume_char(); self.identifier_tail_after_unicode(start_pos) @@ -115,7 +115,7 @@ impl<'a> Lexer<'a> { pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str { // Identifier contains a Unicode chars, so probably contains more. // So just iterate over chars now, instead of bytes. - while let Some(c) = self.peek() { + while let Some(c) = self.peek_char() { if is_identifier_part(c) { self.consume_char(); } else if c == '\\' { @@ -177,7 +177,7 @@ impl<'a> Lexer<'a> { // Consume chars until reach end of identifier or another escape let chunk_start = self.source.position(); loop { - let maybe_char = self.peek(); + let maybe_char = self.peek_char(); if maybe_char.is_some_and(is_identifier_part) { self.consume_char(); continue; @@ -272,7 +272,7 @@ impl<'a> Lexer<'a> { fn private_identifier_not_ascii_id(&mut self) -> Kind { let b = self.source.peek_byte().unwrap(); if !b.is_ascii() { - let c = self.peek().unwrap(); + let c = self.peek_char().unwrap(); if is_identifier_start_unicode(c) { let start_pos = self.source.position(); self.consume_char(); diff --git a/crates/oxc_parser/src/lexer/jsx.rs b/crates/oxc_parser/src/lexer/jsx.rs index bcc796f72af03..dc97462a58cf9 100644 --- a/crates/oxc_parser/src/lexer/jsx.rs +++ b/crates/oxc_parser/src/lexer/jsx.rs @@ -61,12 +61,12 @@ impl<'a> Lexer<'a> { /// `JSXFragment` /// { `JSXChildExpressionopt` } fn read_jsx_child(&mut self) -> Kind { - match self.peek() { - Some('<') => { + match self.peek_byte() { + Some(b'<') => { self.consume_char(); Kind::LAngle } - Some('{') => { + Some(b'{') => { self.consume_char(); Kind::LCurly } @@ -122,7 +122,7 @@ impl<'a> Lexer<'a> { // Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII // as fast as possible cold_branch(|| { - while let Some(c) = self.peek() { + while let Some(c) = self.peek_char() { if c == '-' || is_identifier_part(c) { self.consume_char(); } else { diff --git a/crates/oxc_parser/src/lexer/kind.rs b/crates/oxc_parser/src/lexer/kind.rs index 5d94a42558296..d30069afb339b 100644 --- a/crates/oxc_parser/src/lexer/kind.rs +++ b/crates/oxc_parser/src/lexer/kind.rs @@ -206,11 +206,11 @@ impl Kind { ) } - pub fn matches_number_char(self, c: char) -> bool { + pub fn matches_number_char(self, c: u8) -> bool { match self { Decimal => c.is_ascii_digit(), - Binary => matches!(c, '0'..='1'), - Octal => matches!(c, '0'..='7'), + Binary => matches!(c, b'0'..=b'1'), + Octal => matches!(c, b'0'..=b'7'), Hex => c.is_ascii_hexdigit(), _ => unreachable!(), } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index cebd47f110f3a..f242f24a96349 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -251,15 +251,27 @@ impl<'a> Lexer<'a> { self.source.next_char().unwrap() } + /// Peek the next byte without advancing the position + #[inline] + fn peek_byte(&self) -> Option { + self.source.peek_byte() + } + + /// Peek the next two bytes without advancing the position + #[inline] + fn peek_2_bytes(&self) -> Option<[u8; 2]> { + self.source.peek_2_bytes() + } + /// Peek the next char without advancing the position #[inline] - fn peek(&self) -> Option { + fn peek_char(&self) -> Option { self.source.peek_char() } /// Peek the next next char without advancing the position #[inline] - fn peek2(&self) -> Option { + fn peek_char2(&self) -> Option { self.source.peek_char2() } @@ -284,7 +296,7 @@ impl<'a> Lexer<'a> { /// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF fn unexpected_err(&mut self) { let offset = self.current_offset(); - match self.peek() { + match self.peek_char() { Some(c) => self.error(diagnostics::invalid_character(c, offset)), None => self.error(diagnostics::unexpected_end(offset)), } diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs index 7e7ab75fe690d..27b902818824a 100644 --- a/crates/oxc_parser/src/lexer/numeric.rs +++ b/crates/oxc_parser/src/lexer/numeric.rs @@ -6,19 +6,19 @@ use crate::diagnostics; impl<'a> Lexer<'a> { /// 12.9.3 Numeric Literals with `0` prefix pub(super) fn read_zero(&mut self) -> Kind { - match self.peek() { - Some('b' | 'B') => self.read_non_decimal(Kind::Binary), - Some('o' | 'O') => self.read_non_decimal(Kind::Octal), - Some('x' | 'X') => self.read_non_decimal(Kind::Hex), - Some('e' | 'E') => { + match self.peek_byte() { + Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary), + Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal), + Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex), + Some(b'e' | b'E') => { self.consume_char(); self.read_decimal_exponent() } - Some('.') => { + Some(b'.') => { self.consume_char(); self.decimal_literal_after_decimal_point_after_digits() } - Some('n') => { + Some(b'n') => { self.consume_char(); self.check_after_numeric_literal(Kind::Decimal) } @@ -42,23 +42,23 @@ impl<'a> Lexer<'a> { fn read_non_decimal(&mut self, kind: Kind) -> Kind { self.consume_char(); - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { + if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) { self.consume_char(); } else { self.unexpected_err(); return Kind::Undetermined; } - while let Some(c) = self.peek() { + while let Some(c) = self.peek_byte() { match c { - '_' => { + b'_' => { self.consume_char(); // NOTE: it looks invalid numeric tokens are still parsed. // This seems to be a waste. It also requires us to put this // call here instead of after we ensure the next character // is a number character self.token.set_has_separator(); - if self.peek().is_some_and(|c| kind.matches_number_char(c)) { + if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) { self.consume_char(); } else { self.unexpected_err(); @@ -71,20 +71,18 @@ impl<'a> Lexer<'a> { _ => break, } } - if self.peek() == Some('n') { - self.consume_char(); - } + self.next_ascii_char_eq(b'n'); self.check_after_numeric_literal(kind) } fn read_legacy_octal(&mut self) -> Kind { let mut kind = Kind::Octal; loop { - match self.peek() { - Some('0'..='7') => { + match self.peek_byte() { + Some(b'0'..=b'7') => { self.consume_char(); } - Some('8'..='9') => { + Some(b'8'..=b'9') => { self.consume_char(); kind = Kind::Decimal; } @@ -92,14 +90,14 @@ impl<'a> Lexer<'a> { } } - match self.peek() { + match self.peek_byte() { // allow 08.5 and 09.5 - Some('.') if kind == Kind::Decimal => { + Some(b'.') if kind == Kind::Decimal => { self.consume_char(); self.decimal_literal_after_decimal_point_after_digits() } // allow 08e1 and 09e1 - Some('e') if kind == Kind::Decimal => { + Some(b'e') if kind == Kind::Decimal => { self.consume_char(); self.read_decimal_exponent() } @@ -108,12 +106,12 @@ impl<'a> Lexer<'a> { } fn read_decimal_exponent(&mut self) -> Kind { - let kind = match self.peek() { - Some('-') => { + let kind = match self.peek_byte() { + Some(b'-') => { self.consume_char(); Kind::NegativeExponential } - Some('+') => { + Some(b'+') => { self.consume_char(); Kind::PositiveExponential } @@ -124,7 +122,7 @@ impl<'a> Lexer<'a> { } fn read_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { + if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) { self.consume_char(); } else { self.unexpected_err(); @@ -135,23 +133,23 @@ impl<'a> Lexer<'a> { } fn read_decimal_digits_after_first_digit(&mut self) { - while let Some(c) = self.peek() { - match c { - '_' => { + while let Some(b) = self.peek_byte() { + match b { + b'_' => { self.consume_char(); // NOTE: it looks invalid numeric tokens are still parsed. // This seems to be a waste. It also requires us to put this // call here instead of after we ensure the next character // is an ASCII digit self.token.set_has_separator(); - if self.peek().is_some_and(|c| c.is_ascii_digit()) { + if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) { self.consume_char(); } else { self.unexpected_err(); return; } } - '0'..='9' => { + b'0'..=b'9' => { self.consume_char(); } _ => break, @@ -172,16 +170,14 @@ impl<'a> Lexer<'a> { } fn optional_decimal_digits(&mut self) { - if self.peek().is_some_and(|c| c.is_ascii_digit()) { + if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) { self.consume_char(); - } else { - return; + self.read_decimal_digits_after_first_digit(); } - self.read_decimal_digits_after_first_digit(); } fn optional_exponent(&mut self) -> Option { - if matches!(self.peek(), Some('e' | 'E')) { + if matches!(self.peek_byte(), Some(b'e' | b'E')) { self.consume_char(); return Some(self.read_decimal_exponent()); } @@ -191,12 +187,12 @@ impl<'a> Lexer<'a> { fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind { let offset = self.offset(); // The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit. - let c = self.peek(); + let c = self.peek_char(); if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) { return kind; } self.consume_char(); - while let Some(c) = self.peek() { + while let Some(c) = self.peek_char() { if is_identifier_start(c) { self.consume_char(); } else { diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs index 57f96bf1af0e7..c1ab4327ed2c9 100644 --- a/crates/oxc_parser/src/lexer/punctuation.rs +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token}; impl<'a> Lexer<'a> { /// Section 12.8 Punctuators pub(super) fn read_dot(&mut self) -> Kind { - if self.peek() == Some('.') && self.peek2() == Some('.') { + if self.peek_2_bytes() == Some([b'.', b'.']) { self.consume_char(); self.consume_char(); return Kind::Dot3; } - if self.peek().is_some_and(|c| c.is_ascii_digit()) { + if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) { self.decimal_literal_after_decimal_point() } else { Kind::Dot @@ -25,7 +25,7 @@ impl<'a> Lexer<'a> { } } else if self.next_ascii_char_eq(b'=') { Some(Kind::LtEq) - } else if self.peek() == Some('!') + } else if self.peek_byte() == Some(b'!') // SingleLineHTMLOpenComment `