Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(parser): support peeking over bytes #4304

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions crates/oxc_ast/src/ast_impl/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,24 @@ impl TryFrom<char> for RegExpFlags {
}
}

/// Byte-based counterpart of the `char` conversion: maps a single ASCII flag
/// byte (`g`, `i`, `m`, `s`, `u`, `y`, `d`, `v`) to its `RegExpFlags` value.
impl TryFrom<u8> for RegExpFlags {
    /// The rejected byte is returned unchanged so the caller can report it.
    type Error = u8;

    /// Returns the flag for `value`, or `Err(value)` when the byte is not one
    /// of the eight recognised flag letters.
    fn try_from(value: u8) -> Result<Self, Self::Error> {
        let flag = match value {
            b'g' => Self::G,
            b'i' => Self::I,
            b'm' => Self::M,
            b's' => Self::S,
            b'u' => Self::U,
            b'y' => Self::Y,
            b'd' => Self::D,
            b'v' => Self::V,
            // Any other byte is not a flag: return it as the error value.
            _ => return Err(value),
        };
        Ok(flag)
    }
}

impl fmt::Display for RegExpFlags {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.contains(Self::G) {
Expand Down
10 changes: 5 additions & 5 deletions crates/oxc_parser/src/lexer/byte_handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
// /
ascii_byte_handler!(SLH(lexer) {
lexer.consume_char();
match lexer.peek() {
Some('/') => {
match lexer.peek_byte() {
Some(b'/') => {
lexer.consume_char();
lexer.skip_single_line_comment()
}
Some('*') => {
Some(b'*') => {
lexer.consume_char();
lexer.skip_multi_line_comment()
}
Expand Down Expand Up @@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
} else {
Kind::Question2
}
} else if lexer.peek() == Some('.') {
} else if lexer.peek_byte() == Some(b'.') {
// parse `?.1` as `?` `.1`
if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
Kind::Question
} else {
lexer.consume_char();
Expand Down
8 changes: 4 additions & 4 deletions crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
/// Any number of characters can have already been consumed from `self.source` prior to it.
/// `self.source` should be positioned at start of Unicode character.
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_part_unicode(c) {
self.consume_char();
self.identifier_tail_after_unicode(start_pos)
Expand All @@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
// Identifier contains Unicode chars, so probably contains more.
// So just iterate over chars now, instead of bytes.
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_part(c) {
self.consume_char();
} else if c == '\\' {
Expand Down Expand Up @@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
// Consume chars until reach end of identifier or another escape
let chunk_start = self.source.position();
loop {
let maybe_char = self.peek();
let maybe_char = self.peek_char();
if maybe_char.is_some_and(is_identifier_part) {
self.consume_char();
continue;
Expand Down Expand Up @@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
fn private_identifier_not_ascii_id(&mut self) -> Kind {
let b = self.source.peek_byte().unwrap();
if !b.is_ascii() {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_start_unicode(c) {
let start_pos = self.source.position();
self.consume_char();
Expand Down
8 changes: 4 additions & 4 deletions crates/oxc_parser/src/lexer/jsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ impl<'a> Lexer<'a> {
/// `JSXFragment`
/// { `JSXChildExpressionopt` }
fn read_jsx_child(&mut self) -> Kind {
match self.peek() {
Some('<') => {
match self.peek_byte() {
Some(b'<') => {
self.consume_char();
Kind::LAngle
}
Some('{') => {
Some(b'{') => {
self.consume_char();
Kind::LCurly
}
Expand Down Expand Up @@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
// Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
// as fast as possible
cold_branch(|| {
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if c == '-' || is_identifier_part(c) {
self.consume_char();
} else {
Expand Down
6 changes: 3 additions & 3 deletions crates/oxc_parser/src/lexer/kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,11 @@ impl Kind {
)
}

pub fn matches_number_char(self, c: char) -> bool {
pub fn matches_number_char(self, c: u8) -> bool {
match self {
Decimal => c.is_ascii_digit(),
Binary => matches!(c, '0'..='1'),
Octal => matches!(c, '0'..='7'),
Binary => matches!(c, b'0'..=b'1'),
Octal => matches!(c, b'0'..=b'7'),
Hex => c.is_ascii_hexdigit(),
_ => unreachable!(),
}
Expand Down
18 changes: 15 additions & 3 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
self.source.next_char().unwrap()
}

/// Peek the next byte without advancing the position.
///
/// Thin wrapper that forwards to `self.source.peek_byte()`. It takes `&self`,
/// so the lexer itself is not mutated. Callers use it when they only need to
/// compare against ASCII bytes such as `b'/'` or `b'.'`.
///
/// NOTE(review): presumably returns `None` at end of input — confirm against
/// the `source` implementation.
#[inline]
fn peek_byte(&self) -> Option<u8> {
self.source.peek_byte()
}

/// Peek the next two bytes without advancing the position.
///
/// Thin wrapper that forwards to `self.source.peek_2_bytes()`. It takes
/// `&self`, so the lexer itself is not mutated. The bytes are returned as a
/// `[u8; 2]` array, which lets callers match both at once
/// (e.g. `Some([b'.', b'.'])`).
///
/// NOTE(review): presumably returns `None` when fewer than two bytes remain —
/// confirm against the `source` implementation.
#[inline]
fn peek_2_bytes(&self) -> Option<[u8; 2]> {
self.source.peek_2_bytes()
}

/// Peek the next char without advancing the position
#[inline]
fn peek(&self) -> Option<char> {
fn peek_char(&self) -> Option<char> {
self.source.peek_char()
}

/// Peek the next next char without advancing the position
#[inline]
fn peek2(&self) -> Option<char> {
fn peek_char2(&self) -> Option<char> {
self.source.peek_char2()
}

Expand All @@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
/// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
fn unexpected_err(&mut self) {
let offset = self.current_offset();
match self.peek() {
match self.peek_char() {
Some(c) => self.error(diagnostics::invalid_character(c, offset)),
None => self.error(diagnostics::unexpected_end(offset)),
}
Expand Down
68 changes: 32 additions & 36 deletions crates/oxc_parser/src/lexer/numeric.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@ use crate::diagnostics;
impl<'a> Lexer<'a> {
/// 12.9.3 Numeric Literals with `0` prefix
pub(super) fn read_zero(&mut self) -> Kind {
match self.peek() {
Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
Some('e' | 'E') => {
match self.peek_byte() {
Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
Some(b'e' | b'E') => {
self.consume_char();
self.read_decimal_exponent()
}
Some('.') => {
Some(b'.') => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
Some('n') => {
Some(b'n') => {
self.consume_char();
self.check_after_numeric_literal(Kind::Decimal)
}
Expand All @@ -42,23 +42,23 @@ impl<'a> Lexer<'a> {
fn read_non_decimal(&mut self, kind: Kind) -> Kind {
self.consume_char();

if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
return Kind::Undetermined;
}

while let Some(c) = self.peek() {
while let Some(c) = self.peek_byte() {
match c {
'_' => {
b'_' => {
self.consume_char();
// NOTE: it looks like invalid numeric tokens are still parsed.
// This seems to be a waste. It also requires us to put this
// call here instead of after we ensure the next character
// is a number character
self.token.set_has_separator();
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_byte().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
Expand All @@ -71,35 +71,33 @@ impl<'a> Lexer<'a> {
_ => break,
}
}
if self.peek() == Some('n') {
self.consume_char();
}
self.next_ascii_char_eq(b'n');
self.check_after_numeric_literal(kind)
}

fn read_legacy_octal(&mut self) -> Kind {
let mut kind = Kind::Octal;
loop {
match self.peek() {
Some('0'..='7') => {
match self.peek_byte() {
Some(b'0'..=b'7') => {
self.consume_char();
}
Some('8'..='9') => {
Some(b'8'..=b'9') => {
self.consume_char();
kind = Kind::Decimal;
}
_ => break,
}
}

match self.peek() {
match self.peek_byte() {
// allow 08.5 and 09.5
Some('.') if kind == Kind::Decimal => {
Some(b'.') if kind == Kind::Decimal => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
// allow 08e1 and 09e1
Some('e') if kind == Kind::Decimal => {
Some(b'e') if kind == Kind::Decimal => {
self.consume_char();
self.read_decimal_exponent()
}
Expand All @@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_exponent(&mut self) -> Kind {
let kind = match self.peek() {
Some('-') => {
let kind = match self.peek_byte() {
Some(b'-') => {
self.consume_char();
Kind::NegativeExponential
}
Some('+') => {
Some(b'+') => {
self.consume_char();
Kind::PositiveExponential
}
Expand All @@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
Expand All @@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_digits_after_first_digit(&mut self) {
while let Some(c) = self.peek() {
match c {
'_' => {
while let Some(b) = self.peek_byte() {
match b {
b'_' => {
self.consume_char();
// NOTE: it looks like invalid numeric tokens are still parsed.
// This seems to be a waste. It also requires us to put this
// call here instead of after we ensure the next character
// is an ASCII digit
self.token.set_has_separator();
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
return;
}
}
'0'..='9' => {
b'0'..=b'9' => {
self.consume_char();
}
_ => break,
Expand All @@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
}

fn optional_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
return;
self.read_decimal_digits_after_first_digit();
}
self.read_decimal_digits_after_first_digit();
}

fn optional_exponent(&mut self) -> Option<Kind> {
if matches!(self.peek(), Some('e' | 'E')) {
if matches!(self.peek_byte(), Some(b'e' | b'E')) {
self.consume_char();
return Some(self.read_decimal_exponent());
}
Expand All @@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
let offset = self.offset();
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
let c = self.peek();
let c = self.peek_char();
if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
return kind;
}
self.consume_char();
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_start(c) {
self.consume_char();
} else {
Expand Down
6 changes: 3 additions & 3 deletions crates/oxc_parser/src/lexer/punctuation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
impl<'a> Lexer<'a> {
/// Section 12.8 Punctuators
pub(super) fn read_dot(&mut self) -> Kind {
if self.peek() == Some('.') && self.peek2() == Some('.') {
if self.peek_2_bytes() == Some([b'.', b'.']) {
self.consume_char();
self.consume_char();
return Kind::Dot3;
}
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.decimal_literal_after_decimal_point()
} else {
Kind::Dot
Expand All @@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
}
} else if self.next_ascii_char_eq(b'=') {
Some(Kind::LtEq)
} else if self.peek() == Some('!')
} else if self.peek_byte() == Some(b'!')
// SingleLineHTMLOpenComment `<!--` in script mode
&& self.source_type.is_script()
&& self.remaining().starts_with("!--")
Expand Down
8 changes: 5 additions & 3 deletions crates/oxc_parser/src/lexer/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty();

while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
self.peek_byte()
{
self.consume_char();
let Ok(flag) = RegExpFlags::try_from(ch) else {
self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
continue;
};
if flags.contains(flag) {
self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
continue;
}
flags |= flag;
Expand Down
Loading