Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

librustc_lexer: Refactor the module #66015

Merged
merged 9 commits into from
Nov 6, 2019
10 changes: 10 additions & 0 deletions src/librustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,16 @@ impl<'a> Cursor<'a> {
self.chars().nth(n).unwrap_or(EOF_CHAR)
}

/// Peeks the next symbol from the input stream without consuming it.
///
/// Shorthand for `self.nth_char(0)`.
/// NOTE(review): `nth_char` appears to fall back to `EOF_CHAR` when the input
/// is exhausted, so a returned `EOF_CHAR` alone does not prove end of input;
/// pair it with `is_eof()` when that distinction matters — confirm against
/// the definition of `nth_char`.
pub(crate) fn first(&self) -> char {
self.nth_char(0)
}

/// Peeks the second symbol from the input stream without consuming it.
///
/// Shorthand for `self.nth_char(1)`, i.e. the symbol right after the one
/// returned by `first()`.
/// NOTE(review): `nth_char` appears to fall back to `EOF_CHAR` when fewer than
/// two symbols remain — confirm against the definition of `nth_char`.
pub(crate) fn second(&self) -> char {
self.nth_char(1)
}
petrochenkov marked this conversation as resolved.
Show resolved Hide resolved

/// Checks if there is nothing more to consume.
pub(crate) fn is_eof(&self) -> bool {
self.chars.as_str().is_empty()
Expand Down
126 changes: 69 additions & 57 deletions src/librustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ impl Cursor<'_> {
let first_char = self.bump().unwrap();
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.nth_char(0) {
'/' => match self.first() {
'/' => self.line_comment(),
'*' => self.block_comment(),
_ => Slash,
Expand All @@ -257,8 +257,8 @@ impl Cursor<'_> {
// Whitespace sequence.
c if is_whitespace(c) => self.whitespace(),

// Raw string literal or identifier.
'r' => match (self.nth_char(0), self.nth_char(1)) {
// Raw identifier, raw string literal or identifier.
'r' => match (self.first(), self.second()) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let (n_hashes, started, terminated) = self.raw_double_quoted_string();
Expand All @@ -273,7 +273,7 @@ impl Cursor<'_> {
},

// Byte literal, byte string literal, raw byte string literal or identifier.
'b' => match (self.nth_char(0), self.nth_char(1)) {
'b' => match (self.first(), self.second()) {
('\'', _) => {
self.bump();
let terminated = self.single_quoted_string();
Expand Down Expand Up @@ -366,31 +366,23 @@ impl Cursor<'_> {
}

fn line_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
debug_assert!(self.prev() == '/' && self.first() == '/');
self.bump();
loop {
match self.nth_char(0) {
'\n' => break,
EOF_CHAR if self.is_eof() => break,
_ => {
self.bump();
}
}
}
self.eat_while(|c| c != '\n');
LineComment
}

fn block_comment(&mut self) -> TokenKind {
debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
debug_assert!(self.prev() == '/' && self.first() == '*');
self.bump();
let mut depth = 1usize;
while let Some(c) = self.bump() {
match c {
'/' if self.nth_char(0) == '*' => {
'/' if self.first() == '*' => {
self.bump();
depth += 1;
}
'*' if self.nth_char(0) == '/' => {
'*' if self.first() == '/' => {
self.bump();
depth -= 1;
if depth == 0 {
Expand All @@ -409,31 +401,27 @@ impl Cursor<'_> {

fn whitespace(&mut self) -> TokenKind {
debug_assert!(is_whitespace(self.prev()));
while is_whitespace(self.nth_char(0)) {
self.bump();
}
self.eat_while(is_whitespace);
Whitespace
}

fn raw_ident(&mut self) -> TokenKind {
debug_assert!(
self.prev() == 'r'
&& self.nth_char(0) == '#'
&& is_id_start(self.nth_char(1))
&& self.first() == '#'
&& is_id_start(self.second())
);
// Eat "#" symbol.
self.bump();
self.bump();
while is_id_continue(self.nth_char(0)) {
self.bump();
}
// Eat the identifier part of RawIdent.
self.eat_identifier();
RawIdent
}

fn ident(&mut self) -> TokenKind {
debug_assert!(is_id_start(self.prev()));
while is_id_continue(self.nth_char(0)) {
self.bump();
}
// Start is already eaten, eat the rest of identifier.
self.eat_while(is_id_continue);
Ident
}

Expand All @@ -442,7 +430,7 @@ impl Cursor<'_> {
let mut base = Base::Decimal;
if first_digit == '0' {
// Attempt to parse encoding base.
let has_digits = match self.nth_char(0) {
let has_digits = match self.first() {
'b' => {
base = Base::Binary;
self.bump();
Expand Down Expand Up @@ -476,23 +464,23 @@ impl Cursor<'_> {
self.eat_decimal_digits();
};

match self.nth_char(0) {
match self.first() {
// Don't be greedy if this is actually an
// integer literal followed by field/method access or a range pattern
// (`0..2` and `12.foo()`)
'.' if self.nth_char(1) != '.'
&& !is_id_start(self.nth_char(1)) =>
'.' if self.second() != '.'
&& !is_id_start(self.second()) =>
{
// might have stuff after the ., and if it does, it needs to start
// with a number
self.bump();
let mut empty_exponent = false;
if self.nth_char(0).is_digit(10) {
if self.first().is_digit(10) {
self.eat_decimal_digits();
match self.nth_char(0) {
match self.first() {
'e' | 'E' => {
self.bump();
empty_exponent = self.float_exponent().is_err()
empty_exponent = !self.eat_float_exponent();
}
_ => (),
}
Expand All @@ -501,7 +489,7 @@ impl Cursor<'_> {
}
'e' | 'E' => {
self.bump();
let empty_exponent = self.float_exponent().is_err();
let empty_exponent = !self.eat_float_exponent();
Float { base, empty_exponent }
}
_ => Int { base, empty_int: false },
Expand Down Expand Up @@ -549,26 +537,30 @@ impl Cursor<'_> {

fn single_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '\'');
// Parse `'''` as a single char literal.
if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
// Check if it's a one-symbol literal.
if self.second() == '\'' && self.first() != '\\' {
self.bump();
self.bump();
return true;
}

// Literal has more than one symbol.

// Parse until either quotes are terminated or error is detected.
let mut first = true;
loop {
match self.nth_char(0) {
// Probably beginning of the comment, which we don't want to include
// to the error report.
'/' if !first => break,
// Newline without following '\'' means unclosed quote, stop parsing.
'\n' if self.nth_char(1) != '\'' => break,
// End of file, stop parsing.
EOF_CHAR if self.is_eof() => break,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why the order of match arms was changed here.

Copy link
Contributor Author

@popzxc popzxc Nov 3, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, I had a two-part motivation here:

  1. I ordered the match arms by how they terminate: first the arm that returns, then the exceptional cases that break, then the arms that skip characters (an escaped char and any other char).
  2. I thought it is a bit more readable to have the normal exit condition as the first match arm.

match self.first() {
// Quotes are terminated, finish parsing.
'\'' => {
self.bump();
return true;
}
// Probably beginning of the comment, which we don't want to include
// to the error report.
'/' => break,
// Newline without following '\'' means unclosed quote, stop parsing.
'\n' if self.second() != '\'' => break,
// End of file, stop parsing.
EOF_CHAR if self.is_eof() => break,
// Escaped slash is considered one character, so bump twice.
'\\' => {
self.bump();
Expand All @@ -579,8 +571,8 @@ impl Cursor<'_> {
self.bump();
}
}
first = false;
}
// String was not terminated.
false
}

Expand Down Expand Up @@ -643,7 +635,7 @@ impl Cursor<'_> {
fn eat_decimal_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.nth_char(0) {
match self.first() {
'_' => {
self.bump();
}
Expand All @@ -660,7 +652,7 @@ impl Cursor<'_> {
fn eat_hexadecimal_digits(&mut self) -> bool {
let mut has_digits = false;
loop {
match self.nth_char(0) {
match self.first() {
'_' => {
self.bump();
}
Expand All @@ -674,23 +666,43 @@ impl Cursor<'_> {
has_digits
}

fn float_exponent(&mut self) -> Result<(), ()> {
/// Eats the float exponent. Returns true if at least one digit was met,
/// and returns false otherwise.
fn eat_float_exponent(&mut self) -> bool {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All other eat_x functions have a contract that, if they return false, they don't consume anything.

This function always consumes something, and, if it returns an Err, you must report it, hence this weird owl-result (`Result<(), ()>`) instead of a bool. It definitely could use a comment though :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, do they? For example, eat_decimal_digits will consume _______ and return false.

debug_assert!(self.prev() == 'e' || self.prev() == 'E');
if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
if self.first() == '-' || self.first() == '+' {
self.bump();
}
if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
self.eat_decimal_digits()
}

// Eats the suffix if it's an identifier.
// Eats the suffix of the literal, e.g. "_u8".
fn eat_literal_suffix(&mut self) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like this method can be removed now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to keep it for readability. It's obvious why we call "eat_literal_suffix" after parsing a literal, but it would be much less obvious if we called "eat_identifier" there instead.

if !is_id_start(self.nth_char(0)) {
self.eat_identifier();
}

// Eats the identifier.
fn eat_identifier(&mut self) {
if !is_id_start(self.first()) {
return;
}
self.bump();

while is_id_continue(self.nth_char(0)) {
self.eat_while(is_id_continue);
petrochenkov marked this conversation as resolved.
Show resolved Hide resolved
}

/// Consumes symbols for as long as `predicate` accepts the upcoming one and
/// the end of input has not been reached.
///
/// The predicate is queried with the peeked symbol (`self.first()`) before the
/// end-of-input check, and each accepted symbol is consumed with `bump()`.
///
/// Returns the number of symbols consumed.
fn eat_while<F>(&mut self, mut predicate: F) -> usize
where
    F: FnMut(char) -> bool,
{
    let mut consumed = 0usize;
    loop {
        // Same evaluation order as a short-circuiting `predicate && !is_eof`:
        // the predicate runs first, the EOF check only if it accepted.
        if !predicate(self.first()) || self.is_eof() {
            break;
        }
        consumed += 1;
        self.bump();
    }
    consumed
}
}