From 47ad7b4500d37a67c48edc22dd9f5ed8b2a09e6c Mon Sep 17 00:00:00 2001
From: Micha Reiser
Date: Fri, 19 Jan 2024 17:39:37 +0100
Subject: [PATCH] Approximate tokens len (#9546)

---
 crates/ruff_benchmark/benches/formatter.rs    |  4 +--
 .../ruff_python_index/src/comment_ranges.rs   |  4 +--
 crates/ruff_python_parser/src/lib.rs          | 34 ++++++++++++++++---
 crates/ruff_python_parser/src/parser.rs       |  5 ++-
 crates/ruff_wasm/src/lib.rs                   |  4 +--
 5 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/crates/ruff_benchmark/benches/formatter.rs b/crates/ruff_benchmark/benches/formatter.rs
index 7d415c2bde031..98c3a97f2c956 100644
--- a/crates/ruff_benchmark/benches/formatter.rs
+++ b/crates/ruff_benchmark/benches/formatter.rs
@@ -7,7 +7,7 @@ use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
 use ruff_python_formatter::{format_module_ast, PreviewMode, PyFormatOptions};
 use ruff_python_index::CommentRangesBuilder;
 use ruff_python_parser::lexer::lex;
-use ruff_python_parser::{parse_tokens, Mode};
+use ruff_python_parser::{allocate_tokens_vec, parse_tokens, Mode};
 
 #[cfg(target_os = "windows")]
 #[global_allocator]
@@ -52,7 +52,7 @@ fn benchmark_formatter(criterion: &mut Criterion) {
             BenchmarkId::from_parameter(case.name()),
             &case,
             |b, case| {
-                let mut tokens = Vec::new();
+                let mut tokens = allocate_tokens_vec(case.code());
                 let mut comment_ranges = CommentRangesBuilder::default();
 
                 for result in lex(case.code(), Mode::Module) {
diff --git a/crates/ruff_python_index/src/comment_ranges.rs b/crates/ruff_python_index/src/comment_ranges.rs
index 11e6496a38b18..e9ef4c04620bf 100644
--- a/crates/ruff_python_index/src/comment_ranges.rs
+++ b/crates/ruff_python_index/src/comment_ranges.rs
@@ -2,7 +2,7 @@ use std::fmt::Debug;
 
 use ruff_python_ast::PySourceType;
 use ruff_python_parser::lexer::{lex, LexResult, LexicalError};
-use ruff_python_parser::{AsMode, Tok};
+use ruff_python_parser::{allocate_tokens_vec, AsMode, Tok};
 use ruff_python_trivia::CommentRanges;
 use ruff_text_size::TextRange;
 
@@ -28,7 +28,7 @@ pub fn tokens_and_ranges(
     source: &str,
     source_type: PySourceType,
 ) -> Result<(Vec<LexResult>, CommentRanges), LexicalError> {
-    let mut tokens = Vec::new();
+    let mut tokens = allocate_tokens_vec(source);
     let mut comment_ranges = CommentRangesBuilder::default();
 
     for result in lex(source, source_type.as_mode()) {
diff --git a/crates/ruff_python_parser/src/lib.rs b/crates/ruff_python_parser/src/lib.rs
index 0217331fe21ff..2f95c684e87d9 100644
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -78,14 +78,14 @@
 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -133,7 +133,7 @@ pub mod typing;
 
 /// Collect tokens up to and including the first error.
 pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
         tokens.push(tok);
@@ -141,9 +141,35 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
             break;
         }
     }
+
+    tokens
+}
+
+/// Tokenizes all tokens.
+///
+/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
+/// after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
     tokens
 }
 
+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
     tokens: Vec<LexResult>,
diff --git a/crates/ruff_python_parser/src/parser.rs b/crates/ruff_python_parser/src/parser.rs
index e158dadfcff4d..c0f6c7d18d2cb 100644
--- a/crates/ruff_python_parser/src/parser.rs
+++ b/crates/ruff_python_parser/src/parser.rs
@@ -31,7 +31,7 @@ use crate::{
     lexer::{self, LexicalError, LexicalErrorType},
     python,
     token::Tok,
-    Mode,
+    tokenize_all, Mode,
 };
 
 /// Parse a full Python program usually consisting of multiple lines.
@@ -55,8 +55,7 @@ use crate::{
 /// assert!(program.is_ok());
 /// ```
 pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
+    match parse_tokens(tokenize_all(source, Mode::Module), source, Mode::Module)? {
         Mod::Module(m) => Ok(m),
         Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
     }
diff --git a/crates/ruff_wasm/src/lib.rs b/crates/ruff_wasm/src/lib.rs
index 7ebfd67ca327e..f83ed36b79b45 100644
--- a/crates/ruff_wasm/src/lib.rs
+++ b/crates/ruff_wasm/src/lib.rs
@@ -17,7 +17,7 @@ use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
 use ruff_python_index::{CommentRangesBuilder, Indexer};
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{parse_tokens, AsMode, Mode};
+use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode};
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
 use ruff_text_size::Ranged;
@@ -272,7 +272,7 @@ struct ParsedModule<'a> {
 
 impl<'a> ParsedModule<'a> {
     fn from_source(source_code: &'a str) -> Result<Self, Error> {
-        let tokens: Vec<_> = ruff_python_parser::lexer::lex(source_code, Mode::Module).collect();
+        let tokens: Vec<_> = tokenize_all(source_code, Mode::Module);
        let mut comment_ranges = CommentRangesBuilder::default();
 
         for (token, range) in tokens.iter().flatten() {
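
The core of this patch is the allocation strategy: rather than letting the tokens vector grow from empty (with repeated reallocation and copying while lexing), allocate_tokens_vec reserves capacity up front via approximate_tokens_lower_bound, which assumes roughly 0.15 tokens per byte of source, i.e. about one token per seven bytes. Because the estimate is a lower bound, token-dense files may still trigger a regrowth or two, while sparse files avoid chronic over-allocation. The following standalone sketch illustrates the heuristic outside of ruff; the Token placeholder type and the main driver are illustrative stand-ins, not part of the patch:

    /// Illustrative stand-in for ruff's `LexResult`; only the vector's
    /// capacity matters for this sketch, not the element type.
    #[derive(Debug)]
    struct Token;

    /// Lower bound on the token count: ~0.15 tokens per input byte.
    /// `saturating_mul` avoids overflow on pathologically large inputs.
    fn approximate_tokens_lower_bound(contents: &str) -> usize {
        contents.len().saturating_mul(15) / 100
    }

    /// Pre-allocate the tokens vector so that pushing tokens during
    /// lexing rarely needs to regrow the buffer.
    fn allocate_tokens_vec(contents: &str) -> Vec<Token> {
        Vec::with_capacity(approximate_tokens_lower_bound(contents))
    }

    fn main() {
        let source = "def is_odd(i):\n    return bool(i & 1)\n";
        let mut tokens = allocate_tokens_vec(source);
        tokens.push(Token); // pushes within capacity do not reallocate
        // 38 bytes of source reserve room for at least 5 tokens up front.
        println!("bytes = {}, capacity = {}", source.len(), tokens.capacity());
    }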