Approximate tokens len #9546

Merged: 1 commit, Jan 19, 2024
crates/ruff_benchmark/benches/formatter.rs (2 additions, 2 deletions)

@@ -7,7 +7,7 @@ use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
 use ruff_python_formatter::{format_module_ast, PreviewMode, PyFormatOptions};
 use ruff_python_index::CommentRangesBuilder;
 use ruff_python_parser::lexer::lex;
-use ruff_python_parser::{parse_tokens, Mode};
+use ruff_python_parser::{allocate_tokens_vec, parse_tokens, Mode};

 #[cfg(target_os = "windows")]
 #[global_allocator]
@@ -52,7 +52,7 @@ fn benchmark_formatter(criterion: &mut Criterion) {
             BenchmarkId::from_parameter(case.name()),
             &case,
             |b, case| {
-                let mut tokens = Vec::new();
+                let mut tokens = allocate_tokens_vec(case.code());
                 let mut comment_ranges = CommentRangesBuilder::default();

                 for result in lex(case.code(), Mode::Module) {
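Note: the swap from `Vec::new()` to `allocate_tokens_vec(case.code())` above matters because a default `Vec` reallocates and copies its contents every time it outgrows its capacity. A minimal standalone sketch (not part of the PR) that makes the effect visible by counting capacity changes:

// Standalone sketch: observe how often a Vec reallocates while growing.
// An `initial` of 0 behaves like Vec::new(); a right-sized one never grows.
fn count_reallocations(n: usize, initial: usize) -> usize {
    let mut v: Vec<u64> = Vec::with_capacity(initial);
    let mut reallocations = 0;
    let mut last_capacity = v.capacity();
    for i in 0..n {
        v.push(i as u64);
        if v.capacity() != last_capacity {
            reallocations += 1;
            last_capacity = v.capacity();
        }
    }
    reallocations
}

fn main() {
    // The default Vec grows repeatedly; the pre-sized one reports 0.
    println!("Vec::new():              {}", count_reallocations(10_000, 0));
    println!("Vec::with_capacity(10k): {}", count_reallocations(10_000, 10_000));
}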
crates/ruff_python_index/src/comment_ranges.rs (2 additions, 2 deletions)

@@ -2,7 +2,7 @@ use std::fmt::Debug;

 use ruff_python_ast::PySourceType;
 use ruff_python_parser::lexer::{lex, LexResult, LexicalError};
-use ruff_python_parser::{AsMode, Tok};
+use ruff_python_parser::{allocate_tokens_vec, AsMode, Tok};
 use ruff_python_trivia::CommentRanges;
 use ruff_text_size::TextRange;

@@ -28,7 +28,7 @@ pub fn tokens_and_ranges(
     source: &str,
     source_type: PySourceType,
 ) -> Result<(Vec<LexResult>, CommentRanges), LexicalError> {
-    let mut tokens = Vec::new();
+    let mut tokens = allocate_tokens_vec(source);
     let mut comment_ranges = CommentRangesBuilder::default();

     for result in lex(source, source_type.as_mode()) {
crates/ruff_python_parser/src/lib.rs (30 additions, 4 deletions)

@@ -78,14 +78,14 @@
 //! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
 //!
 //! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
 //!
 //! let python_source = r#"
 //! def is_odd(i):
 //!     return bool(i & 1)
 //! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
 //!
 //! assert!(ast.is_ok());
 //! ```
@@ -133,17 +133,43 @@ pub mod typing;

 /// Collect tokens up to and including the first error.
 pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
     for tok in lexer::lex(contents, mode) {
         let is_err = tok.is_err();
         tokens.push(tok);
         if is_err {
             break;
         }
     }

     tokens
 }

+/// Tokenizes all tokens.
+///
+/// It differs from [`tokenize`] in that it tokenizes all tokens and doesn't stop
+/// after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
+    tokens
+}
+
+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
 /// Parse a full Python program from its tokens.
 pub fn parse_program_tokens(
     tokens: Vec<LexResult>,
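Note: the heuristic at the heart of this PR is the last function above. It assumes roughly 15 tokens per 100 bytes of source as a lower bound, and uses `saturating_mul` so the intermediate product cannot overflow on absurdly large inputs. A standalone sketch of the same arithmetic (the sample source is illustrative, not from the PR):

// Mirrors the heuristic introduced in this PR: ~15 tokens per 100 bytes.
fn approximate_tokens_lower_bound(contents: &str) -> usize {
    contents.len().saturating_mul(15) / 100
}

fn main() {
    let source = "def is_odd(i):\n    return bool(i & 1)\n";
    // 38 bytes of source -> an initial capacity of 5 token slots.
    println!(
        "{} bytes -> capacity {}",
        source.len(),
        approximate_tokens_lower_bound(source)
    );

    // saturating_mul clamps instead of wrapping, so even a pathological
    // length near usize::MAX yields a well-defined (huge) capacity request.
    assert_eq!(usize::MAX.saturating_mul(15), usize::MAX);
}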
crates/ruff_python_parser/src/parser.rs (2 additions, 3 deletions)

@@ -31,7 +31,7 @@ use crate::{
     lexer::{self, LexicalError, LexicalErrorType},
     python,
     token::Tok,
-    Mode,
+    tokenize_all, Mode,
 };

 /// Parse a full Python program usually consisting of multiple lines.
@@ -55,8 +55,7 @@ use crate::{
 /// assert!(program.is_ok());
 /// ```
 pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
+    match parse_tokens(tokenize_all(source, Mode::Module), source, Mode::Module)? {
         Mod::Module(m) => Ok(m),
         Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
     }
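Note: `parse_program`'s behavior is unchanged; it now simply lexes through `tokenize_all` so the token vector starts pre-sized. A hedged usage sketch, using only the API names visible in the diff above:

use ruff_python_parser::parse_program;

fn main() {
    let source = "def is_odd(i):\n    return bool(i & 1)\n";
    // Lexes via tokenize_all internally, then parses the full module.
    let program = parse_program(source);
    assert!(program.is_ok());
}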
crates/ruff_wasm/src/lib.rs (2 additions, 2 deletions)

@@ -17,7 +17,7 @@ use ruff_python_codegen::Stylist;
 use ruff_python_formatter::{format_module_ast, pretty_comments, PyFormatContext, QuoteStyle};
 use ruff_python_index::{CommentRangesBuilder, Indexer};
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{parse_tokens, AsMode, Mode};
+use ruff_python_parser::{parse_tokens, tokenize_all, AsMode, Mode};
 use ruff_python_trivia::CommentRanges;
 use ruff_source_file::{Locator, SourceLocation};
 use ruff_text_size::Ranged;
@@ -272,7 +272,7 @@ struct ParsedModule<'a> {

 impl<'a> ParsedModule<'a> {
     fn from_source(source_code: &'a str) -> Result<Self, Error> {
-        let tokens: Vec<_> = ruff_python_parser::lexer::lex(source_code, Mode::Module).collect();
+        let tokens: Vec<_> = tokenize_all(source_code, Mode::Module);
        let mut comment_ranges = CommentRangesBuilder::default();

         for (token, range) in tokens.iter().flatten() {
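Note: for anyone curious how tight the 15% estimate is in practice, a quick standalone check (assuming a dev dependency on `ruff_python_parser`; not part of the PR) can compare the estimated capacity against the actual token count:

use ruff_python_parser::{tokenize_all, Mode};

fn main() {
    let source = "x = 1\ny = [i * i for i in range(10)]\n";
    let estimate = source.len().saturating_mul(15) / 100;
    let tokens = tokenize_all(source, Mode::Module);
    // The heuristic is a lower bound, so the Vec may still grow once,
    // but it skips the early doublings that dominate small allocations.
    println!("estimated {estimate}, actual {}", tokens.len());
}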