From 1b34b414541103aef34c8e8e4cc4cff6ba612565 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Tue, 19 Sep 2023 11:55:36 +0530 Subject: [PATCH] Update `Indexer` to use new f-string tokens (#7325) ## Summary This PR updates the `Indexer` to use the new f-string tokens to compute the `f_string_ranges` for f-strings. It adds a new abstraction which exposes two methods to support extracting the range for the surrounding innermost and outermost f-string. It uses the builder pattern to build the f-string ranges which is similar to how the comment ranges are built. ## Test Plan Add new test cases for f-strings for: * Tab indentation rule * Line continuation detection in the indexer * To get the innermost / outermost f-string range * All detected f-string ranges fixes: #7290 --- .../test/fixtures/pycodestyle/W19.py | 8 + crates/ruff_linter/src/checkers/ast/mod.rs | 2 +- ...ules__pycodestyle__tests__W191_W19.py.snap | 16 ++ .../ruff_python_index/src/fstring_ranges.rs | 84 ++++++ crates/ruff_python_index/src/indexer.rs | 242 +++++++++++++++--- crates/ruff_python_index/src/lib.rs | 1 + 6 files changed, 317 insertions(+), 36 deletions(-) create mode 100644 crates/ruff_python_index/src/fstring_ranges.rs diff --git a/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py index 2fdcb6f65ee57..6bb8fb430c9d1 100644 --- a/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py +++ b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py @@ -152,3 +152,11 @@ def test_keys(self): multiline string with tab in it, different lines ''' " single line string with tab in it" + +f"test{ + tab_indented_should_be_flagged +} <- this tab is fine" + +f"""test{ + tab_indented_should_be_flagged +} <- this tab is fine""" diff --git a/crates/ruff_linter/src/checkers/ast/mod.rs b/crates/ruff_linter/src/checkers/ast/mod.rs index 27a8a2a7d31b0..365cb6d16a4f7 100644 --- a/crates/ruff_linter/src/checkers/ast/mod.rs +++ b/crates/ruff_linter/src/checkers/ast/mod.rs @@ -183,7 +183,7 @@ impl<'a> Checker<'a> { // Find the quote character used to start the containing f-string. let expr = self.semantic.current_expression()?; - let string_range = self.indexer.f_string_range(expr.start())?; + let string_range = self.indexer.fstring_ranges().innermost(expr.start())?; let trailing_quote = trailing_quote(self.locator.slice(string_range))?; // Invert the quote character, if it's a single quote. diff --git a/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap index ff6c28bc72a70..eb8c4e1643b5d 100644 --- a/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap +++ b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap @@ -349,4 +349,20 @@ W19.py:146:1: W191 Indentation contains tabs 148 | #: W191 - okay | +W19.py:157:1: W191 Indentation contains tabs + | +156 | f"test{ +157 | tab_indented_should_be_flagged + | ^^^^ W191 +158 | } <- this tab is fine" + | + +W19.py:161:1: W191 Indentation contains tabs + | +160 | f"""test{ +161 | tab_indented_should_be_flagged + | ^^^^ W191 +162 | } <- this tab is fine""" + | + diff --git a/crates/ruff_python_index/src/fstring_ranges.rs b/crates/ruff_python_index/src/fstring_ranges.rs new file mode 100644 index 0000000000000..d2d13802a5cf4 --- /dev/null +++ b/crates/ruff_python_index/src/fstring_ranges.rs @@ -0,0 +1,84 @@ +use std::collections::BTreeMap; + +use ruff_python_parser::Tok; +use ruff_text_size::{TextRange, TextSize}; + +/// Stores the ranges of all f-strings in a file sorted by [`TextRange::start`]. +/// There can be multiple overlapping ranges for nested f-strings. +#[derive(Debug)] +pub struct FStringRanges { + raw: BTreeMap, +} + +impl FStringRanges { + /// Return the [`TextRange`] of the innermost f-string at the given offset. + pub fn innermost(&self, offset: TextSize) -> Option { + self.raw + .range(..=offset) + .rev() + .find(|(_, range)| range.contains(offset)) + .map(|(_, range)| *range) + } + + /// Return the [`TextRange`] of the outermost f-string at the given offset. + pub fn outermost(&self, offset: TextSize) -> Option { + // Explanation of the algorithm: + // + // ```python + // # v + // f"normal" f"another" f"first {f"second {f"third"} second"} first" + // # ^^(1)^^^ + // # ^^^^^^^^^^^^(2)^^^^^^^^^^^^ + // # ^^^^^^^^^^^^^^^^^^^^^(3)^^^^^^^^^^^^^^^^^^^^ + // # ^^^(4)^^^^ + // # ^^^(5)^^^ + // ``` + // + // The offset is marked with a `v` and the ranges are numbered in the order + // they are yielded by the iterator in the reverse order. The algorithm + // works as follows: + // 1. Skip all ranges that don't contain the offset (1). + // 2. Take all ranges that contain the offset (2, 3). + // 3. Stop taking ranges when the offset is no longer contained. + // 4. Take the last range that contained the offset (3, the outermost). + self.raw + .range(..=offset) + .rev() + .skip_while(|(_, range)| !range.contains(offset)) + .take_while(|(_, range)| range.contains(offset)) + .last() + .map(|(_, range)| *range) + } + + #[cfg(test)] + pub(crate) fn ranges(&self) -> impl Iterator + '_ { + self.raw.values().copied() + } +} + +#[derive(Default)] +pub(crate) struct FStringRangesBuilder { + start_locations: Vec, + raw: BTreeMap, +} + +impl FStringRangesBuilder { + pub(crate) fn visit_token(&mut self, token: &Tok, range: TextRange) { + match token { + Tok::FStringStart => { + self.start_locations.push(range.start()); + } + Tok::FStringEnd => { + if let Some(start) = self.start_locations.pop() { + self.raw.insert(start, TextRange::new(start, range.end())); + } + } + _ => {} + } + } + + pub(crate) fn finish(self) -> FStringRanges { + debug_assert!(self.start_locations.is_empty()); + FStringRanges { raw: self.raw } + } +} diff --git a/crates/ruff_python_index/src/indexer.rs b/crates/ruff_python_index/src/indexer.rs index 00add310b65b2..a48eab6cb8c2d 100644 --- a/crates/ruff_python_index/src/indexer.rs +++ b/crates/ruff_python_index/src/indexer.rs @@ -1,25 +1,26 @@ //! Struct used to index source code, to enable efficient lookup of tokens that //! are omitted from the AST (e.g., commented lines). -use crate::CommentRangesBuilder; use ruff_python_ast::Stmt; use ruff_python_parser::lexer::LexResult; -use ruff_python_parser::{StringKind, Tok}; +use ruff_python_parser::Tok; use ruff_python_trivia::{ has_leading_content, has_trailing_content, is_python_whitespace, CommentRanges, }; use ruff_source_file::Locator; use ruff_text_size::{Ranged, TextRange, TextSize}; +use crate::fstring_ranges::{FStringRanges, FStringRangesBuilder}; +use crate::CommentRangesBuilder; + pub struct Indexer { comment_ranges: CommentRanges, /// Stores the start offset of continuation lines. continuation_lines: Vec, - /// The range of all f-string in the source document. The ranges are sorted by their - /// [`TextRange::start`] position in increasing order. No two ranges are overlapping. - f_string_ranges: Vec, + /// The range of all f-string in the source document. + fstring_ranges: FStringRanges, } impl Indexer { @@ -27,8 +28,8 @@ impl Indexer { assert!(TextSize::try_from(locator.contents().len()).is_ok()); let mut comment_ranges_builder = CommentRangesBuilder::default(); + let mut fstring_ranges_builder = FStringRangesBuilder::default(); let mut continuation_lines = Vec::new(); - let mut f_string_ranges = Vec::new(); // Token, end let mut prev_end = TextSize::default(); let mut prev_token: Option<&Tok> = None; @@ -59,18 +60,10 @@ impl Indexer { } comment_ranges_builder.visit_token(tok, *range); + fstring_ranges_builder.visit_token(tok, *range); - match tok { - Tok::Newline | Tok::NonLogicalNewline => { - line_start = range.end(); - } - Tok::String { - kind: StringKind::FString | StringKind::RawFString, - .. - } => { - f_string_ranges.push(*range); - } - _ => {} + if matches!(tok, Tok::Newline | Tok::NonLogicalNewline) { + line_start = range.end(); } prev_token = Some(tok); @@ -79,7 +72,7 @@ impl Indexer { Self { comment_ranges: comment_ranges_builder.finish(), continuation_lines, - f_string_ranges, + fstring_ranges: fstring_ranges_builder.finish(), } } @@ -88,6 +81,11 @@ impl Indexer { &self.comment_ranges } + /// Returns the byte offset ranges of f-strings. + pub const fn fstring_ranges(&self) -> &FStringRanges { + &self.fstring_ranges + } + /// Returns the line start positions of continuations (backslash). pub fn continuation_line_starts(&self) -> &[TextSize] { &self.continuation_lines @@ -99,22 +97,6 @@ impl Indexer { self.continuation_lines.binary_search(&line_start).is_ok() } - /// Return the [`TextRange`] of the f-string containing a given offset. - pub fn f_string_range(&self, offset: TextSize) -> Option { - let Ok(string_range_index) = self.f_string_ranges.binary_search_by(|range| { - if offset < range.start() { - std::cmp::Ordering::Greater - } else if range.contains(offset) { - std::cmp::Ordering::Equal - } else { - std::cmp::Ordering::Less - } - }) else { - return None; - }; - Some(self.f_string_ranges[string_range_index]) - } - /// Returns `true` if a statement or expression includes at least one comment. pub fn has_comments(&self, node: &T, locator: &Locator) -> bool where @@ -250,7 +232,7 @@ mod tests { use ruff_python_parser::lexer::LexResult; use ruff_python_parser::{lexer, Mode}; use ruff_source_file::Locator; - use ruff_text_size::TextSize; + use ruff_text_size::{TextRange, TextSize}; use crate::Indexer; @@ -333,5 +315,195 @@ import os TextSize::from(116) ] ); + + let contents = r" +f'foo { 'str1' \ + 'str2' \ + 'str3' + f'nested { 'str4' + 'str5' \ + 'str6' + }' +}' +" + .trim(); + let lxr: Vec = lexer::lex(contents, Mode::Module).collect(); + let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents)); + assert_eq!( + indexer.continuation_line_starts(), + [ + // row 1 + TextSize::new(0), + // row 2 + TextSize::new(17), + // row 5 + TextSize::new(63), + ] + ); + } + + #[test] + fn test_f_string_ranges() { + let contents = r#" +f"normal f-string" +f"start {f"inner {f"another"}"} end" +f"implicit " f"concatenation" +"# + .trim(); + let lxr: Vec = lexer::lex(contents, Mode::Module).collect(); + let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents)); + assert_eq!( + indexer.fstring_ranges().ranges().collect::>(), + &[ + TextRange::new(TextSize::from(0), TextSize::from(18)), + TextRange::new(TextSize::from(19), TextSize::from(55)), + TextRange::new(TextSize::from(28), TextSize::from(49)), + TextRange::new(TextSize::from(37), TextSize::from(47)), + TextRange::new(TextSize::from(56), TextSize::from(68)), + TextRange::new(TextSize::from(69), TextSize::from(85)), + ] + ); + } + + #[test] + fn test_triple_quoted_f_string_ranges() { + let contents = r#" +f""" +this is one +multiline f-string +""" +f''' +and this is +another +''' +f""" +this is a {f"""nested multiline +f-string"""} +""" +"# + .trim(); + let lxr: Vec = lexer::lex(contents, Mode::Module).collect(); + let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents)); + assert_eq!( + indexer.fstring_ranges().ranges().collect::>(), + &[ + TextRange::new(TextSize::from(0), TextSize::from(39)), + TextRange::new(TextSize::from(40), TextSize::from(68)), + TextRange::new(TextSize::from(69), TextSize::from(122)), + TextRange::new(TextSize::from(85), TextSize::from(117)), + ] + ); + } + + #[test] + fn test_fstring_innermost_outermost() { + let contents = r#" +f"no nested f-string" + +if True: + f"first {f"second {f"third"} second"} first" + foo = "normal string" + +f"implicit " f"concatenation" + +f"first line { + foo + f"second line {bar}" +} third line" + +f"""this is a +multi-line {f"""nested +f-string"""} +the end""" +"# + .trim(); + let lxr: Vec = lexer::lex(contents, Mode::Module).collect(); + let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents)); + + // For reference, the ranges of the f-strings in the above code are as + // follows where the ones inside parentheses are nested f-strings: + // + // [0..21, (36..80, 45..72, 55..63), 108..120, 121..137, (139..198, 164..184), (200..260, 226..248)] + + for (offset, innermost_range, outermost_range) in [ + // Inside a normal f-string + ( + TextSize::new(130), + TextRange::new(TextSize::new(121), TextSize::new(137)), + TextRange::new(TextSize::new(121), TextSize::new(137)), + ), + // Left boundary + ( + TextSize::new(121), + TextRange::new(TextSize::new(121), TextSize::new(137)), + TextRange::new(TextSize::new(121), TextSize::new(137)), + ), + // Right boundary + ( + TextSize::new(136), // End offsets are exclusive + TextRange::new(TextSize::new(121), TextSize::new(137)), + TextRange::new(TextSize::new(121), TextSize::new(137)), + ), + // "first" left + ( + TextSize::new(40), + TextRange::new(TextSize::new(36), TextSize::new(80)), + TextRange::new(TextSize::new(36), TextSize::new(80)), + ), + // "second" left + ( + TextSize::new(50), + TextRange::new(TextSize::new(45), TextSize::new(72)), + TextRange::new(TextSize::new(36), TextSize::new(80)), + ), + // "third" + ( + TextSize::new(60), + TextRange::new(TextSize::new(55), TextSize::new(63)), + TextRange::new(TextSize::new(36), TextSize::new(80)), + ), + // "second" right + ( + TextSize::new(70), + TextRange::new(TextSize::new(45), TextSize::new(72)), + TextRange::new(TextSize::new(36), TextSize::new(80)), + ), + // "first" right + ( + TextSize::new(75), + TextRange::new(TextSize::new(36), TextSize::new(80)), + TextRange::new(TextSize::new(36), TextSize::new(80)), + ), + // Single-quoted f-strings spanning across multiple lines + ( + TextSize::new(160), + TextRange::new(TextSize::new(139), TextSize::new(198)), + TextRange::new(TextSize::new(139), TextSize::new(198)), + ), + ( + TextSize::new(170), + TextRange::new(TextSize::new(164), TextSize::new(184)), + TextRange::new(TextSize::new(139), TextSize::new(198)), + ), + // Multi-line f-strings + ( + TextSize::new(220), + TextRange::new(TextSize::new(200), TextSize::new(260)), + TextRange::new(TextSize::new(200), TextSize::new(260)), + ), + ( + TextSize::new(240), + TextRange::new(TextSize::new(226), TextSize::new(248)), + TextRange::new(TextSize::new(200), TextSize::new(260)), + ), + ] { + assert_eq!( + indexer.fstring_ranges().innermost(offset).unwrap(), + innermost_range + ); + assert_eq!( + indexer.fstring_ranges().outermost(offset).unwrap(), + outermost_range + ); + } } } diff --git a/crates/ruff_python_index/src/lib.rs b/crates/ruff_python_index/src/lib.rs index 2e585ca5df0e1..288009e90e80c 100644 --- a/crates/ruff_python_index/src/lib.rs +++ b/crates/ruff_python_index/src/lib.rs @@ -1,4 +1,5 @@ mod comment_ranges; +mod fstring_ranges; mod indexer; pub use comment_ranges::CommentRangesBuilder;