From 1b34b414541103aef34c8e8e4cc4cff6ba612565 Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala <dhruvmanila@gmail.com>
Date: Tue, 19 Sep 2023 11:55:36 +0530
Subject: [PATCH] Update `Indexer` to use new f-string tokens (#7325)

## Summary

This PR updates the `Indexer` to use the new f-string tokens to compute
the `f_string_ranges` for f-strings. It adds a new abstraction which
exposes two methods to support extracting the range for the surrounding
innermost and outermost f-string. It uses the builder pattern to build
the f-string ranges which is similar to how the comment ranges are
built.

## Test Plan

Add new test cases for f-strings for:
* Tab indentation rule
* Line continuation detection in the indexer
* To get the innermost / outermost f-string range
* All detected f-string ranges

fixes: #7290
---
 .../test/fixtures/pycodestyle/W19.py          |   8 +
 crates/ruff_linter/src/checkers/ast/mod.rs    |   2 +-
 ...ules__pycodestyle__tests__W191_W19.py.snap |  16 ++
 .../ruff_python_index/src/fstring_ranges.rs   |  84 ++++++
 crates/ruff_python_index/src/indexer.rs       | 242 +++++++++++++++---
 crates/ruff_python_index/src/lib.rs           |   1 +
 6 files changed, 317 insertions(+), 36 deletions(-)
 create mode 100644 crates/ruff_python_index/src/fstring_ranges.rs

diff --git a/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py
index 2fdcb6f65ee57..6bb8fb430c9d1 100644
--- a/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py
+++ b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W19.py
@@ -152,3 +152,11 @@ def test_keys(self):
 	multiline string with tab in it, different lines
 '''
 "	single line string with tab in it"
+
+f"test{
+	tab_indented_should_be_flagged
+}	<- this tab is fine"
+
+f"""test{
+	tab_indented_should_be_flagged
+}	<- this tab is fine"""
diff --git a/crates/ruff_linter/src/checkers/ast/mod.rs b/crates/ruff_linter/src/checkers/ast/mod.rs
index 27a8a2a7d31b0..365cb6d16a4f7 100644
--- a/crates/ruff_linter/src/checkers/ast/mod.rs
+++ b/crates/ruff_linter/src/checkers/ast/mod.rs
@@ -183,7 +183,7 @@ impl<'a> Checker<'a> {
 
         // Find the quote character used to start the containing f-string.
         let expr = self.semantic.current_expression()?;
-        let string_range = self.indexer.f_string_range(expr.start())?;
+        let string_range = self.indexer.fstring_ranges().innermost(expr.start())?;
         let trailing_quote = trailing_quote(self.locator.slice(string_range))?;
 
         // Invert the quote character, if it's a single quote.
diff --git a/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap
index ff6c28bc72a70..eb8c4e1643b5d 100644
--- a/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap
+++ b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W191_W19.py.snap
@@ -349,4 +349,20 @@ W19.py:146:1: W191 Indentation contains tabs
 148 | #: W191 - okay
     |
 
+W19.py:157:1: W191 Indentation contains tabs
+    |
+156 | f"test{
+157 |     tab_indented_should_be_flagged
+    | ^^^^ W191
+158 | }   <- this tab is fine"
+    |
+
+W19.py:161:1: W191 Indentation contains tabs
+    |
+160 | f"""test{
+161 |     tab_indented_should_be_flagged
+    | ^^^^ W191
+162 | }   <- this tab is fine"""
+    |
+
 
diff --git a/crates/ruff_python_index/src/fstring_ranges.rs b/crates/ruff_python_index/src/fstring_ranges.rs
new file mode 100644
index 0000000000000..d2d13802a5cf4
--- /dev/null
+++ b/crates/ruff_python_index/src/fstring_ranges.rs
@@ -0,0 +1,84 @@
+use std::collections::BTreeMap;
+
+use ruff_python_parser::Tok;
+use ruff_text_size::{TextRange, TextSize};
+
+/// Stores the ranges of all f-strings in a file sorted by [`TextRange::start`].
+/// There can be multiple overlapping ranges for nested f-strings.
+#[derive(Debug)]
+pub struct FStringRanges {
+    raw: BTreeMap<TextSize, TextRange>,
+}
+
+impl FStringRanges {
+    /// Return the [`TextRange`] of the innermost f-string at the given offset.
+    pub fn innermost(&self, offset: TextSize) -> Option<TextRange> {
+        self.raw
+            .range(..=offset)
+            .rev()
+            .find(|(_, range)| range.contains(offset))
+            .map(|(_, range)| *range)
+    }
+
+    /// Return the [`TextRange`] of the outermost f-string at the given offset.
+    pub fn outermost(&self, offset: TextSize) -> Option<TextRange> {
+        // Explanation of the algorithm:
+        //
+        // ```python
+        // #                                                     v
+        //   f"normal" f"another" f"first {f"second {f"third"} second"} first"
+        // #                                         ^^(1)^^^
+        // #                               ^^^^^^^^^^^^(2)^^^^^^^^^^^^
+        // #                      ^^^^^^^^^^^^^^^^^^^^^(3)^^^^^^^^^^^^^^^^^^^^
+        // #           ^^^(4)^^^^
+        // # ^^^(5)^^^
+        // ```
+        //
+        // The offset is marked with a `v` and the ranges are numbered in the order
+        // they are yielded by the iterator in the reverse order. The algorithm
+        // works as follows:
+        //   1. Skip all ranges that don't contain the offset (1).
+        //   2. Take all ranges that contain the offset (2, 3).
+        //   3. Stop taking ranges when the offset is no longer contained.
+        //   4. Take the last range that contained the offset (3, the outermost).
+        self.raw
+            .range(..=offset)
+            .rev()
+            .skip_while(|(_, range)| !range.contains(offset))
+            .take_while(|(_, range)| range.contains(offset))
+            .last()
+            .map(|(_, range)| *range)
+    }
+
+    #[cfg(test)]
+    pub(crate) fn ranges(&self) -> impl Iterator<Item = TextRange> + '_ {
+        self.raw.values().copied()
+    }
+}
+
+#[derive(Default)]
+pub(crate) struct FStringRangesBuilder {
+    start_locations: Vec<TextSize>,
+    raw: BTreeMap<TextSize, TextRange>,
+}
+
+impl FStringRangesBuilder {
+    pub(crate) fn visit_token(&mut self, token: &Tok, range: TextRange) {
+        match token {
+            Tok::FStringStart => {
+                self.start_locations.push(range.start());
+            }
+            Tok::FStringEnd => {
+                if let Some(start) = self.start_locations.pop() {
+                    self.raw.insert(start, TextRange::new(start, range.end()));
+                }
+            }
+            _ => {}
+        }
+    }
+
+    pub(crate) fn finish(self) -> FStringRanges {
+        debug_assert!(self.start_locations.is_empty());
+        FStringRanges { raw: self.raw }
+    }
+}
diff --git a/crates/ruff_python_index/src/indexer.rs b/crates/ruff_python_index/src/indexer.rs
index 00add310b65b2..a48eab6cb8c2d 100644
--- a/crates/ruff_python_index/src/indexer.rs
+++ b/crates/ruff_python_index/src/indexer.rs
@@ -1,25 +1,26 @@
 //! Struct used to index source code, to enable efficient lookup of tokens that
 //! are omitted from the AST (e.g., commented lines).
 
-use crate::CommentRangesBuilder;
 use ruff_python_ast::Stmt;
 use ruff_python_parser::lexer::LexResult;
-use ruff_python_parser::{StringKind, Tok};
+use ruff_python_parser::Tok;
 use ruff_python_trivia::{
     has_leading_content, has_trailing_content, is_python_whitespace, CommentRanges,
 };
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextRange, TextSize};
 
+use crate::fstring_ranges::{FStringRanges, FStringRangesBuilder};
+use crate::CommentRangesBuilder;
+
 pub struct Indexer {
     comment_ranges: CommentRanges,
 
     /// Stores the start offset of continuation lines.
     continuation_lines: Vec<TextSize>,
 
-    /// The range of all f-string in the source document. The ranges are sorted by their
-    /// [`TextRange::start`] position in increasing order. No two ranges are overlapping.
-    f_string_ranges: Vec<TextRange>,
+    /// The range of all f-string in the source document.
+    fstring_ranges: FStringRanges,
 }
 
 impl Indexer {
@@ -27,8 +28,8 @@ impl Indexer {
         assert!(TextSize::try_from(locator.contents().len()).is_ok());
 
         let mut comment_ranges_builder = CommentRangesBuilder::default();
+        let mut fstring_ranges_builder = FStringRangesBuilder::default();
         let mut continuation_lines = Vec::new();
-        let mut f_string_ranges = Vec::new();
         // Token, end
         let mut prev_end = TextSize::default();
         let mut prev_token: Option<&Tok> = None;
@@ -59,18 +60,10 @@ impl Indexer {
             }
 
             comment_ranges_builder.visit_token(tok, *range);
+            fstring_ranges_builder.visit_token(tok, *range);
 
-            match tok {
-                Tok::Newline | Tok::NonLogicalNewline => {
-                    line_start = range.end();
-                }
-                Tok::String {
-                    kind: StringKind::FString | StringKind::RawFString,
-                    ..
-                } => {
-                    f_string_ranges.push(*range);
-                }
-                _ => {}
+            if matches!(tok, Tok::Newline | Tok::NonLogicalNewline) {
+                line_start = range.end();
             }
 
             prev_token = Some(tok);
@@ -79,7 +72,7 @@ impl Indexer {
         Self {
             comment_ranges: comment_ranges_builder.finish(),
             continuation_lines,
-            f_string_ranges,
+            fstring_ranges: fstring_ranges_builder.finish(),
         }
     }
 
@@ -88,6 +81,11 @@ impl Indexer {
         &self.comment_ranges
     }
 
+    /// Returns the byte offset ranges of f-strings.
+    pub const fn fstring_ranges(&self) -> &FStringRanges {
+        &self.fstring_ranges
+    }
+
     /// Returns the line start positions of continuations (backslash).
     pub fn continuation_line_starts(&self) -> &[TextSize] {
         &self.continuation_lines
@@ -99,22 +97,6 @@ impl Indexer {
         self.continuation_lines.binary_search(&line_start).is_ok()
     }
 
-    /// Return the [`TextRange`] of the f-string containing a given offset.
-    pub fn f_string_range(&self, offset: TextSize) -> Option<TextRange> {
-        let Ok(string_range_index) = self.f_string_ranges.binary_search_by(|range| {
-            if offset < range.start() {
-                std::cmp::Ordering::Greater
-            } else if range.contains(offset) {
-                std::cmp::Ordering::Equal
-            } else {
-                std::cmp::Ordering::Less
-            }
-        }) else {
-            return None;
-        };
-        Some(self.f_string_ranges[string_range_index])
-    }
-
     /// Returns `true` if a statement or expression includes at least one comment.
     pub fn has_comments<T>(&self, node: &T, locator: &Locator) -> bool
     where
@@ -250,7 +232,7 @@ mod tests {
     use ruff_python_parser::lexer::LexResult;
     use ruff_python_parser::{lexer, Mode};
     use ruff_source_file::Locator;
-    use ruff_text_size::TextSize;
+    use ruff_text_size::{TextRange, TextSize};
 
     use crate::Indexer;
 
@@ -333,5 +315,195 @@ import os
                 TextSize::from(116)
             ]
         );
+
+        let contents = r"
+f'foo { 'str1' \
+    'str2' \
+    'str3'
+    f'nested { 'str4'
+        'str5' \
+        'str6'
+    }'
+}'
+"
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+        assert_eq!(
+            indexer.continuation_line_starts(),
+            [
+                // row 1
+                TextSize::new(0),
+                // row 2
+                TextSize::new(17),
+                // row 5
+                TextSize::new(63),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_f_string_ranges() {
+        let contents = r#"
+f"normal f-string"
+f"start {f"inner {f"another"}"} end"
+f"implicit " f"concatenation"
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+        assert_eq!(
+            indexer.fstring_ranges().ranges().collect::<Vec<_>>(),
+            &[
+                TextRange::new(TextSize::from(0), TextSize::from(18)),
+                TextRange::new(TextSize::from(19), TextSize::from(55)),
+                TextRange::new(TextSize::from(28), TextSize::from(49)),
+                TextRange::new(TextSize::from(37), TextSize::from(47)),
+                TextRange::new(TextSize::from(56), TextSize::from(68)),
+                TextRange::new(TextSize::from(69), TextSize::from(85)),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_triple_quoted_f_string_ranges() {
+        let contents = r#"
+f"""
+this is one
+multiline f-string
+"""
+f'''
+and this is
+another
+'''
+f"""
+this is a {f"""nested multiline
+f-string"""}
+"""
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+        assert_eq!(
+            indexer.fstring_ranges().ranges().collect::<Vec<_>>(),
+            &[
+                TextRange::new(TextSize::from(0), TextSize::from(39)),
+                TextRange::new(TextSize::from(40), TextSize::from(68)),
+                TextRange::new(TextSize::from(69), TextSize::from(122)),
+                TextRange::new(TextSize::from(85), TextSize::from(117)),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_fstring_innermost_outermost() {
+        let contents = r#"
+f"no nested f-string"
+
+if True:
+    f"first {f"second {f"third"} second"} first"
+    foo = "normal string"
+
+f"implicit " f"concatenation"
+
+f"first line {
+    foo + f"second line {bar}"
+} third line"
+
+f"""this is a
+multi-line {f"""nested
+f-string"""}
+the end"""
+"#
+        .trim();
+        let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
+        let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
+
+        // For reference, the ranges of the f-strings in the above code are as
+        // follows where the ones inside parentheses are nested f-strings:
+        //
+        // [0..21, (36..80, 45..72, 55..63), 108..120, 121..137, (139..198, 164..184), (200..260, 226..248)]
+
+        for (offset, innermost_range, outermost_range) in [
+            // Inside a normal f-string
+            (
+                TextSize::new(130),
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+            ),
+            // Left boundary
+            (
+                TextSize::new(121),
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+            ),
+            // Right boundary
+            (
+                TextSize::new(136), // End offsets are exclusive
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+                TextRange::new(TextSize::new(121), TextSize::new(137)),
+            ),
+            // "first" left
+            (
+                TextSize::new(40),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+            ),
+            // "second" left
+            (
+                TextSize::new(50),
+                TextRange::new(TextSize::new(45), TextSize::new(72)),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+            ),
+            // "third"
+            (
+                TextSize::new(60),
+                TextRange::new(TextSize::new(55), TextSize::new(63)),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+            ),
+            // "second" right
+            (
+                TextSize::new(70),
+                TextRange::new(TextSize::new(45), TextSize::new(72)),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+            ),
+            // "first" right
+            (
+                TextSize::new(75),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+                TextRange::new(TextSize::new(36), TextSize::new(80)),
+            ),
+            // Single-quoted f-strings spanning across multiple lines
+            (
+                TextSize::new(160),
+                TextRange::new(TextSize::new(139), TextSize::new(198)),
+                TextRange::new(TextSize::new(139), TextSize::new(198)),
+            ),
+            (
+                TextSize::new(170),
+                TextRange::new(TextSize::new(164), TextSize::new(184)),
+                TextRange::new(TextSize::new(139), TextSize::new(198)),
+            ),
+            // Multi-line f-strings
+            (
+                TextSize::new(220),
+                TextRange::new(TextSize::new(200), TextSize::new(260)),
+                TextRange::new(TextSize::new(200), TextSize::new(260)),
+            ),
+            (
+                TextSize::new(240),
+                TextRange::new(TextSize::new(226), TextSize::new(248)),
+                TextRange::new(TextSize::new(200), TextSize::new(260)),
+            ),
+        ] {
+            assert_eq!(
+                indexer.fstring_ranges().innermost(offset).unwrap(),
+                innermost_range
+            );
+            assert_eq!(
+                indexer.fstring_ranges().outermost(offset).unwrap(),
+                outermost_range
+            );
+        }
     }
 }
diff --git a/crates/ruff_python_index/src/lib.rs b/crates/ruff_python_index/src/lib.rs
index 2e585ca5df0e1..288009e90e80c 100644
--- a/crates/ruff_python_index/src/lib.rs
+++ b/crates/ruff_python_index/src/lib.rs
@@ -1,4 +1,5 @@
 mod comment_ranges;
+mod fstring_ranges;
 mod indexer;
 
 pub use comment_ranges::CommentRangesBuilder;