From fb63fad8b0a876a7a230b558ccd762830821f8a7 Mon Sep 17 00:00:00 2001
From: Dhruv Manilawala <dhruvmanila@gmail.com>
Date: Tue, 19 Sep 2023 12:08:46 +0530
Subject: [PATCH] Update `W605` to check in f-strings (#7329)

This PR updates the `W605` (invalid-escape-sequence) to check inside
f-strings. It also adds support to report violation on invalid escape
sequence within f-strings w.r.t. the curly braces. So, the following
cases will be identified:

```python
f"\{1}"
f"\{{1}}"
f"{1:\}"
```

The new CPython parser also gives out a syntax warning for such cases:

```
fstring.py:1: SyntaxWarning: invalid escape sequence '\{'
  f"\{1}"
fstring.py:2: SyntaxWarning: invalid escape sequence '\{'
  f"\{{1}}"
fstring.py:3: SyntaxWarning: invalid escape sequence '\}'
  f"{1:\}"
```

Nested f-strings are supported here, so the generated fix is aware of that
and will create an edit for the proper f-string.

Add new test cases for f-strings.

fixes: #7295
---
 .../test/fixtures/pycodestyle/W605_2.py       |  54 +++++
 crates/ruff_linter/src/checkers/tokens.rs     |  16 +-
 .../ruff_linter/src/rules/pycodestyle/mod.rs  |   1 +
 .../rules/invalid_escape_sequence.rs          |  86 +++++--
 ...s__pycodestyle__tests__W605_W605_2.py.snap | 227 ++++++++++++++++++
 5 files changed, 355 insertions(+), 29 deletions(-)
 create mode 100644 crates/ruff_linter/resources/test/fixtures/pycodestyle/W605_2.py
 create mode 100644 crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W605_W605_2.py.snap

diff --git a/crates/ruff_linter/resources/test/fixtures/pycodestyle/W605_2.py b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W605_2.py
new file mode 100644
index 0000000000000..b34ad587c46d5
--- /dev/null
+++ b/crates/ruff_linter/resources/test/fixtures/pycodestyle/W605_2.py
@@ -0,0 +1,54 @@
+# Same as `W605_0.py` but using f-strings instead.
+
+#: W605:1:10
+regex = f'\.png$'
+
+#: W605:2:1
+regex = f'''
+\.png$
+'''
+
+#: W605:2:6
+f(
+    f'\_'
+)
+
+#: W605:4:6
+f"""
+multi-line
+literal
+with \_ somewhere
+in the middle
+"""
+
+#: W605:1:38
+value = f'new line\nand invalid escape \_ here'
+
+
+#: Okay
+regex = fr'\.png$'
+regex = f'\\.png$'
+regex = fr'''
+\.png$
+'''
+regex = fr'''
+\\.png$
+'''
+s = f'\\'
+regex = f'\w'  # noqa
+regex = f'''
+\w
+'''  # noqa
+
+regex = f'\\\_'
+value = f'\{{1}}'
+value = f'\{1}'
+value = f'{1:\}'
+value = f"{f"\{1}"}"
+value = rf"{f"\{1}"}"
+
+# Okay
+value = rf'\{{1}}'
+value = rf'\{1}'
+value = rf'{1:\}'
+value = f"{rf"\{1}"}"
diff --git a/crates/ruff_linter/src/checkers/tokens.rs b/crates/ruff_linter/src/checkers/tokens.rs
index c1a6314f94dc7..d6e240574548f 100644
--- a/crates/ruff_linter/src/checkers/tokens.rs
+++ b/crates/ruff_linter/src/checkers/tokens.rs
@@ -77,14 +77,14 @@ pub(crate) fn check_tokens(
 
     if settings.rules.enabled(Rule::InvalidEscapeSequence) {
         for (tok, range) in tokens.iter().flatten() {
-            if tok.is_string() {
-                pycodestyle::rules::invalid_escape_sequence(
-                    &mut diagnostics,
-                    locator,
-                    *range,
-                    settings.rules.should_fix(Rule::InvalidEscapeSequence),
-                );
-            }
+            pycodestyle::rules::invalid_escape_sequence(
+                &mut diagnostics,
+                locator,
+                indexer,
+                tok,
+                *range,
+                settings.rules.should_fix(Rule::InvalidEscapeSequence),
+            );
         }
     }
 
diff --git a/crates/ruff_linter/src/rules/pycodestyle/mod.rs b/crates/ruff_linter/src/rules/pycodestyle/mod.rs
index c6387432637e8..18b44f63aea02 100644
--- a/crates/ruff_linter/src/rules/pycodestyle/mod.rs
+++ b/crates/ruff_linter/src/rules/pycodestyle/mod.rs
@@ -28,6 +28,7 @@ mod tests {
     #[test_case(Rule::BlankLineWithWhitespace, Path::new("W29.py"))]
     #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_0.py"))]
     #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_1.py"))]
+    #[test_case(Rule::InvalidEscapeSequence, Path::new("W605_2.py"))]
     #[test_case(Rule::LineTooLong, Path::new("E501.py"))]
     #[test_case(Rule::MixedSpacesAndTabs, Path::new("E101.py"))]
     #[test_case(Rule::ModuleImportNotAtTopOfFile, Path::new("E40.py"))]
diff --git a/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs b/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
index 8cf0466c5cdb6..ff671365a3a83 100644
--- a/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
+++ b/crates/ruff_linter/src/rules/pycodestyle/rules/invalid_escape_sequence.rs
@@ -2,7 +2,8 @@ use memchr::memchr_iter;
 
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
-use ruff_python_ast::str::{leading_quote, trailing_quote};
+use ruff_python_index::Indexer;
+use ruff_python_parser::Tok;
 use ruff_source_file::Locator;
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 
@@ -45,29 +46,32 @@ impl AlwaysAutofixableViolation for InvalidEscapeSequence {
 pub(crate) fn invalid_escape_sequence(
     diagnostics: &mut Vec<Diagnostic>,
     locator: &Locator,
-    range: TextRange,
+    indexer: &Indexer,
+    token: &Tok,
+    token_range: TextRange,
     autofix: bool,
 ) {
-    let text = locator.slice(range);
-
-    // Determine whether the string is single- or triple-quoted.
-    let Some(leading_quote) = leading_quote(text) else {
-        return;
-    };
-    let Some(trailing_quote) = trailing_quote(text) else {
-        return;
+    let token_source_code = match token {
+        Tok::FStringMiddle { value, is_raw } => {
+            if *is_raw {
+                return;
+            }
+            value.as_str()
+        }
+        Tok::String { kind, .. } => {
+            if kind.is_raw() {
+                return;
+            }
+            locator.slice(token_range)
+        }
+        _ => return,
     };
-    let body = &text[leading_quote.len()..text.len() - trailing_quote.len()];
-
-    if leading_quote.contains(['r', 'R']) {
-        return;
-    }
 
     let mut contains_valid_escape_sequence = false;
     let mut invalid_escape_sequence = Vec::new();
 
     let mut prev = None;
-    let bytes = body.as_bytes();
+    let bytes = token_source_code.as_bytes();
     for i in memchr_iter(b'\\', bytes) {
         // If the previous character was also a backslash, skip.
         if prev.is_some_and(|prev| prev == i - 1) {
@@ -77,9 +81,38 @@ pub(crate) fn invalid_escape_sequence(
 
         prev = Some(i);
 
-        let Some(next_char) = body[i + 1..].chars().next() else {
+        let next_char = match token_source_code[i + 1..].chars().next() {
+            Some(next_char) => next_char,
+            None if token.is_f_string_middle() => {
+                // If we're at the end of a f-string middle token, the next character
+                // is actually emitted as a different token. For example,
+                //
+                // ```python
+                // f"\{1}"
+                // ```
+                //
+                // is lexed as `FStringMiddle('\\')` and `LBrace` (ignoring irrelevant
+                // tokens), so we need to check the next character in the source code.
+                //
+                // Now, if we're at the end of the f-string itself, the lexer wouldn't
+                // have emitted the `FStringMiddle` token in the first place. For example,
+                //
+                // ```python
+                // f"foo\"
+                // ```
+                //
+                // Here, there won't be any `FStringMiddle` because it's an unterminated
+                // f-string. This means that if there's a `FStringMiddle` token and we
+                // encounter a `\` character, then the next character is always going to
+                // be part of the f-string.
+                if let Some(next_char) = locator.after(token_range.end()).chars().next() {
+                    next_char
+                } else {
+                    continue;
+                }
+            }
             // If we're at the end of the file, skip.
-            continue;
+            None => continue,
         };
 
         // If we're at the end of line, skip.
@@ -120,7 +153,7 @@ pub(crate) fn invalid_escape_sequence(
             continue;
         }
 
-        let location = range.start() + leading_quote.text_len() + TextSize::try_from(i).unwrap();
+        let location = token_range.start() + TextSize::try_from(i).unwrap();
         let range = TextRange::at(location, next_char.text_len() + TextSize::from(1));
         invalid_escape_sequence.push(Diagnostic::new(InvalidEscapeSequence(next_char), range));
     }
@@ -135,14 +168,25 @@ pub(crate) fn invalid_escape_sequence(
                 )));
             }
         } else {
+            let tok_start = if token.is_f_string_middle() {
+                // SAFETY: If this is a `FStringMiddle` token, then the indexer
+                // must have the f-string range.
+                indexer
+                    .fstring_ranges()
+                    .innermost(token_range.start())
+                    .unwrap()
+                    .start()
+            } else {
+                token_range.start()
+            };
             // Turn into raw string.
             for diagnostic in &mut invalid_escape_sequence {
                 // If necessary, add a space between any leading keyword (`return`, `yield`,
                 // `assert`, etc.) and the string. For example, `return"foo"` is valid, but
                 // `returnr"foo"` is not.
                 diagnostic.set_fix(Fix::automatic(Edit::insertion(
-                    pad_start("r".to_string(), range.start(), locator),
-                    range.start(),
+                    pad_start("r".to_string(), tok_start, locator),
+                    tok_start,
                 )));
             }
         }
diff --git a/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W605_W605_2.py.snap b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W605_W605_2.py.snap
new file mode 100644
index 0000000000000..c50eae3c81f98
--- /dev/null
+++ b/crates/ruff_linter/src/rules/pycodestyle/snapshots/ruff_linter__rules__pycodestyle__tests__W605_W605_2.py.snap
@@ -0,0 +1,227 @@
+---
+source: crates/ruff/src/rules/pycodestyle/mod.rs
+---
+W605_2.py:4:11: W605 [*] Invalid escape sequence: `\.`
+  |
+3 | #: W605:1:10
+4 | regex = f'\.png$'
+  |           ^^ W605
+5 | 
+6 | #: W605:2:1
+  |
+  = help: Add backslash to escape sequence
+
+ℹ Fix
+1 1 | # Same as `W605_0.py` but using f-strings instead.
+2 2 | 
+3 3 | #: W605:1:10
+4   |-regex = f'\.png$'
+  4 |+regex = rf'\.png$'
+5 5 | 
+6 6 | #: W605:2:1
+7 7 | regex = f'''
+
+W605_2.py:8:1: W605 [*] Invalid escape sequence: `\.`
+  |
+6 | #: W605:2:1
+7 | regex = f'''
+8 | \.png$
+  | ^^ W605
+9 | '''
+  |
+  = help: Add backslash to escape sequence
+
+ℹ Fix
+4 4 | regex = f'\.png$'
+5 5 | 
+6 6 | #: W605:2:1
+7   |-regex = f'''
+  7 |+regex = rf'''
+8 8 | \.png$
+9 9 | '''
+10 10 | 
+
+W605_2.py:13:7: W605 [*] Invalid escape sequence: `\_`
+   |
+11 | #: W605:2:6
+12 | f(
+13 |     f'\_'
+   |       ^^ W605
+14 | )
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+10 10 | 
+11 11 | #: W605:2:6
+12 12 | f(
+13    |-    f'\_'
+   13 |+    rf'\_'
+14 14 | )
+15 15 | 
+16 16 | #: W605:4:6
+
+W605_2.py:20:6: W605 [*] Invalid escape sequence: `\_`
+   |
+18 | multi-line
+19 | literal
+20 | with \_ somewhere
+   |      ^^ W605
+21 | in the middle
+22 | """
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+14 14 | )
+15 15 | 
+16 16 | #: W605:4:6
+17    |-f"""
+   17 |+rf"""
+18 18 | multi-line
+19 19 | literal
+20 20 | with \_ somewhere
+
+W605_2.py:25:40: W605 [*] Invalid escape sequence: `\_`
+   |
+24 | #: W605:1:38
+25 | value = f'new line\nand invalid escape \_ here'
+   |                                        ^^ W605
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+22 22 | """
+23 23 | 
+24 24 | #: W605:1:38
+25    |-value = f'new line\nand invalid escape \_ here'
+   25 |+value = f'new line\nand invalid escape \\_ here'
+26 26 | 
+27 27 | 
+28 28 | #: Okay
+
+W605_2.py:43:13: W605 [*] Invalid escape sequence: `\_`
+   |
+41 | '''  # noqa
+42 | 
+43 | regex = f'\\\_'
+   |             ^^ W605
+44 | value = f'\{{1}}'
+45 | value = f'\{1}'
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+40 40 | \w
+41 41 | '''  # noqa
+42 42 | 
+43    |-regex = f'\\\_'
+   43 |+regex = f'\\\\_'
+44 44 | value = f'\{{1}}'
+45 45 | value = f'\{1}'
+46 46 | value = f'{1:\}'
+
+W605_2.py:44:11: W605 [*] Invalid escape sequence: `\{`
+   |
+43 | regex = f'\\\_'
+44 | value = f'\{{1}}'
+   |           ^^ W605
+45 | value = f'\{1}'
+46 | value = f'{1:\}'
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+41 41 | '''  # noqa
+42 42 | 
+43 43 | regex = f'\\\_'
+44    |-value = f'\{{1}}'
+   44 |+value = rf'\{{1}}'
+45 45 | value = f'\{1}'
+46 46 | value = f'{1:\}'
+47 47 | value = f"{f"\{1}"}"
+
+W605_2.py:45:11: W605 [*] Invalid escape sequence: `\{`
+   |
+43 | regex = f'\\\_'
+44 | value = f'\{{1}}'
+45 | value = f'\{1}'
+   |           ^^ W605
+46 | value = f'{1:\}'
+47 | value = f"{f"\{1}"}"
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+42 42 | 
+43 43 | regex = f'\\\_'
+44 44 | value = f'\{{1}}'
+45    |-value = f'\{1}'
+   45 |+value = rf'\{1}'
+46 46 | value = f'{1:\}'
+47 47 | value = f"{f"\{1}"}"
+48 48 | value = rf"{f"\{1}"}"
+
+W605_2.py:46:14: W605 [*] Invalid escape sequence: `\}`
+   |
+44 | value = f'\{{1}}'
+45 | value = f'\{1}'
+46 | value = f'{1:\}'
+   |              ^^ W605
+47 | value = f"{f"\{1}"}"
+48 | value = rf"{f"\{1}"}"
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+43 43 | regex = f'\\\_'
+44 44 | value = f'\{{1}}'
+45 45 | value = f'\{1}'
+46    |-value = f'{1:\}'
+   46 |+value = rf'{1:\}'
+47 47 | value = f"{f"\{1}"}"
+48 48 | value = rf"{f"\{1}"}"
+49 49 | 
+
+W605_2.py:47:14: W605 [*] Invalid escape sequence: `\{`
+   |
+45 | value = f'\{1}'
+46 | value = f'{1:\}'
+47 | value = f"{f"\{1}"}"
+   |              ^^ W605
+48 | value = rf"{f"\{1}"}"
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+44 44 | value = f'\{{1}}'
+45 45 | value = f'\{1}'
+46 46 | value = f'{1:\}'
+47    |-value = f"{f"\{1}"}"
+   47 |+value = f"{rf"\{1}"}"
+48 48 | value = rf"{f"\{1}"}"
+49 49 | 
+50 50 | # Okay
+
+W605_2.py:48:15: W605 [*] Invalid escape sequence: `\{`
+   |
+46 | value = f'{1:\}'
+47 | value = f"{f"\{1}"}"
+48 | value = rf"{f"\{1}"}"
+   |               ^^ W605
+49 | 
+50 | # Okay
+   |
+   = help: Add backslash to escape sequence
+
+ℹ Fix
+45 45 | value = f'\{1}'
+46 46 | value = f'{1:\}'
+47 47 | value = f"{f"\{1}"}"
+48    |-value = rf"{f"\{1}"}"
+   48 |+value = rf"{rf"\{1}"}"
+49 49 | 
+50 50 | # Okay
+51 51 | value = rf'\{{1}}'
+
+