Skip to content

Commit

Permalink
pythongh-112943: Correctly compute end offsets for multiline tokens in the tokenize module (python#112949)
Browse files Browse the repository at this point in the history
  • Loading branch information
pablogsal authored and aisk committed Feb 11, 2024
1 parent f236288 commit 79e2905
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 6 deletions.
10 changes: 10 additions & 0 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,16 @@ def test_string(self):
OP '}' (3, 0) (3, 1)
FSTRING_MIDDLE '__' (3, 1) (3, 3)
FSTRING_END "'" (3, 3) (3, 4)
""")

self.check_tokenize("""\
'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
aktualni pracownicy, obecni pracownicy'''
""", """\
INDENT ' ' (1, 0) (1, 4)
STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
NEWLINE '\\n' (2, 45) (2, 46)
DEDENT '' (3, 0) (3, 0)
""")

def test_function(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Correctly compute end column offsets for multiline tokens in the
:mod:`tokenize` module. Patch by Pablo Galindo
16 changes: 11 additions & 5 deletions Parser/pegen.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,8 @@ _PyPegen_interactive_exit(Parser *p)
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
{
const char *str = PyUnicode_AsUTF8(line);
if (!str) {
return -1;
}
Py_ssize_t len = strlen(str);
if (col_offset > len + 1) {
col_offset = len + 1;
Expand All @@ -39,6 +35,16 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
return size;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
    /* Decode the line object to its UTF-8 representation; a failed
     * decode is reported to the caller as -1 (error already set). */
    const char *utf8 = PyUnicode_AsUTF8(line);
    return (utf8 == NULL)
               ? -1
               : _PyPegen_byte_offset_to_character_offset_raw(utf8, col_offset);
}

// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
Expand Down
1 change: 1 addition & 0 deletions Parser/pegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ expr_ty _PyPegen_name_token(Parser *p);
expr_ty _PyPegen_number_token(Parser *p);
void *_PyPegen_string_token(Parser *p);
Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);

// Error handling functions and APIs
typedef enum {
Expand Down
2 changes: 1 addition & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ tokenizeriter_next(tokenizeriterobject *it)
col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
}
if (token.end != NULL && token.end >= it->tok->line_start) {
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
}

if (it->tok->tok_extra_tokens) {
Expand Down

0 comments on commit 79e2905

Please sign in to comment.