diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 75710db7d05375..480bff743a9f8a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,4 +1,5 @@ import os +import re import token import tokenize import unittest @@ -1819,6 +1820,22 @@ def test_iter_compat(self): self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ') +def contains_ambiguous_backslash(source): + """Return `True` if the source contains a backslash on a + line by itself. For example: + + a = (1 + \\ + ) + + Code like this cannot be untokenized exactly. This is because + the tokenizer does not produce any tokens for the line containing + the backslash and so there is no way to know its indent. + """ + pattern = re.compile(br'\n\s*\\\r?\n') + return pattern.search(source) is not None + + class TestRoundtrip(TestCase): def check_roundtrip(self, f): @@ -1829,6 +1846,9 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. + When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. @@ -1852,6 +1872,13 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + if not contains_ambiguous_backslash(code): + # The BOM does not produce a token so there is no way to preserve it. + code_without_bom = code.removeprefix(b'\xef\xbb\xbf') + readline = iter(code_without_bom.splitlines(keepends=True)).__next__ + untokenized_code = tokenize.untokenize(tokenize.tokenize(readline)) + self.assertEqual(code_without_bom, untokenized_code) + def check_line_extraction(self, f): if isinstance(f, str): code = f.encode('utf-8') diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 1a60fd32a77ea4..9ce95a62d961ba 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -169,6 +169,7 @@ def __init__(self): self.prev_row = 1 self.prev_col = 0 self.prev_type = None + self.prev_line = "" self.encoding = None def add_whitespace(self, start): @@ -176,14 +177,28 @@ def add_whitespace(self, start): if row < self.prev_row or row == self.prev_row and col < self.prev_col: raise ValueError("start ({},{}) precedes previous end ({},{})" .format(row, col, self.prev_row, self.prev_col)) - row_offset = row - self.prev_row - if row_offset: - self.tokens.append("\\\n" * row_offset) - self.prev_col = 0 + self.add_backslash_continuation(start) col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + def add_backslash_continuation(self, start): + """Add backslash continuation characters if the row has increased + without encountering a newline token. + + This also inserts the correct amount of whitespace before the backslash. + """ + row = start[0] + row_offset = row - self.prev_row + if row_offset == 0: + return + + newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n' + line = self.prev_line.rstrip('\\\r\n') + ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) + self.tokens.append(ws + f"\\{newline}" * row_offset) + self.prev_col = 0 + def escape_brackets(self, token): characters = [] consume_until_next_bracket = False @@ -243,8 +258,6 @@ def untokenize(self, iterable): end_line, end_col = end extra_chars = last_line.count("{{") + last_line.count("}}") end = (end_line, end_col + extra_chars) - elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): - self.tokens.append(" ") self.add_whitespace(start) self.tokens.append(token) @@ -253,6 +266,7 @@ def untokenize(self, iterable): self.prev_row += 1 self.prev_col = 0 self.prev_type = tok_type + self.prev_line = line return "".join(self.tokens) def compat(self, token, iterable): diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst new file mode 100644 index 00000000000000..291c5e6f6f2181 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst @@ -0,0 +1,2 @@ +Fix round-trip invariance for backslash continuations in +:func:`tokenize.untokenize`.