From 3bc07b8e05ee2ac534e59cfa9841aedd5ec6408f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 16:52:54 +0200 Subject: [PATCH 1/8] Fix backslash continuation in untokenize --- Lib/test/test_tokenize.py | 14 ++++++++++++-- Lib/tokenize.py | 25 +++++++++++++++++++------ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 75710db7d05375..5aa3df27c022db 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1803,7 +1803,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1821,7 +1821,7 @@ def test_iter_compat(self): class TestRoundtrip(TestCase): - def check_roundtrip(self, f): + def check_roundtrip(self, f, *, compare_tokens_only=False): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1829,6 +1829,9 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. + If `compare_tokens_only` is False, the exact output of `untokenize` + is compared against the original source code. + When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. @@ -1852,6 +1855,13 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + # Compare the exact output + if not compare_tokens_only: + readline = iter(code.splitlines(keepends=True)).__next__ + # The BOM does not produce a token so there is no way to preserve it + code_without_bom = code.removeprefix(b'\xef\xbb\xbf') + self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline))) + def check_line_extraction(self, f): if isinstance(f, str): code = f.encode('utf-8') diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 1a60fd32a77ea4..8d9716df245cd9 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -169,6 +169,7 @@ def __init__(self): self.prev_row = 1 self.prev_col = 0 self.prev_type = None + self.prev_line = "" self.encoding = None def add_whitespace(self, start): @@ -176,14 +177,27 @@ def add_whitespace(self, start): if row < self.prev_row or row == self.prev_row and col < self.prev_col: raise ValueError("start ({},{}) precedes previous end ({},{})" .format(row, col, self.prev_row, self.prev_col)) - row_offset = row - self.prev_row - if row_offset: - self.tokens.append("\\\n" * row_offset) - self.prev_col = 0 + self.add_backslash_continuation(start) col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + def add_backslash_continuation(self, start): + """Add backslash continuation characters if the row has increased + without encountering a newline token. + + This also inserts the correct amount of whitespace before the backslash. 
+ """ + row = start[0] + row_offset = row - self.prev_row + if row_offset == 0: + return + + line = self.prev_line.rstrip('\\\r\n') + ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) + self.tokens.append(ws + "\\\n" * row_offset) + self.prev_col = 0 + def escape_brackets(self, token): characters = [] consume_until_next_bracket = False @@ -243,8 +257,6 @@ def untokenize(self, iterable): end_line, end_col = end extra_chars = last_line.count("{{") + last_line.count("}}") end = (end_line, end_col + extra_chars) - elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): - self.tokens.append(" ") self.add_whitespace(start) self.tokens.append(token) @@ -253,6 +265,7 @@ def untokenize(self, iterable): self.prev_row += 1 self.prev_col = 0 self.prev_type = tok_type + self.prev_line = line return "".join(self.tokens) def compat(self, token, iterable): From cc2fb5edb094deb38667ac013ffd5027722a84ed Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 17:04:48 +0200 Subject: [PATCH 2/8] Add news entry --- .../next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst new file mode 100644 index 00000000000000..291c5e6f6f2181 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst @@ -0,0 +1,2 @@ +Fix round-trip invariance for backslash continuations in +:func:`tokenize.untokenize`. From ca6293543ca99d0c335ac9d55f98f31bba239e50 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 19:43:42 +0200 Subject: [PATCH 3/8] Fix Windows --- Lib/tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8d9716df245cd9..9ce95a62d961ba 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -193,9 +193,10 @@ def add_backslash_continuation(self, start): if row_offset == 0: return + newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n' line = self.prev_line.rstrip('\\\r\n') ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) - self.tokens.append(ws + "\\\n" * row_offset) + self.tokens.append(ws + f"\\{newline}" * row_offset) self.prev_col = 0 def escape_brackets(self, token): From a595dde6dbdfbca955e4238f58af2c52148d81dc Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 00:33:53 +0100 Subject: [PATCH 4/8] Be more lenient with test_traceback --- Lib/test/test_tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 5aa3df27c022db..bc1fee59b2a4a2 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -2011,7 +2011,8 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - self.check_roundtrip(f) + compare_tokens_only = os.path.basename(testfile) == "test_traceback.py" # Ambiguous backslash continuation + self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) self.check_line_extraction(f) From 6f6a6881b660e28b809e59f875684805c92e3cbc Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:19:07 +0100 Subject: [PATCH 5/8] Check if a file can be compared exactly --- Lib/test/test_tokenize.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 
insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index bc1fee59b2a4a2..439ef23f35f420 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,4 +1,5 @@ import os +import re import token import tokenize import unittest @@ -1803,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1819,9 +1820,25 @@ def test_iter_compat(self): self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ') +def contains_ambiguous_backslash(source): + """Return `True` if the source contains a backslash on a + line by itself. For example: + + a = (1 + \\ + ) + + Code like this cannot be untokenized exactly. This is because + the tokenizer does not produce any tokens for the line containing + the backslash and so there is no way to know its indent. + """ + pattern = re.compile(br'\n\s*\\\s*\r?\n') + return pattern.search(source) is not None + + class TestRoundtrip(TestCase): - def check_roundtrip(self, f, *, compare_tokens_only=False): + def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1829,8 +1846,8 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If `compare_tokens_only` is False, the exact output of `untokenize` - is compared against the original source code. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1855,12 +1872,12 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - # Compare the exact output - if not compare_tokens_only: - readline = iter(code.splitlines(keepends=True)).__next__ - # The BOM does not produce a token so there is no way to preserve it + if not contains_ambiguous_backslash(code): + # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') - self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline))) + readline = iter(code_without_bom.splitlines(keepends=True)).__next__ + untokenized_code = tokenize.untokenize(tokenize.tokenize(readline)) + self.assertEqual(code_without_bom, untokenized_code) def check_line_extraction(self, f): if isinstance(f, str): @@ -2011,8 +2028,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - compare_tokens_only = os.path.basename(testfile) == "test_traceback.py" # Ambiguous backslash continuation - self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) + self.check_roundtrip(f) self.check_line_extraction(f) From 497067ac29402ae7a2a78e862913711bbcbe2810 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:28:25 +0100 Subject: [PATCH 6/8] Simplify regex --- Lib/test/test_tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 439ef23f35f420..480bff743a9f8a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1832,7 +1832,7 @@ def contains_ambiguous_backslash(source): the tokenizer does not produce any tokens for the line containing the backslash and so there is no way to know its indent. """ - pattern = re.compile(br'\n\s*\\\s*\r?\n') + pattern = re.compile(br'\n\s*\\\r?\n') return pattern.search(source) is not None From 4b32c8e9efdf674291c0f501da48dd0009b0b39f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:55:46 +0100 Subject: [PATCH 7/8] Use a list for ambiguous files --- Lib/test/test_tokenize.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 480bff743a9f8a..387ca4758fbb2b 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1804,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1838,7 +1838,7 @@ def contains_ambiguous_backslash(source): class TestRoundtrip(TestCase): - def check_roundtrip(self, f): + def check_roundtrip(self, f, *, compare_tokens_only=False): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1846,8 +1846,8 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If the source code can be untokenized unambiguously, the - untokenized code must match the original code exactly. + If `compare_tokens_only` is False, the exact output of `untokenize` + is compared against the original source code. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1872,7 +1872,9 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - if not contains_ambiguous_backslash(code): + if compare_tokens_only: + self.assertTrue(contains_ambiguous_backslash(code)) + else: # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') readline = iter(code_without_bom.splitlines(keepends=True)).__next__ @@ -2019,6 +2021,8 @@ def test_random_files(self): import glob, random tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) + # Known files which cannot be untokenized exactly + known_ambiguous_files = [os.path.join(tempdir, "test_traceback.py")] if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -2028,7 +2032,8 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - self.check_roundtrip(f) + compare_tokens_only = testfile in known_ambiguous_files + self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) self.check_line_extraction(f) From e2c9bb7af911250be40eda6e8e5bec1931bf508a Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Wed, 30 Oct 2024 19:04:20 +0100 Subject: [PATCH 8/8] Revert "Use a list for ambiguous files" This reverts commit eb2e6f264dc0984f72fa0eb962d23240c4ccfc5b. --- Lib/test/test_tokenize.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 387ca4758fbb2b..480bff743a9f8a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1804,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1838,7 +1838,7 @@ def contains_ambiguous_backslash(source): class TestRoundtrip(TestCase): - def check_roundtrip(self, f, *, compare_tokens_only=False): + def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1846,8 +1846,8 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If `compare_tokens_only` is False, the exact output of `untokenize` - is compared against the original source code. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1872,9 +1872,7 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - if compare_tokens_only: - self.assertTrue(contains_ambiguous_backslash(code)) - else: + if not contains_ambiguous_backslash(code): # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') readline = iter(code_without_bom.splitlines(keepends=True)).__next__ @@ -2021,8 +2019,6 @@ def test_random_files(self): import glob, random tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # Known files which cannot be untokenized exactly - known_ambiguous_files = [os.path.join(tempdir, "test_traceback.py")] if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -2032,8 +2028,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - compare_tokens_only = testfile in known_ambiguous_files - self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) + self.check_roundtrip(f) self.check_line_extraction(f)
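
For reference, below is a minimal standalone sketch (not part of the patch series) of the round-trip property this series fixes: with the new add_backslash_continuation step, untokenize reproduces an indented backslash continuation byte-for-byte, whereas previously the whitespace before the backslash was dropped. The sample source string is made up for illustration, and the assertion only holds on a build that includes this fix.

    import io
    import tokenize

    # Made-up sample: a continuation backslash preceded by a space.
    source = b"x = 1 + \\\n    2\n"

    # tokenize() yields 5-tuples starting with an ENCODING token, so
    # untokenize() returns bytes encoded with that encoding.
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    roundtripped = tokenize.untokenize(tokens)

    # With the fix, the exact bytes (minus any BOM) come back; without it,
    # the space before the backslash was lost ("x = 1 +\\\n    2\n").
    assert roundtripped == source, (roundtripped, source)

A backslash on a line of its own (the contains_ambiguous_backslash case in the tests) still cannot be reproduced exactly, since the tokenizer emits no token for that line and its indentation is therefore unknown.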