From 3ce123379608544de7fc5ac0e75fc46afae2b64a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 8 Feb 2024 16:00:27 +0000 Subject: [PATCH 1/8] gh-115154: Fix untokenize handling of unicode named literals Signed-off-by: Pablo Galindo --- Lib/test/test_tokenize.py | 2 ++ Lib/tokenize.py | 8 +++----- .../2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 21e8637a7ca905..8392f543be15d8 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1877,6 +1877,8 @@ def test_roundtrip(self): " print('Can not import' # comment2\n)" "else: print('Loaded')\n") + self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") + def test_continuation(self): # Balancing continuation self.check_roundtrip("a = (3,4, \n" diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 0ab1893d42f72f..f8ac40116c9bd7 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -215,9 +215,8 @@ def untokenize(self, iterable): elif tok_type == FSTRING_MIDDLE: if '{' in token or '}' in token: end_line, end_col = end - end = (end_line, end_col + token.count('{') + token.count('}')) - token = re.sub('{', '{{', token) - token = re.sub('}', '}}', token) + token = re.sub(r'(? Date: Sun, 11 Feb 2024 12:56:45 +0000 Subject: [PATCH 2/8] fixup! gh-115154: Fix untokenize handling of unicode named literals --- Lib/tokenize.py | 8 +++++--- lol.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 lol.py diff --git a/Lib/tokenize.py b/Lib/tokenize.py index f8ac40116c9bd7..0ab1893d42f72f 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -215,8 +215,9 @@ def untokenize(self, iterable): elif tok_type == FSTRING_MIDDLE: if '{' in token or '}' in token: end_line, end_col = end - token = re.sub(r'(?', 'exec') From 31f6ff41a802746f11af9bd781f34d48ad796fc9 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 11 Feb 2024 13:37:14 +0000 Subject: [PATCH 3/8] Fix tokenizing of test_fstring --- Lib/test/test_tokenize.py | 6 ++--- Lib/tokenize.py | 49 ++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 8392f543be15d8..14f340d5383199 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1878,6 +1878,9 @@ def test_roundtrip(self): "else: print('Loaded')\n") self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{SNAKE}'") + def test_continuation(self): # Balancing continuation @@ -1913,9 +1916,6 @@ def test_random_files(self): tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # TODO: Remove this once we can untokenize PEP 701 syntax - testfiles.remove(os.path.join(tempdir, "test_fstring.py")) - if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 0ab1893d42f72f..08321a49a577e0 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -168,6 +168,7 @@ def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.prev_type = None self.encoding = None def add_whitespace(self, start): @@ -182,6 +183,23 @@ def add_whitespace(self, start): col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + + def 
escape_brackets(self, token): + characters = [] + consume_until_next_bracket = False + for character in token: + if character == "}": + if consume_until_next_bracket: + consume_until_next_bracket = False + else: + characters.append(character) + if character == "{": + if characters[-2:] != ["\\", "N"]: + characters.append(character) + else: + consume_until_next_bracket = True + characters.append(character) + return "".join(characters) def untokenize(self, iterable): it = iter(iterable) @@ -214,11 +232,13 @@ def untokenize(self, iterable): startline = False elif tok_type == FSTRING_MIDDLE: if '{' in token or '}' in token: + token = self.escape_brackets(token) + last_line = token.splitlines()[-1] end_line, end_col = end - end = (end_line, end_col + token.count('{') + token.count('}')) - token = re.sub('{', '{{', token) - token = re.sub('}', '}}', token) - + extra_chars = last_line.count("{{") + last_line.count("}}") + end = (end_line, end_col + extra_chars) + elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") self.add_whitespace(start) self.tokens.append(token) @@ -226,6 +246,7 @@ def untokenize(self, iterable): if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + self.prev_type = tok_type return "".join(self.tokens) def compat(self, token, iterable): @@ -233,6 +254,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False + in_fstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -250,7 +272,11 @@ def compat(self, token, iterable): prevstring = True else: prevstring = False - + + if toknum == FSTRING_START: + in_fstring += 1 + elif toknum == FSTRING_END: + in_fstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -263,11 +289,18 @@ def compat(self, token, iterable): toks_append(indents[-1]) startline = False elif toknum == FSTRING_MIDDLE: - if '{' in tokval or '}' in tokval: - tokval = re.sub('{', '{{', tokval) - tokval = re.sub('}', '}}', tokval) + tokval = self.escape_brackets(tokval) + + # Insert a space between two consecutive brackets if e are in an f-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: + tokval = ' ' + tokval + + # Insert a space between two consecutive f-strings + if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") toks_append(tokval) + self.prev_type = toknum def untokenize(iterable): From ce7ddd7b6a45d9209349d9a3d965e97a2c8b1692 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 11 Feb 2024 13:47:03 +0000 Subject: [PATCH 4/8] Fix linting --- Lib/test/test_tokenize.py | 4 ++-- Lib/tokenize.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 14f340d5383199..f787b2c0c93e47 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1878,8 +1878,8 @@ def test_roundtrip(self): "else: print('Loaded')\n") self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") - self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'") - self.check_roundtrip(r"f'\\N{SNAKE}'") + self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{SNAKE}'") def test_continuation(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 08321a49a577e0..39c65be3c5c208 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -183,14 +183,14 @@ def add_whitespace(self, start): col_offset = col - 
self.prev_col if col_offset: self.tokens.append(" " * col_offset) - + def escape_brackets(self, token): characters = [] consume_until_next_bracket = False for character in token: if character == "}": if consume_until_next_bracket: - consume_until_next_bracket = False + consume_until_next_bracket = False else: characters.append(character) if character == "{": @@ -272,7 +272,7 @@ def compat(self, token, iterable): prevstring = True else: prevstring = False - + if toknum == FSTRING_START: in_fstring += 1 elif toknum == FSTRING_END: From c9b33aedccd536cc4e37b09bda3a0a3e6fa32468 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Sun, 11 Feb 2024 15:43:55 +0000 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: Alex Waygood --- Lib/tokenize.py | 2 +- .../2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 39c65be3c5c208..81db518e0aa3c8 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -291,7 +291,7 @@ def compat(self, token, iterable): elif toknum == FSTRING_MIDDLE: tokval = self.escape_brackets(tokval) - # Insert a space between two consecutive brackets if e are in an f-string + # Insert a space between two consecutive brackets if we are in an f-string if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: tokval = ' ' + tokval diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst index 89184ec95e6292..045596bfcdca43 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst @@ -1,2 +1,2 @@ -Fix a bug that was causing the :func:`tokenize.untokenize` function to not -handle correctly unicode named literals. Patch by Pablo Galindo +Fix a bug that was causing the :func:`tokenize.untokenize` function to +handle unicode named literals incorrectly. Patch by Pablo Galindo From 59b406c1e44a7dc8b6b5b1896136db51c82ac8bf Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Sun, 11 Feb 2024 15:45:29 +0000 Subject: [PATCH 6/8] fixup! 
Apply suggestions from code review

---
 Lib/test/test_tokenize.py | 18 ++++++++++++++++++
 lol.py                    |  1 -
 2 files changed, 18 insertions(+), 1 deletion(-)
 delete mode 100644 lol.py

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index f787b2c0c93e47..e264f10286f3f1 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1880,6 +1880,24 @@ def test_roundtrip(self):
         self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
         self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'")
         self.check_roundtrip(r"f'\\N{SNAKE}'")
+        cases = [
+            """
+if 1:
+    "foo"
+"bar"
+""",
+            """
+if 1:
+    ("foo"
+    "bar")
+""",
+            """
+if 1:
+    "foo"
+        "bar"
+""" ]
+        for case in cases:
+            self.check_roundtrip(case)
 
 
     def test_continuation(self):
diff --git a/lol.py b/lol.py
deleted file mode 100644
index 9d1d64baa646ab..00000000000000
--- a/lol.py
+++ /dev/null
@@ -1 +0,0 @@
-compile('match y:\n case e(e=v,v,', '', 'exec')

From 4ab43521a435827f2227e00bc9e160a8d3149154 Mon Sep 17 00:00:00 2001
From: Pablo Galindo
Date: Mon, 12 Feb 2024 15:59:06 +0000
Subject: [PATCH 7/8] Fix escaped \N

---
 Lib/test/test_tokenize.py | 1 +
 Lib/tokenize.py           | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index e264f10286f3f1..ec5d0a7b6f59cd 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1880,6 +1880,7 @@ def test_roundtrip(self):
         self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
         self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'")
         self.check_roundtrip(r"f'\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\N{{SNAKE}}'")
         cases = [
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 81db518e0aa3c8..99815cb258a293 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -194,7 +194,7 @@ def escape_brackets(self, token):
                 else:
                     characters.append(character)
             if character == "{":
-                if characters[-2:] != ["\\", "N"]:
+                if characters[-2:] != ["\\", "N"] or characters[-3:] == ["\\", "\\", "N"]:
                     characters.append(character)
                 else:
                     consume_until_next_bracket = True

From b630f684078b8b03ab5f25f00f9f1b16b7e108e0 Mon Sep 17 00:00:00 2001
From: Pablo Galindo
Date: Mon, 19 Feb 2024 14:34:04 +0000
Subject: [PATCH 8/8] Fix multiple backslashes

---
 Lib/test/test_tokenize.py | 15 ++++++++++++++-
 Lib/tokenize.py           |  8 +++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ec5d0a7b6f59cd..4428e8cea1964c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1878,9 +1878,22 @@ def test_roundtrip(self):
                               "else: print('Loaded')\n")
 
         self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
-        self.check_roundtrip(r"f'\\N{EXCLAMATION MARK}'")
         self.check_roundtrip(r"f'\\N{SNAKE}'")
         self.check_roundtrip(r"f'\\N{{SNAKE}}'")
+        self.check_roundtrip(r"f'\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'")
+
+        self.check_roundtrip(r"f'\\N{1}'")
+        self.check_roundtrip(r"f'\\\\N{2}'")
+        self.check_roundtrip(r"f'\\\\\\N{3}'")
+        self.check_roundtrip(r"f'\\\\\\\\N{4}'")
+
+        self.check_roundtrip(r"f'\\N{{'")
+        self.check_roundtrip(r"f'\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\\\N{{'")
         cases = [
             """
 if 1:
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 99815cb258a293..7f418bb7a1b37f 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -194,7 +194,13 @@ def escape_brackets(self, token):
                 else:
                     characters.append(character)
             if character == "{":
-                if characters[-2:] != ["\\", "N"] or characters[-3:] == ["\\", "\\", "N"]:
+                n_backslashes = sum(
+                    1 for char in _itertools.takewhile(
+                        "\\".__eq__,
+                        characters[-2::-1]
+                    )
+                )
+                if n_backslashes % 2 == 0:
                     characters.append(character)
                 else:
                     consume_until_next_bracket = True
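
A quick way to exercise the behaviour this series fixes, outside the test
suite, is to round-trip a few f-strings through tokenize()/untokenize() and
check that the source comes back unchanged. The sketch below is illustrative
only, not part of the patch: it assumes an interpreter with the PEP 701
tokenizer (3.12+) and these commits applied, and the `roundtrip` helper is a
name made up for the example.

    import io
    import tokenize

    def roundtrip(source):
        # tokenize() pulls bytes through a readline callable; with full
        # 5-tuple tokens, untokenize() reconstructs the input source.
        readline = io.BytesIO(source.encode("utf-8")).readline
        tokens = list(tokenize.tokenize(readline))
        return tokenize.untokenize(tokens).decode("utf-8")

    cases = [
        "f'\\N{EXCLAMATION MARK}'",  # named escape: braces must not be doubled
        r"f'\\N{SNAKE}'",            # escaped backslash: {SNAKE} is a replacement field
        r"f'\\\N{SNAKE}'",           # odd backslash run: a named escape again
    ]
    for case in cases:
        result = roundtrip(case)
        # Before this fix, the first case came back as
        # f'\N{{EXCLAMATION MARK}}' -- a different string literal.
        assert result == case, (case, result)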
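The parity rule that the last commit settles on can also be checked in
isolation. The helper below is a hand-written mirror of the check inside
escape_brackets(), written for illustration only (it is not the patched
code itself): a '{' inside an FSTRING_MIDDLE token opens a \N{...} named
escape only when the 'N' before it sits behind an odd run of backslashes;
with an even run every backslash is itself escaped, so the brace is a
literal one that untokenize must double back into '{{'.

    import itertools

    def brace_is_literal(prefix):
        # prefix is the FSTRING_MIDDLE text already seen before a '{'.
        # Like characters[-2::-1] in the patch, skip the character just
        # before the brace (normally the 'N') and count the backslash run.
        backslashes = sum(
            1 for ch in itertools.takewhile("\\".__eq__, reversed(prefix[:-1]))
        )
        return backslashes % 2 == 0

    for prefix in ("\\N", "\\\\N", "\\\\\\N", "\\\\\\\\N"):
        brace = prefix + "{"
        kind = "literal, double it" if brace_is_literal(prefix) else "named escape, keep single"
        print(f"{brace:10} -> {kind}")
    # Output:
    #   \N{        -> named escape, keep single
    #   \\N{       -> literal, double it
    #   \\\N{      -> named escape, keep single
    #   \\\\N{     -> literal, double it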