From 3bc07b8e05ee2ac534e59cfa9841aedd5ec6408f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 16:52:54 +0200 Subject: [PATCH 1/8] Fix backslash continuation in untokenize --- Lib/test/test_tokenize.py | 14 ++++++++++++-- Lib/tokenize.py | 25 +++++++++++++++++++------ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 75710db7d05375..5aa3df27c022db 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1803,7 +1803,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1821,7 +1821,7 @@ def test_iter_compat(self): class TestRoundtrip(TestCase): - def check_roundtrip(self, f): + def check_roundtrip(self, f, *, compare_tokens_only=False): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1829,6 +1829,9 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. + If `compare_tokens_only` is False, the exact output of `untokenize` + is compared against the original source code. + When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. @@ -1852,6 +1855,13 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + # Compare the exact output + if not compare_tokens_only: + readline = iter(code.splitlines(keepends=True)).__next__ + # The BOM does not produce a token so there is no way to preserve it + code_without_bom = code.removeprefix(b'\xef\xbb\xbf') + self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline))) + def check_line_extraction(self, f): if isinstance(f, str): code = f.encode('utf-8') diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 1a60fd32a77ea4..8d9716df245cd9 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -169,6 +169,7 @@ def __init__(self): self.prev_row = 1 self.prev_col = 0 self.prev_type = None + self.prev_line = "" self.encoding = None def add_whitespace(self, start): @@ -176,14 +177,27 @@ def add_whitespace(self, start): if row < self.prev_row or row == self.prev_row and col < self.prev_col: raise ValueError("start ({},{}) precedes previous end ({},{})" .format(row, col, self.prev_row, self.prev_col)) - row_offset = row - self.prev_row - if row_offset: - self.tokens.append("\\\n" * row_offset) - self.prev_col = 0 + self.add_backslash_continuation(start) col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + def add_backslash_continuation(self, start): + """Add backslash continuation characters if the row has increased + without encountering a newline token. + + This also inserts the correct amount of whitespace before the backslash. 
+ """ + row = start[0] + row_offset = row - self.prev_row + if row_offset == 0: + return + + line = self.prev_line.rstrip('\\\r\n') + ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) + self.tokens.append(ws + "\\\n" * row_offset) + self.prev_col = 0 + def escape_brackets(self, token): characters = [] consume_until_next_bracket = False @@ -243,8 +257,6 @@ def untokenize(self, iterable): end_line, end_col = end extra_chars = last_line.count("{{") + last_line.count("}}") end = (end_line, end_col + extra_chars) - elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): - self.tokens.append(" ") self.add_whitespace(start) self.tokens.append(token) @@ -253,6 +265,7 @@ def untokenize(self, iterable): self.prev_row += 1 self.prev_col = 0 self.prev_type = tok_type + self.prev_line = line return "".join(self.tokens) def compat(self, token, iterable): From cc2fb5edb094deb38667ac013ffd5027722a84ed Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 17:04:48 +0200 Subject: [PATCH 2/8] Add news entry --- .../next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst new file mode 100644 index 00000000000000..291c5e6f6f2181 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst @@ -0,0 +1,2 @@ +Fix round-trip invariance for backslash continuations in +:func:`tokenize.untokenize`. From ca6293543ca99d0c335ac9d55f98f31bba239e50 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 26 Oct 2024 19:43:42 +0200 Subject: [PATCH 3/8] Fix Windows --- Lib/tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 8d9716df245cd9..9ce95a62d961ba 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -193,9 +193,10 @@ def add_backslash_continuation(self, start): if row_offset == 0: return + newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n' line = self.prev_line.rstrip('\\\r\n') ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) - self.tokens.append(ws + "\\\n" * row_offset) + self.tokens.append(ws + f"\\{newline}" * row_offset) self.prev_col = 0 def escape_brackets(self, token): From a595dde6dbdfbca955e4238f58af2c52148d81dc Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 00:33:53 +0100 Subject: [PATCH 4/8] Be more lenient with test_traceback --- Lib/test/test_tokenize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 5aa3df27c022db..bc1fee59b2a4a2 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -2011,7 +2011,8 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - self.check_roundtrip(f) + compare_tokens_only = os.path.basename(testfile) == "test_traceback.py" # Ambiguous backslash continuation + self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) self.check_line_extraction(f) From 6f6a6881b660e28b809e59f875684805c92e3cbc Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:19:07 +0100 Subject: [PATCH 5/8] Check if a file can be compared exactly --- Lib/test/test_tokenize.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 
insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index bc1fee59b2a4a2..439ef23f35f420 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,4 +1,5 @@ import os +import re import token import tokenize import unittest @@ -1803,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1819,9 +1820,25 @@ def test_iter_compat(self): self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ') +def contains_ambiguous_backslash(source): + """Return `True` if the source contains a backslash on a + line by itself. For example: + + a = (1 + \\ + ) + + Code like this cannot be untokenized exactly. This is because + the tokenizer does not produce any tokens for the line containing + the backslash and so there is no way to know its indent. + """ + pattern = re.compile(br'\n\s*\\\s*\r?\n') + return pattern.search(source) is not None + + class TestRoundtrip(TestCase): - def check_roundtrip(self, f, *, compare_tokens_only=False): + def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1829,8 +1846,8 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If `compare_tokens_only` is False, the exact output of `untokenize` - is compared against the original source code. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1855,12 +1872,12 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - # Compare the exact output - if not compare_tokens_only: - readline = iter(code.splitlines(keepends=True)).__next__ - # The BOM does not produce a token so there is no way to preserve it + if not contains_ambiguous_backslash(code): + # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') - self.assertEqual(code_without_bom, tokenize.untokenize(tokenize.tokenize(readline))) + readline = iter(code_without_bom.splitlines(keepends=True)).__next__ + untokenized_code = tokenize.untokenize(tokenize.tokenize(readline)) + self.assertEqual(code_without_bom, untokenized_code) def check_line_extraction(self, f): if isinstance(f, str): @@ -2011,8 +2028,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - compare_tokens_only = os.path.basename(testfile) == "test_traceback.py" # Ambiguous backslash continuation - self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) + self.check_roundtrip(f) self.check_line_extraction(f) From 497067ac29402ae7a2a78e862913711bbcbe2810 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:28:25 +0100 Subject: [PATCH 6/8] Simplify regex --- Lib/test/test_tokenize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 439ef23f35f420..480bff743a9f8a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1832,7 +1832,7 @@ def contains_ambiguous_backslash(source): the tokenizer does not produce any tokens for the line containing the backslash and so there is no way to know its indent. """ - pattern = re.compile(br'\n\s*\\\s*\r?\n') + pattern = re.compile(br'\n\s*\\\r?\n') return pattern.search(source) is not None From 4b32c8e9efdf674291c0f501da48dd0009b0b39f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Tue, 29 Oct 2024 22:55:46 +0100 Subject: [PATCH 7/8] Use a list for ambiguous files --- Lib/test/test_tokenize.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 480bff743a9f8a..387ca4758fbb2b 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1804,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1838,7 +1838,7 @@ def contains_ambiguous_backslash(source): class TestRoundtrip(TestCase): - def check_roundtrip(self, f): + def check_roundtrip(self, f, *, compare_tokens_only=False): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1846,8 +1846,8 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If the source code can be untokenized unambiguously, the - untokenized code must match the original code exactly. + If `compare_tokens_only` is False, the exact output of `untokenize` + is compared against the original source code. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1872,7 +1872,9 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - if not contains_ambiguous_backslash(code): + if compare_tokens_only: + self.assertTrue(contains_ambiguous_backslash(code)) + else: # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') readline = iter(code_without_bom.splitlines(keepends=True)).__next__ @@ -2019,6 +2021,8 @@ def test_random_files(self): import glob, random tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) + # Known files which cannot be untokenized exactly + known_ambiguous_files = [os.path.join(tempdir, "test_traceback.py")] if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -2028,7 +2032,8 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - self.check_roundtrip(f) + compare_tokens_only = testfile in known_ambiguous_files + self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) self.check_line_extraction(f) From e2c9bb7af911250be40eda6e8e5bec1931bf508a Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Wed, 30 Oct 2024 19:04:20 +0100 Subject: [PATCH 8/8] Revert "Use a list for ambiguous files" This reverts commit eb2e6f264dc0984f72fa0eb962d23240c4ccfc5b. --- Lib/test/test_tokenize.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 387ca4758fbb2b..480bff743a9f8a 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1804,7 +1804,7 @@ def test_backslash_continuation(self): u.prev_row = 2 u.add_whitespace((4, 4)) self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' ']) - TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n', compare_tokens_only=True) + TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): u = tokenize.Untokenizer() @@ -1838,7 +1838,7 @@ def contains_ambiguous_backslash(source): class TestRoundtrip(TestCase): - def check_roundtrip(self, f, *, compare_tokens_only=False): + def check_roundtrip(self, f): """ Test roundtrip for `untokenize`. `f` is an open file or a string. The source code in f is tokenized to both 5- and 2-tuples. @@ -1846,8 +1846,8 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. - If `compare_tokens_only` is False, the exact output of `untokenize` - is compared against the original source code. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation @@ -1872,9 +1872,7 @@ def check_roundtrip(self, f, *, compare_tokens_only=False): tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) - if compare_tokens_only: - self.assertTrue(contains_ambiguous_backslash(code)) - else: + if not contains_ambiguous_backslash(code): # The BOM does not produce a token so there is no way to preserve it. 
code_without_bom = code.removeprefix(b'\xef\xbb\xbf') readline = iter(code_without_bom.splitlines(keepends=True)).__next__ @@ -2021,8 +2019,6 @@ def test_random_files(self): import glob, random tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # Known files which cannot be untokenized exactly - known_ambiguous_files = [os.path.join(tempdir, "test_traceback.py")] if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -2032,8 +2028,7 @@ def test_random_files(self): print('tokenize', testfile) with open(testfile, 'rb') as f: with self.subTest(file=testfile): - compare_tokens_only = testfile in known_ambiguous_files - self.check_roundtrip(f, compare_tokens_only=compare_tokens_only) + self.check_roundtrip(f) self.check_line_extraction(f)
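
For reference, below is a minimal standalone sketch (not part of the patch series) of the round-trip property this series fixes: with the new add_backslash_continuation step, untokenize reproduces an indented backslash continuation byte-for-byte, whereas previously the whitespace before the backslash was dropped. The sample source string is made up for illustration, and the assertion only holds on a build that includes this fix.

    import io
    import tokenize

    # Made-up sample: a continuation backslash preceded by a space.
    source = b"x = 1 + \\\n    2\n"

    # tokenize() yields 5-tuples starting with an ENCODING token, so
    # untokenize() returns bytes encoded with that encoding.
    tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
    roundtripped = tokenize.untokenize(tokens)

    # With the fix, the exact bytes (minus any BOM) come back; without it,
    # the space before the backslash was lost ("x = 1 +\\\n    2\n").
    assert roundtripped == source, (roundtripped, source)

A backslash on a line of its own (the contains_ambiguous_backslash case in the tests) still cannot be reproduced exactly, since the tokenizer emits no token for that line and its indentation is therefore unknown.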