From e52f328e5920258060adec0337ffca77937e6b14 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Thu, 14 Jul 2022 11:32:28 -0400
Subject: [PATCH 1/4] gh-94823: Improve coverage in tokenizer.c:valid_utf8

When loading a source file from disk, the tokenizer runs its own UTF-8
validator (`valid_utf8` in Parser/tokenizer.c), separate from the decoder in
`unicode_decode_utf8`. This change exercises that code path with the same set
of invalid inputs used to test `unicode_decode_utf8`
(test_utf8_decode_invalid_sequences in test_unicode.py).
---
 Lib/test/test_source_encoding.py | 58 ++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index a0375fda0d3656..e74256fdf4cfde 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -224,6 +224,64 @@ def test_crcrcrlf2(self):
         out = self.check_script_output(src, br"'\n\n\n'")
 
 
+class UTF8ValidatorTest(unittest.TestCase):
+    @unittest.skipIf(sys.platform.startswith("win"),
+                     "Times out on Windows")
+    def test_invalid_utf8(self):
+        # This is port of test_utf8_decode_invalid_sequences in test_unicode.py
+        # to exercise the separate utf8 validator in tokenize.c used when
+        # reading source files.
+
+        # Each example is put inside a string at the top of the file so
+        # it's an otherwise valid Python source file.
+        template = b'"%s"\n'
+
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, 'test.py')
+
+            def check(content):
+                with open(fn, 'wb') as fp:
+                    fp.write(template % content)
+                script_helper.assert_python_failure(fn)
+
+            # continuation bytes in a sequence of 2, 3, or 4 bytes
+            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)]
+            # start bytes of a 2-byte sequence equivalent to code points < 0x7F
+            invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
+            # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
+            invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
+            invalid_start_bytes = (
+                continuation_bytes + invalid_2B_seq_start_bytes +
+                invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
+            )
+
+            for byte in invalid_start_bytes:
+                check(byte)
+
+            for sb in invalid_2B_seq_start_bytes:
+                for cb in continuation_bytes:
+                    check(sb + cb)
+
+            for sb in invalid_4B_seq_start_bytes:
+                for cb1 in continuation_bytes[:3]:
+                    for cb3 in continuation_bytes[:3]:
+                        check(sb+cb1+b'\x80'+cb3)
+
+            for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]:
+                check(b'\xE0'+cb+b'\x80')
+                check(b'\xE0'+cb+b'\xBF')
+                # surrogates
+            for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]:
+                check(b'\xED'+cb+b'\x80')
+                check(b'\xED'+cb+b'\xBF')
+            for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]:
+                check(b'\xF0'+cb+b'\x80\x80')
+                check(b'\xF0'+cb+b'\xBF\xBF')
+            for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]:
+                check(b'\xF4'+cb+b'\x80\x80')
+                check(b'\xF4'+cb+b'\xBF\xBF')
+
+
 class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
 
     def check_script_output(self, src, expected):

From 61cf58fbeab4848e3523fefbbf84cd74e6456ad8 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Wed, 20 Jul 2022 11:41:35 -0400
Subject: [PATCH 2/4] Run all examples, since we are excluding Windows
 anyway

---
 Lib/test/test_source_encoding.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index e74256fdf4cfde..80f22a8790b9d4 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -226,7 +226,7 @@ def test_crcrcrlf2(self):
 
 class UTF8ValidatorTest(unittest.TestCase):
     @unittest.skipIf(sys.platform.startswith("win"),
-                     "Times out on Windows")
+                     "Times out on Windows due to file I/O")
     def test_invalid_utf8(self):
         # This is port of test_utf8_decode_invalid_sequences in test_unicode.py
         # to exercise the separate utf8 validator in tokenize.c used when
@@ -245,7 +245,7 @@ def check(content):
                 script_helper.assert_python_failure(fn)
 
             # continuation bytes in a sequence of 2, 3, or 4 bytes
-            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)]
+            continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
             # start bytes of a 2-byte sequence equivalent to code points < 0x7F
             invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
             # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
@@ -267,17 +267,17 @@ def check(content):
                     for cb3 in continuation_bytes[:3]:
                         check(sb+cb1+b'\x80'+cb3)
 
-            for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]:
+            for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
                 check(b'\xE0'+cb+b'\x80')
                 check(b'\xE0'+cb+b'\xBF')
                 # surrogates
-            for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]:
+            for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
                 check(b'\xED'+cb+b'\x80')
                 check(b'\xED'+cb+b'\xBF')
-            for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]:
+            for cb in [bytes([x]) for x in range(0x80, 0x90)]:
                 check(b'\xF0'+cb+b'\x80\x80')
                 check(b'\xF0'+cb+b'\xBF\xBF')
-            for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]:
+            for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
                 check(b'\xF4'+cb+b'\x80\x80')
                 check(b'\xF4'+cb+b'\xBF\xBF')
 

From 1f65938047f389b36d262acb024efad6480e16c4 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Fri, 12 Aug 2022 08:12:21 -0400
Subject: [PATCH 3/4] Only run test on Linux

---
 Lib/test/test_source_encoding.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 80f22a8790b9d4..06eb9eef467d2f 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -225,13 +225,16 @@ def test_crcrcrlf2(self):
 
 
 class UTF8ValidatorTest(unittest.TestCase):
-    @unittest.skipIf(sys.platform.startswith("win"),
-                     "Times out on Windows due to file I/O")
+    @unittest.skipIf(not sys.platform.startswith("linux"),
+                     "Too slow to run on non-Linux platforms")
     def test_invalid_utf8(self):
         # This is port of test_utf8_decode_invalid_sequences in test_unicode.py
         # to exercise the separate utf8 validator in tokenize.c used when
         # reading source files.
 
+        # That file is written using low-level C file I/O, so the only way to
+        # test it is to write actual files to disk.
+
         # Each example is put inside a string at the top of the file so
         # it's an otherwise valid Python source file.
         template = b'"%s"\n'

From 1f90863e6a515b3defd3a0aa47c00bf995f12e60 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Tue, 16 Aug 2022 10:14:31 -0400
Subject: [PATCH 4/4] Fix typos in comment

---
 Lib/test/test_source_encoding.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 06eb9eef467d2f..e1b0de2adef621 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -228,9 +228,9 @@ class UTF8ValidatorTest(unittest.TestCase):
     @unittest.skipIf(not sys.platform.startswith("linux"),
                      "Too slow to run on non-Linux platforms")
     def test_invalid_utf8(self):
-        # This is port of test_utf8_decode_invalid_sequences in test_unicode.py
-        # to exercise the separate utf8 validator in tokenize.c used when
-        # reading source files.
+        # This is a port of test_utf8_decode_invalid_sequences in
+        # test_unicode.py to exercise the separate utf8 validator in
+        # Parser/tokenizer.c used when reading source files.
 
         # That file is written using low-level C file I/O, so the only way to
         # test it is to write actual files to disk.