From e52f328e5920258060adec0337ffca77937e6b14 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 14 Jul 2022 11:32:28 -0400 Subject: [PATCH 1/4] gh-94823: Improve coverage in tokenizer.c:valid_utf8 When loading a source file from disk, there is a separate UTF-8 validator distinct from the one in `unicode_decode_utf8`. This exercises that code path with the same set of invalid inputs as we use for testing the "other" UTF-8 decoder. --- Lib/test/test_source_encoding.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a0375fda0d3656..e74256fdf4cfde 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -224,6 +224,64 @@ def test_crcrcrlf2(self): out = self.check_script_output(src, br"'\n\n\n'") +class UTF8ValidatorTest(unittest.TestCase): + @unittest.skipIf(sys.platform.startswith("win"), + "Times out on Windows") + def test_invalid_utf8(self): + # This is port of test_utf8_decode_invalid_sequences in test_unicode.py + # to exercise the separate utf8 validator in tokenize.c used when + # reading source files. + + # Each example is put inside a string at the top of the file so + # it's an otherwise valid Python source file. + template = b'"%s"\n' + + with tempfile.TemporaryDirectory() as tmpd: + fn = os.path.join(tmpd, 'test.py') + + def check(content): + with open(fn, 'wb') as fp: + fp.write(template % content) + script_helper.assert_python_failure(fn) + + # continuation bytes in a sequence of 2, 3, or 4 bytes + continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)] + # start bytes of a 2-byte sequence equivalent to code points < 0x7F + invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] + # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF + invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)] + invalid_start_bytes = ( + continuation_bytes + invalid_2B_seq_start_bytes + + invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)] + ) + + for byte in invalid_start_bytes: + check(byte) + + for sb in invalid_2B_seq_start_bytes: + for cb in continuation_bytes: + check(sb + cb) + + for sb in invalid_4B_seq_start_bytes: + for cb1 in continuation_bytes[:3]: + for cb3 in continuation_bytes[:3]: + check(sb+cb1+b'\x80'+cb3) + + for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]: + check(b'\xE0'+cb+b'\x80') + check(b'\xE0'+cb+b'\xBF') + # surrogates + for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]: + check(b'\xED'+cb+b'\x80') + check(b'\xED'+cb+b'\xBF') + for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]: + check(b'\xF0'+cb+b'\x80\x80') + check(b'\xF0'+cb+b'\xBF\xBF') + for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]: + check(b'\xF4'+cb+b'\x80\x80') + check(b'\xF4'+cb+b'\xBF\xBF') + + class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): def check_script_output(self, src, expected): From 61cf58fbeab4848e3523fefbbf84cd74e6456ad8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 20 Jul 2022 11:41:35 -0400 Subject: [PATCH 2/4] Perform all examples, since we are excluding Windows anyway --- Lib/test/test_source_encoding.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index e74256fdf4cfde..80f22a8790b9d4 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -226,7 +226,7 @@ def test_crcrcrlf2(self): class UTF8ValidatorTest(unittest.TestCase): @unittest.skipIf(sys.platform.startswith("win"), - "Times out on Windows") + "Times out on Windows due to file I/O") def test_invalid_utf8(self): # This is port of test_utf8_decode_invalid_sequences in test_unicode.py # to exercise the separate utf8 validator in tokenize.c used when @@ -245,7 +245,7 @@ def check(content): script_helper.assert_python_failure(fn) # continuation bytes in a sequence of 2, 3, or 4 bytes - continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0, 7)] + continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)] # start bytes of a 2-byte sequence equivalent to code points < 0x7F invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)] # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF @@ -267,17 +267,17 @@ def check(content): for cb3 in continuation_bytes[:3]: check(sb+cb1+b'\x80'+cb3) - for cb in [bytes([x]) for x in range(0x80, 0xA0, 5)]: + for cb in [bytes([x]) for x in range(0x80, 0xA0)]: check(b'\xE0'+cb+b'\x80') check(b'\xE0'+cb+b'\xBF') # surrogates - for cb in [bytes([x]) for x in range(0xA0, 0xC0, 5)]: + for cb in [bytes([x]) for x in range(0xA0, 0xC0)]: check(b'\xED'+cb+b'\x80') check(b'\xED'+cb+b'\xBF') - for cb in [bytes([x]) for x in range(0x80, 0x90, 5)]: + for cb in [bytes([x]) for x in range(0x80, 0x90)]: check(b'\xF0'+cb+b'\x80\x80') check(b'\xF0'+cb+b'\xBF\xBF') - for cb in [bytes([x]) for x in range(0x90, 0xC0, 5)]: + for cb in [bytes([x]) for x in range(0x90, 0xC0)]: check(b'\xF4'+cb+b'\x80\x80') check(b'\xF4'+cb+b'\xBF\xBF') From 1f65938047f389b36d262acb024efad6480e16c4 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 12 Aug 2022 08:12:21 -0400 Subject: [PATCH 3/4] Only run test on Linux --- Lib/test/test_source_encoding.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 80f22a8790b9d4..06eb9eef467d2f 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -225,13 +225,16 @@ def test_crcrcrlf2(self): class UTF8ValidatorTest(unittest.TestCase): - @unittest.skipIf(sys.platform.startswith("win"), - "Times out on Windows due to file I/O") + @unittest.skipIf(not sys.platform.startswith("linux"), + "Too slow to run on non-Linux platforms") def test_invalid_utf8(self): # This is port of test_utf8_decode_invalid_sequences in test_unicode.py # to exercise the separate utf8 validator in tokenize.c used when # reading source files. + # That file is written using low-level C file I/O, so the only way to + # test it is to write actual files to disk. + # Each example is put inside a string at the top of the file so # it's an otherwise valid Python source file. template = b'"%s"\n' From 1f90863e6a515b3defd3a0aa47c00bf995f12e60 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 16 Aug 2022 10:14:31 -0400 Subject: [PATCH 4/4] Fix typos in comment --- Lib/test/test_source_encoding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 06eb9eef467d2f..e1b0de2adef621 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -228,9 +228,9 @@ class UTF8ValidatorTest(unittest.TestCase): @unittest.skipIf(not sys.platform.startswith("linux"), "Too slow to run on non-Linux platforms") def test_invalid_utf8(self): - # This is port of test_utf8_decode_invalid_sequences in test_unicode.py - # to exercise the separate utf8 validator in tokenize.c used when - # reading source files. + # This is a port of test_utf8_decode_invalid_sequences in + # test_unicode.py to exercise the separate utf8 validator in + # Parser/tokenizer.c used when reading source files. # That file is written using low-level C file I/O, so the only way to # test it is to write actual files to disk.