From 0e11e20ddd48903736348648dc6531f64187a87d Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 12 Jul 2022 08:07:48 +0200 Subject: [PATCH 1/3] update test_re from cpython3.10 --- Lib/test/test_re.py | 212 +++++++++++++++++++++++++++++++++----------- 1 file changed, 159 insertions(+), 53 deletions(-) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 77d497b74c..62bfc3a7aa 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1,5 +1,6 @@ from test.support import (gc_collect, bigmemtest, _2G, - cpython_only, captured_stdout) + cpython_only, captured_stdout, + check_disallow_instantiation) import locale import re import sre_compile @@ -53,8 +54,6 @@ def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): if pos is not None: self.assertEqual(err.pos, pos) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_keep_buffer(self): # See bug 14212 b = bytearray(b'x') @@ -219,6 +218,16 @@ def test_symbolic_groups(self): re.compile(r'(?Px)(?P=a)(?(a)y)') re.compile(r'(?Px)(?P=a1)(?(a1)y)') re.compile(r'(?Px)\1(?(1)y)') + re.compile(b'(?Px)(?P=a1)(?(a1)y)') + # New valid identifiers in Python 3 + re.compile('(?P<ยต>x)(?P=ยต)(?(ยต)y)') + re.compile('(?P<๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข>x)(?P=๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข)(?(๐”˜๐”ซ๐”ฆ๐” ๐”ฌ๐”ก๐”ข)y)') + # Support > 100 groups. 
+ pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) + pat = '(?:%s)(?(200)z|t)' % pat + self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) + + def test_symbolic_groups_errors(self): self.checkPatternError(r'(?P)(?P)', "redefinition of group name 'a' as group 2; " "was group 1") @@ -244,16 +253,22 @@ def test_symbolic_groups(self): self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3) self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3) self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3) - # New valid/invalid identifiers in Python 3 - re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') - re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)') self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) + self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) + self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) + + def test_symbolic_refs(self): + self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') + self.assertEqual(re.sub('(?Px)|(?Py)', r'\2', 'xx'), '') + self.assertEqual(re.sub(b'(?Px)', br'\g', b'xx'), b'xx') + # New valid identifiers in Python 3 + self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') + self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') # Support > 100 groups. 
+ pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) - pat = '(?:%s)(?(200)z|t)' % pat - self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) + self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') - def test_symbolic_refs(self): + def test_symbolic_refs_errors(self): self.checkTemplateError('(?Px)', r'\g, unterminated name', 3) self.checkTemplateError('(?Px)', r'\g<', 'xx', @@ -271,18 +286,14 @@ def test_symbolic_refs(self): 'invalid group reference 2', 1) with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"): re.sub('(?Px)', r'\g', 'xx') - self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') - self.assertEqual(re.sub('(?Px)|(?Py)', r'\2', 'xx'), '') self.checkTemplateError('(?Px)', r'\g<-1>', 'xx', "bad character in group name '-1'", 3) - # New valid/invalid identifiers in Python 3 - self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') - self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') self.checkTemplateError('(?Px)', r'\g<©>', 'xx', "bad character in group name '©'", 3) - # Support > 100 groups. 
- pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) - self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') + self.checkTemplateError('(?Px)', r'\g<㊀>', 'xx', + "bad character in group name '㊀'", 3) + self.checkTemplateError('(?Px)', r'\g<¹>', 'xx', + "bad character in group name '¹'", 3) def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -544,12 +555,28 @@ def test_re_groupref_exists(self): pat = '(?:%s)(?(200)z)' % pat self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) - self.checkPatternError(r'(?P)(?(0))', 'bad group number', 10) + def test_re_groupref_exists_errors(self): + self.checkPatternError(r'(?P)(?(0)a|b)', 'bad group number', 10) + self.checkPatternError(r'()(?(-1)a|b)', + "bad character in group name '-1'", 5) + self.checkPatternError(r'()(?(㊀)a|b)', + "bad character in group name '㊀'", 5) + self.checkPatternError(r'()(?(¹)a|b)', + "bad character in group name '¹'", 5) + self.checkPatternError(r'()(?(1', + "missing ), unterminated name", 5) + self.checkPatternError(r'()(?(1)a', + "missing ), unterminated subpattern", 2) self.checkPatternError(r'()(?(1)a|b', 'missing ), unterminated subpattern', 2) + self.checkPatternError(r'()(?(1)a|b|c', + 'conditional backref with more than ' + 'two branches', 10) self.checkPatternError(r'()(?(1)a|b|c)', 'conditional backref with more than ' 'two branches', 10) + self.checkPatternError(r'()(?(2)a)', + "invalid group reference 2", 5) def test_re_groupref_overflow(self): from sre_constants import MAXGROUPS @@ -623,8 +650,6 @@ def test_repeat_minmax(self): self.checkPatternError(r'x{2,1}', 'min repeat greater than max repeat', 2) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_getattr(self): self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) @@ -698,8 +723,6 @@ def test_other_escapes(self): with self.subTest(c): self.assertRaises(re.error, re.compile, 
'[\\%c]' % c) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_named_unicode_escapes(self): # test individual Unicode named escapes self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) @@ -733,6 +756,10 @@ def test_named_unicode_escapes(self): "undefined character name 'SPAM'", 0) self.checkPatternError(r'[\N{SPAM}]', "undefined character name 'SPAM'", 1) + self.checkPatternError(r'\N{KEYCAP NUMBER SIGN}', + "undefined character name 'KEYCAP NUMBER SIGN'", 0) + self.checkPatternError(r'[\N{KEYCAP NUMBER SIGN}]', + "undefined character name 'KEYCAP NUMBER SIGN'", 1) self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0) self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1) @@ -762,8 +789,6 @@ def test_string_boundaries(self): # Can match around the whitespace. self.assertEqual(len(re.findall(r"\B", " ")), 2) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bigcharset(self): self.assertEqual(re.match("([\u2222\u2223])", "\u2222").group(1), "\u2222") @@ -848,16 +873,30 @@ def test_ignore_case(self): self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") - assert '\u212a'.lower() == 'k' # 'K' + # Two different characters have the same lowercase. + assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' self.assertTrue(re.match(r'K', '\u212a', re.I)) self.assertTrue(re.match(r'k', '\u212a', re.I)) self.assertTrue(re.match(r'\u212a', 'K', re.I)) self.assertTrue(re.match(r'\u212a', 'k', re.I)) - assert '\u017f'.upper() == 'S' # 'ſ' + + # Two different characters have the same uppercase. + assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' self.assertTrue(re.match(r'S', '\u017f', re.I)) self.assertTrue(re.match(r's', '\u017f', re.I)) self.assertTrue(re.match(r'\u017f', 'S', re.I)) self.assertTrue(re.match(r'\u017f', 's', re.I)) + + # Two different characters have the same uppercase. Unicode 9.0+. 
+ assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' + self.assertTrue(re.match(r'\u0412', '\u0432', re.I)) + self.assertTrue(re.match(r'\u0412', '\u1c80', re.I)) + self.assertTrue(re.match(r'\u0432', '\u0412', re.I)) + self.assertTrue(re.match(r'\u0432', '\u1c80', re.I)) + self.assertTrue(re.match(r'\u1c80', '\u0412', re.I)) + self.assertTrue(re.match(r'\u1c80', '\u0432', re.I)) + + # Two different characters have the same multicharacter uppercase. assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ﬅ', 'ﬆ' self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I)) @@ -871,16 +910,31 @@ def test_ignore_case_set(self): self.assertTrue(re.match(br'[19a]', b'a', re.I)) self.assertTrue(re.match(br'[19a]', b'A', re.I)) self.assertTrue(re.match(br'[19A]', b'a', re.I)) - assert '\u212a'.lower() == 'k' # 'K' + + # Two different characters have the same lowercase. + assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' self.assertTrue(re.match(r'[19K]', '\u212a', re.I)) self.assertTrue(re.match(r'[19k]', '\u212a', re.I)) self.assertTrue(re.match(r'[19\u212a]', 'K', re.I)) self.assertTrue(re.match(r'[19\u212a]', 'k', re.I)) - assert '\u017f'.upper() == 'S' # 'ſ' + + # Two different characters have the same uppercase. + assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' self.assertTrue(re.match(r'[19S]', '\u017f', re.I)) self.assertTrue(re.match(r'[19s]', '\u017f', re.I)) self.assertTrue(re.match(r'[19\u017f]', 'S', re.I)) self.assertTrue(re.match(r'[19\u017f]', 's', re.I)) + + # Two different characters have the same uppercase. Unicode 9.0+. 
+ assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' + self.assertTrue(re.match(r'[19\u0412]', '\u0432', re.I)) + self.assertTrue(re.match(r'[19\u0412]', '\u1c80', re.I)) + self.assertTrue(re.match(r'[19\u0432]', '\u0412', re.I)) + self.assertTrue(re.match(r'[19\u0432]', '\u1c80', re.I)) + self.assertTrue(re.match(r'[19\u1c80]', '\u0412', re.I)) + self.assertTrue(re.match(r'[19\u1c80]', '\u0432', re.I)) + + # Two different characters have the same multicharacter uppercase. assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ﬅ', 'ﬆ' self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) @@ -904,16 +958,30 @@ def test_ignore_case_range(self): self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I)) self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I)) - assert '\u212a'.lower() == 'k' # 'K' + # Two different characters have the same lowercase. + assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' self.assertTrue(re.match(r'[J-M]', '\u212a', re.I)) self.assertTrue(re.match(r'[j-m]', '\u212a', re.I)) self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I)) self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I)) - assert '\u017f'.upper() == 'S' # 'ſ' + + # Two different characters have the same uppercase. + assert 's'.upper() == '\u017f'.upper() == 'S' # 'ſ' self.assertTrue(re.match(r'[R-T]', '\u017f', re.I)) self.assertTrue(re.match(r'[r-t]', '\u017f', re.I)) self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I)) self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I)) + + # Two different characters have the same uppercase. Unicode 9.0+. 
+ assert '\u0432'.upper() == '\u1c80'.upper() == '\u0412' # 'в', 'ᲀ', 'В' + self.assertTrue(re.match(r'[\u0411-\u0413]', '\u0432', re.I)) + self.assertTrue(re.match(r'[\u0411-\u0413]', '\u1c80', re.I)) + self.assertTrue(re.match(r'[\u0431-\u0433]', '\u0412', re.I)) + self.assertTrue(re.match(r'[\u0431-\u0433]', '\u1c80', re.I)) + self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0412', re.I)) + self.assertTrue(re.match(r'[\u1c80-\u1c82]', '\u0432', re.I)) + + # Two different characters have the same multicharacter uppercase. assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ﬅ', 'ﬆ' self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I)) self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I)) @@ -921,6 +989,7 @@ def test_ignore_case_range(self): def test_category(self): self.assertEqual(re.match(r"(\s)", " ").group(1), " ") + @cpython_only def test_case_helpers(self): import _sre for i in range(128): @@ -1068,8 +1137,6 @@ def test_pickling(self): # current pickle expects the _compile() reconstructor in re module from re import _compile - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_copying(self): import copy p = re.compile(r'(?P\d+)(?:\.(?P\d*))?') @@ -1375,8 +1442,6 @@ def test_bug_817234(self): self.assertEqual(next(iter).span(), (4, 4)) self.assertRaises(StopIteration, next, iter) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_6561(self): # '\d' should match characters in Unicode category 'Nd' # (Number, Decimal Digit), but not those in 'Nl' (Number, @@ -1454,7 +1519,8 @@ def test_inline_flags(self): self.assertTrue(re.match(p, lower_char)) self.assertEqual( str(warns.warnings[0].message), - 'Flags not at the start of the expression %r' % p + 'Flags not at the start of the expression %r' + ' but at position 1' % p ) self.assertEqual(warns.warnings[0].filename, __file__) @@ -1463,7 +1529,8 @@ def test_inline_flags(self): self.assertTrue(re.match(p, lower_char)) self.assertEqual( str(warns.warnings[0].message), - 
'Flags not at the start of the expression %r (truncated)' % p[:20] + 'Flags not at the start of the expression %r (truncated)' + ' but at position 1' % p[:20] ) self.assertEqual(warns.warnings[0].filename, __file__) @@ -1475,7 +1542,8 @@ def test_inline_flags(self): self.assertTrue(re.match(p, b'a')) self.assertEqual( str(warns.warnings[0].message), - 'Flags not at the start of the expression %r' % p + 'Flags not at the start of the expression %r' + ' but at position 1' % p ) self.assertEqual(warns.warnings[0].filename, __file__) @@ -1615,11 +1683,6 @@ def test_scoped_flags(self): self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab')) self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB')) - self.assertTrue(re.match(r'(?x: a) b', 'a b')) - self.assertIsNone(re.match(r'(?x: a) b', ' a b')) - self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) - self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) - self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) @@ -1645,6 +1708,33 @@ def test_scoped_flags(self): self.checkPatternError(r'(?i+', 'missing -, : or )', 3) self.checkPatternError(r'(?iz', 'unknown flag', 3) + def test_ignore_spaces(self): + for space in " \t\n\r\v\f": + self.assertTrue(re.fullmatch(space + 'a', 'a', re.VERBOSE)) + for space in b" ", b"\t", b"\n", b"\r", b"\v", b"\f": + self.assertTrue(re.fullmatch(space + b'a', b'a', re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x) a', 'a')) + self.assertTrue(re.fullmatch(' (?x) a', 'a', re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x) (?x) a', 'a')) + self.assertTrue(re.fullmatch(' a(?x: b) c', ' ab c')) + self.assertTrue(re.fullmatch(' a(?-x: b) c', 'a bc', re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x) a(?-x: b) c', 'a bc')) + self.assertTrue(re.fullmatch('(?x) a| b', 'a')) + self.assertTrue(re.fullmatch('(?x) a| b', 'b')) + + def test_comments(self): + 
self.assertTrue(re.fullmatch('#x\na', 'a', re.VERBOSE)) + self.assertTrue(re.fullmatch(b'#x\na', b'a', re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x)#x\na', 'a')) + self.assertTrue(re.fullmatch('#x\n(?x)#y\na', 'a', re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x)#x\n(?x)#y\na', 'a')) + self.assertTrue(re.fullmatch('#x\na(?x:#y\nb)#z\nc', '#x\nab#z\nc')) + self.assertTrue(re.fullmatch('#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc', + re.VERBOSE)) + self.assertTrue(re.fullmatch('(?x)#x\na(?-x:#y\nb)#z\nc', 'a#y\nbc')) + self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'a')) + self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'b')) + def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1663,10 +1753,6 @@ def test_bug_6509(self): pat = re.compile(b'..') self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') - # RUSTPYTHON: here in rustpython, we borrow the string only at the - # time of matching, so we will not check the string type when creating - # SRE_Scanner, expect this, other tests has passed - @cpython_only def test_dealloc(self): # issue 3299: check for segfault in debug build import _sre @@ -1738,6 +1824,7 @@ def test_repeat_minmax_overflow(self): self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% 2**128) self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) + @cpython_only def test_repeat_minmax_overflow_maxrepeat(self): try: from _sre import MAXREPEAT @@ -1772,8 +1859,6 @@ def test_issue17998(self): self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), [b'xyz'], msg=pattern) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_match_repr(self): for string in '[abracadabra]', S('[abracadabra]'): m = re.search(r'(.+)(.*?)\1', string) @@ -1820,8 +1905,6 @@ def test_zerowidth(self): self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")], [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_2537(self): # issue 2537: empty submatches for outer_op in ('{0,}', '*', '+', '{1,187}'): @@ -1832,6 +1915,7 @@ def test_bug_2537(self): self.assertEqual(m.group(1), "") self.assertEqual(m.group(2), "y") + @cpython_only def test_debug_flag(self): pat = r'(\.)(?:[ch]|py)(?(1)$|: )' with captured_stdout() as out: @@ -2153,8 +2237,6 @@ def test_inline_flags(self): self.check('(?i)pattern', "re.compile('(?i)pattern', re.IGNORECASE)") - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_unknown_flags(self): self.check_flags('random pattern', 0x123000, "re.compile('random pattern', 0x123000)") @@ -2207,6 +2289,18 @@ class ImplementationTest(unittest.TestCase): Test implementation details of the re module. 
""" + @cpython_only + def test_immutable(self): + # bpo-43908: check that re types are immutable + with self.assertRaises(TypeError): + re.Match.foo = 1 + with self.assertRaises(TypeError): + re.Pattern.foo = 1 + with self.assertRaises(TypeError): + pat = re.compile("") + tp = type(pat.scanner("")) + tp.foo = 1 + def test_overlap_table(self): f = sre_compile._generate_overlap_table self.assertEqual(f(""), []) @@ -2216,6 +2310,18 @@ def test_overlap_table(self): self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1]) self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) + def test_signedness(self): + self.assertGreaterEqual(sre_compile.MAXREPEAT, 0) + self.assertGreaterEqual(sre_compile.MAXGROUPS, 0) + + @cpython_only + def test_disallow_instantiation(self): + # Ensure that the type disallows instantiation (bpo-43916) + check_disallow_instantiation(self, re.Match) + check_disallow_instantiation(self, re.Pattern) + pat = re.compile("") + check_disallow_instantiation(self, type(pat.scanner(""))) + class ExternalTests(unittest.TestCase): @@ -2236,7 +2342,7 @@ def test_re_benchmarks(self): def test_re_tests(self): 're_tests test suite' - from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR + from test.re_tests import tests, FAIL, SYNTAX_ERROR for t in tests: pattern = s = outcome = repl = expected = None if len(t) == 5: From d7842d1e8e4acca1107cd74895999ff522604f22 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 25 Jul 2022 22:07:26 +0200 Subject: [PATCH 2/3] pass htmlparser tests --- Cargo.lock | 3 +-- Lib/test/test_htmlparser.py | 24 ---------------------- Lib/test/test_re.py | 33 +++++++++++++++++++++++++++++++ extra_tests/snippets/stdlib_re.py | 5 ++++- vm/Cargo.toml | 3 ++- 5 files changed, 40 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 59f7cdd512..ecca468c98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2067,8 +2067,7 @@ dependencies = [ [[package]] name = "sre-engine" version = "0.1.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5872399287c284fed4bc773cb7f6041623ac88213774f5e11e89e2131681fc1" +source = "git+https://github.com/qingshi163/sre-engine?branch=refactor#82675ae5bd541139163c33312f84e421dddcdd1e" dependencies = [ "bitflags", "num_enum", diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 5bfbefcd17..4d79f367cc 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -112,8 +112,6 @@ def test_processing_instruction_only(self): ("pi", "processing instruction ?"), ]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_simple_html(self): self._run_check(""" @@ -258,8 +256,6 @@ def test_startendtag(self): ("endtag", "p"), ]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_get_starttag_text(self): s = """""" self._run_check_extra(s, [ @@ -345,8 +341,6 @@ def test_condcoms(self): ('comment', '[if lte IE 7]>pretty?", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) html = (' confuses the parser')] self._run_check(html, expected) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_correct_detection_of_start_tags(self): # see #13273 html = ('
The rain ' @@ -618,8 +608,6 @@ def test_convert_charrefs_dropped_text(self): class AttributesTestCase(TestCaseBase): - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_attr_syntax(self): output = [ ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)]) @@ -629,8 +617,6 @@ def test_attr_syntax(self): self._run_check("""""", output) self._run_check("""""", output) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_attr_values(self): self._run_check("""""", [("starttag", "a", [("b", "xxx\n\txxx"), @@ -646,8 +632,6 @@ def test_attr_values(self): "", [("starttag", "a", [("href", "mailto:xyz@example.com")])]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_attr_nonascii(self): # see issue 7311 self._run_check( @@ -668,8 +652,6 @@ def test_attr_entity_replacement(self): "", [("starttag", "a", [("b", "&><\"'")])]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_attr_funky_names(self): self._run_check( "", @@ -718,8 +700,6 @@ def test_malformed_attributes(self): ] self._run_check(html, expected) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_malformed_adjacent_attributes(self): # see #12629 self._run_check('', @@ -732,8 +712,6 @@ def test_malformed_adjacent_attributes(self): ('endtag', 'x')]) # see #755670 for the following 3 tests - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_adjacent_attributes(self): self._run_check('', [("starttag", "a", @@ -759,8 +737,6 @@ def test_end_tag_in_attribute_value(self): [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_with_unquoted_attributes(self): # see #12008 html = ("" diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 62bfc3a7aa..03cb8172de 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -54,6 +54,8 @@ def checkTemplateError(self, pattern, repl, string, errmsg, pos=None): if pos is not None: self.assertEqual(err.pos, pos) + # TODO: RUSTPYTHON + 
@unittest.expectedFailure def test_keep_buffer(self): # See bug 14212 b = bytearray(b'x') @@ -555,6 +557,8 @@ def test_re_groupref_exists(self): pat = '(?:%s)(?(200)z)' % pat self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_re_groupref_exists_errors(self): self.checkPatternError(r'(?P)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'()(?(-1)a|b)', @@ -650,6 +654,8 @@ def test_repeat_minmax(self): self.checkPatternError(r'x{2,1}', 'min repeat greater than max repeat', 2) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_getattr(self): self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)") self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U) @@ -723,6 +729,8 @@ def test_other_escapes(self): with self.subTest(c): self.assertRaises(re.error, re.compile, '[\\%c]' % c) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_named_unicode_escapes(self): # test individual Unicode named escapes self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) @@ -789,6 +797,8 @@ def test_string_boundaries(self): # Can match around the whitespace. 
self.assertEqual(len(re.findall(r"\B", " ")), 2) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bigcharset(self): self.assertEqual(re.match("([\u2222\u2223])", "\u2222").group(1), "\u2222") @@ -861,6 +871,8 @@ def test_lookbehind(self): self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_ignore_case(self): self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") @@ -901,6 +913,8 @@ def test_ignore_case(self): self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I)) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_ignore_case_set(self): self.assertTrue(re.match(r'[19A]', 'A', re.I)) self.assertTrue(re.match(r'[19a]', 'a', re.I)) @@ -939,6 +953,8 @@ def test_ignore_case_set(self): self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_ignore_case_range(self): # Issues #3511, #17381. 
self.assertTrue(re.match(r'[9-a]', '_', re.I)) @@ -1137,6 +1153,8 @@ def test_pickling(self): # current pickle expects the _compile() reconstructor in re module from re import _compile + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_copying(self): import copy p = re.compile(r'(?P\d+)(?:\.(?P\d*))?') @@ -1442,6 +1460,8 @@ def test_bug_817234(self): self.assertEqual(next(iter).span(), (4, 4)) self.assertRaises(StopIteration, next, iter) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bug_6561(self): # '\d' should match characters in Unicode category 'Nd' # (Number, Decimal Digit), but not those in 'Nl' (Number, @@ -1471,6 +1491,8 @@ def test_empty_array(self): self.assertIsNone(re.compile(b"bla").match(a)) self.assertEqual(re.compile(b"").match(a).groups(), ()) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_inline_flags(self): # Bug #1700 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below @@ -1753,6 +1775,10 @@ def test_bug_6509(self): pat = re.compile(b'..') self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') + # RUSTPYTHON: here in rustpython, we borrow the string only at the + # time of matching, so we will not check the string type when creating + # SRE_Scanner, expect this, other tests has passed + @cpython_only def test_dealloc(self): # issue 3299: check for segfault in debug build import _sre @@ -1859,6 +1885,8 @@ def test_issue17998(self): self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'), [b'xyz'], msg=pattern) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_match_repr(self): for string in '[abracadabra]', S('[abracadabra]'): m = re.search(r'(.+)(.*?)\1', string) @@ -1905,6 +1933,9 @@ def test_zerowidth(self): self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")], [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]) + # TODO: RUSTPYTHON + # @unittest.expectedFailure + @unittest.skip("") def test_bug_2537(self): # issue 2537: empty submatches for outer_op in ('{0,}', '*', '+', 
'{1,187}'): @@ -2237,6 +2268,8 @@ def test_inline_flags(self): self.check('(?i)pattern', "re.compile('(?i)pattern', re.IGNORECASE)") + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_unknown_flags(self): self.check_flags('random pattern', 0x123000, "re.compile('random pattern', 0x123000)") diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py index 45a505fcf7..17ecdba7f6 100644 --- a/extra_tests/snippets/stdlib_re.py +++ b/extra_tests/snippets/stdlib_re.py @@ -67,4 +67,7 @@ urlpattern = re.compile('//([^/#?]*)(.*)', re.DOTALL) url = '//www.example.org:80/foo/bar/baz.html' -assert urlpattern.match(url).group(1) == 'www.example.org:80' \ No newline at end of file +assert urlpattern.match(url).group(1) == 'www.example.org:80' + +assert re.compile('(?:\w+(?:\s|/(?!>))*)*').match('a /bb />ccc').group() == 'a /bb ' +assert re.compile('(?:(1)?)*').match('111').group() == '111' \ No newline at end of file diff --git a/vm/Cargo.toml b/vm/Cargo.toml index 3c72b2de0a..5fd905c8a0 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -72,7 +72,8 @@ memoffset = "0.6.5" optional = "0.5.0" # RustPython crates implementing functionality based on CPython -sre-engine = "0.1.2" +# sre-engine = "0.1.2" +sre-engine = { git = "https://github.com/qingshi163/sre-engine", branch = "refactor" } # to work on sre-engine locally # sre-engine = { path = "../../sre-engine" } From abc75866623808fa3510d65d0ffeecde3cda5799 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Wed, 27 Jul 2022 05:14:34 +0900 Subject: [PATCH 3/3] bump up sre-engine to 0.2.0 --- Cargo.lock | 5 +++-- vm/Cargo.toml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ecca468c98..e0094cecbd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2066,8 +2066,9 @@ dependencies = [ [[package]] name = "sre-engine" -version = "0.1.2" -source = "git+https://github.com/qingshi163/sre-engine?branch=refactor#82675ae5bd541139163c33312f84e421dddcdd1e" +version = 
"0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55e283f0ec6488739d0b972e3c17b70a8698b33c298a169430387f871af51a03" dependencies = [ "bitflags", "num_enum", diff --git a/vm/Cargo.toml b/vm/Cargo.toml index 5fd905c8a0..f1771238b5 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -72,8 +72,7 @@ memoffset = "0.6.5" optional = "0.5.0" # RustPython crates implementing functionality based on CPython -# sre-engine = "0.1.2" -sre-engine = { git = "https://github.com/qingshi163/sre-engine", branch = "refactor" } +sre-engine = "0.2.0" # to work on sre-engine locally # sre-engine = { path = "../../sre-engine" }