Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0c4fdba

Browse files
author
Fredrik Lundh
committed
closes bug #112468 (and all the other bugs that surfaced when
I fixed a bug in the regression test harness...)
1 parent d3b1f11 commit 0c4fdba

2 files changed

Lines changed: 85 additions & 43 deletions

File tree

Lib/sre_parse.py

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414

1515
MAXREPEAT = 65535
1616

17-
# FIXME: the following might change in 2.0 final. but for now, this
18-
# seems to be the best way to be compatible with 1.5.2
19-
CHARMASK = 0xff
20-
2117
SPECIAL_CHARS = ".\\[{()*+?^$|"
2218
REPEAT_CHARS = "*+?{"
2319

@@ -181,9 +177,10 @@ def __next(self):
181177
char = char + c
182178
self.index = self.index + len(char)
183179
self.next = char
184-
def match(self, char):
180+
def match(self, char, skip=1):
185181
if char == self.next:
186-
self.__next()
182+
if skip:
183+
self.__next()
187184
return 1
188185
return 0
189186
def get(self):
@@ -230,16 +227,19 @@ def _class_escape(source, escape):
230227
return code
231228
try:
232229
if escape[1:2] == "x":
233-
# FIXME: in 2.0, \xNN must have exactly two digits
234-
while source.next in HEXDIGITS:
230+
# hexadecimal escape (exactly two digits)
231+
while source.next in HEXDIGITS and len(escape) < 4:
235232
escape = escape + source.get()
236233
escape = escape[2:]
237-
return LITERAL, int(escape[-4:], 16) & CHARMASK
234+
if len(escape) != 2:
235+
raise error, "bogus escape: %s" % repr("\\" + escape)
236+
return LITERAL, int(escape, 16) & 0xff
238237
elif str(escape[1:2]) in OCTDIGITS:
239-
while source.next in OCTDIGITS:
238+
# octal escape (up to three digits)
239+
while source.next in OCTDIGITS and len(escape) < 5:
240240
escape = escape + source.get()
241241
escape = escape[1:]
242-
return LITERAL, int(escape[-6:], 8) & CHARMASK
242+
return LITERAL, int(escape, 8) & 0xff
243243
if len(escape) == 2:
244244
return LITERAL, ord(escape[1])
245245
except ValueError:
@@ -256,24 +256,32 @@ def _escape(source, escape, state):
256256
return code
257257
try:
258258
if escape[1:2] == "x":
259-
while source.next in HEXDIGITS:
259+
# hexadecimal escape
260+
while source.next in HEXDIGITS and len(escape) < 4:
260261
escape = escape + source.get()
261262
escape = escape[2:]
262-
return LITERAL, int(escape[-4:], 16) & CHARMASK
263+
if len(escape) != 2:
264+
raise error, "bogus escape: %s" % repr("\\" + escape)
265+
return LITERAL, int(escape, 16) & 0xff
266+
elif escape[1:2] == "0":
267+
# octal escape
268+
while source.next in OCTDIGITS and len(escape) < 5:
269+
escape = escape + source.get()
270+
return LITERAL, int(escape[1:], 8) & 0xff
263271
elif escape[1:2] in DIGITS:
264-
while 1:
265-
group = _group(escape, state.groups)
266-
if group:
267-
if (not source.next or
268-
not _group(escape + source.next, state.groups)):
269-
return GROUPREF, group
270-
escape = escape + source.get()
271-
elif source.next in OCTDIGITS:
272+
# octal escape *or* decimal group reference (sigh)
273+
here = source.tell()
274+
if source.next in DIGITS:
275+
escape = escape + source.get()
276+
if escape[2] in OCTDIGITS and source.next in OCTDIGITS:
277+
# got three octal digits; this is an octal escape
272278
escape = escape + source.get()
273-
else:
274-
break
275-
escape = escape[1:]
276-
return LITERAL, int(escape[-6:], 8) & CHARMASK
279+
return LITERAL, int(escape[1:], 8) & 0xff
280+
# got at least one decimal digit; this is a group reference
281+
group = _group(escape, state.groups)
282+
if group:
283+
return GROUPREF, group
284+
raise error, "bogus escape: %s" % repr(escape)
277285
if len(escape) == 2:
278286
return LITERAL, ord(escape[1])
279287
except ValueError:
@@ -290,7 +298,7 @@ def _parse_sub(source, state, nested=1):
290298
continue
291299
if not nested:
292300
break
293-
if not source.next or source.match(")"):
301+
if not source.next or source.match(")", 0):
294302
break
295303
else:
296304
raise error, "pattern not properly closed"
@@ -395,7 +403,11 @@ def _parse(source, state):
395403
code2 = LITERAL, ord(this)
396404
if code1[0] != LITERAL or code2[0] != LITERAL:
397405
raise error, "illegal range"
398-
set.append((RANGE, (code1[1], code2[1])))
406+
lo = code1[1]
407+
hi = code2[1]
408+
if hi < lo:
409+
raise error, "illegal range"
410+
set.append((RANGE, (lo, hi)))
399411
else:
400412
if code1[0] is IN:
401413
code1 = code1[1][0]
@@ -505,6 +517,9 @@ def _parse(source, state):
505517
if source.next is None or source.next == ")":
506518
break
507519
source.get()
520+
if not source.match(")"):
521+
raise error, "unbalanced parenthesis"
522+
continue
508523
elif source.next in ("=", "!", "<"):
509524
# lookahead assertions
510525
char = source.get()
@@ -515,6 +530,8 @@ def _parse(source, state):
515530
dir = -1 # lookbehind
516531
char = source.get()
517532
p = _parse_sub(source, state)
533+
if not source.match(")"):
534+
raise error, "unbalanced parenthesis"
518535
if char == "=":
519536
subpattern.append((ASSERT, (dir, p)))
520537
else:
@@ -532,6 +549,8 @@ def _parse(source, state):
532549
else:
533550
group = state.getgroup(name)
534551
p = _parse_sub(source, state)
552+
if not source.match(")"):
553+
raise error, "unbalanced parenthesis"
535554
subpattern.append((SUBPATTERN, (group, p)))
536555
else:
537556
while 1:
@@ -625,7 +644,7 @@ def parse_template(source, pattern):
625644
break
626645
if not code:
627646
this = this[1:]
628-
code = LITERAL, int(this[-6:], 8) & CHARMASK
647+
code = LITERAL, int(this[-6:], 8) & 0xff
629648
a(code)
630649
else:
631650
try:

Lib/test/re_tests.py

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,23 @@
1616
# matching performs on large strings.
1717

1818
benchmarks = [
19+
20+
# test common prefix
21+
('Python|Perl', 'Perl'), # Alternation
22+
('(Python|Perl)', 'Perl'), # Grouped alternation
23+
24+
('Python|Perl|Tcl', 'Perl'), # Alternation
25+
('(Python|Perl|Tcl)', 'Perl'), # Grouped alternation
26+
27+
('(Python)\\1', 'PythonPython'), # Backreference
28+
('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # Disable the fastmap optimization
29+
('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # A few sets
30+
1931
('Python', 'Python'), # Simple text literal
2032
('.*Python', 'Python'), # Bad text literal
2133
('.*Python.*', 'Python'), # Worse text literal
2234
('.*(Python)', 'Python'), # Bad text literal with grouping
2335

24-
('(Python|Perl|Tcl', 'Perl'), # Alternation
25-
('(Python|Perl|Tcl)', 'Perl'), # Grouped alternation
26-
('(Python)\\1', 'PythonPython'), # Backreference
27-
('([0a-z][a-z]*,)+', 'a5,b7,c9,'), # Disable the fastmap optimization
28-
('([a-z][a-z0-9]*,)+', 'a5,b7,c9,') # A few sets
2936
]
3037

3138
# Test suite (for verifying correctness)
@@ -79,12 +86,17 @@
7986
# Test various letter escapes
8087
(r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
8188
(r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
82-
(r'\u', '', SYNTAX_ERROR), # A Perl escape
89+
# NOTE: not an error under PCRE/PRE:
90+
# (r'\u', '', SYNTAX_ERROR), # A Perl escape
8391
(r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
8492
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
85-
(r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)),
86-
(r'\x00f', '\017', SUCCEED, 'found', chr(15)),
87-
(r'\x00fe', '\376', SUCCEED, 'found', chr(254)),
93+
# new \x semantics
94+
(r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)),
95+
(r'\x00f', '\017', FAIL, 'found', chr(15)),
96+
(r'\x00fe', '\376', FAIL, 'found', chr(254)),
97+
# (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)),
98+
# (r'\x00f', '\017', SUCCEED, 'found', chr(15)),
99+
# (r'\x00fe', '\376', SUCCEED, 'found', chr(254)),
88100

89101
(r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c",
90102
SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"),
@@ -138,7 +150,8 @@
138150
('a[b-d]', 'aac', SUCCEED, 'found', 'ac'),
139151
('a[-b]', 'a-', SUCCEED, 'found', 'a-'),
140152
('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'),
141-
('a[b-]', 'a-', SYNTAX_ERROR),
153+
# NOTE: not an error under PCRE/PRE:
154+
# ('a[b-]', 'a-', SYNTAX_ERROR),
142155
('a[]b', '-', SYNTAX_ERROR),
143156
('a[', '-', SYNTAX_ERROR),
144157
('a\\', '-', SYNTAX_ERROR),
@@ -543,7 +556,9 @@
543556

544557
# Check odd placement of embedded pattern modifiers
545558

546-
('w(?i)', 'W', SYNTAX_ERROR),
559+
# not an error under PCRE/PRE:
560+
('w(?i)', 'W', SUCCEED, 'found', 'W'),
561+
# ('w(?i)', 'W', SYNTAX_ERROR),
547562

548563
# Comments using the x embedded pattern modifier
549564

@@ -577,20 +592,28 @@
577592
('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'),
578593
('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'),
579594
('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'),
580-
('[\\d-x]', '-', SYNTAX_ERROR),
595+
# not an error under PCRE/PRE:
596+
# ('[\\d-x]', '-', SYNTAX_ERROR),
581597
(r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
582598
(r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '),
583599

584600
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
585-
(r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
601+
# new \x semantics
602+
(r'\x00ff', '\377', FAIL, 'found', chr(255)),
603+
# (r'\x00ff', '\377', SUCCEED, 'found', chr(255)),
586604
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
587605
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
588606
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
589607
(r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),
590608

591-
# additional regression tests (1.6 and later)
609+
#
610+
# post-1.5.2 additions
592611

593612
# xmllib problem
594613
(r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'),
595-
614+
# bug 111869 (PRE/PCRE fails on this one, SRE doesn't)
615+
(r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'),
616+
# bug 112468
617+
('(', '', SYNTAX_ERROR),
618+
('[\\41]', '!', SUCCEED, 'found', '!'),
596619
]

0 commit comments

Comments
 (0)