Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ad3fc44

Browse files
committed
Implemented non-recursive SRE matching.
1 parent 41e2809 commit ad3fc44

9 files changed

Lines changed: 842 additions & 472 deletions

File tree

Doc/lib/libre.tex

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,15 @@ \subsection{Regular Expression Syntax \label{re-syntax}}
297297
fixed length. Patterns which start with negative lookbehind
298298
assertions may match at the beginning of the string being searched.
299299

300+
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
301+
with \regexp{yes-pattern} if the group with given \var{id} or \var{name}
302+
exists, and with \regexp{no-pattern} if it doesn't. \regexp{|no-pattern}
303+
is optional and can be omitted. For example,
304+
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
305+
pattern, which will match with \code{'<user@host.com>'} as well as
306+
\code{'user@host.com'}, but not with \code{'<user@host.com'}.
307+
\versionadded{2.3}
308+
300309
\end{list}
301310

302311
The special sequences consist of \character{\e} and a character from the

Lib/sre_compile.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,19 @@ def fixup(literal, flags=flags):
145145
else:
146146
emit(OPCODES[op])
147147
emit(av-1)
148+
elif op is GROUPREF_EXISTS:
149+
emit(OPCODES[op])
150+
emit((av[0]-1)*2)
151+
skipyes = len(code); emit(0)
152+
_compile(code, av[1], flags)
153+
if av[2]:
154+
emit(OPCODES[JUMP])
155+
skipno = len(code); emit(0)
156+
code[skipyes] = len(code) - skipyes + 1
157+
_compile(code, av[2], flags)
158+
code[skipno] = len(code) - skipno
159+
else:
160+
code[skipyes] = len(code) - skipyes + 1
148161
else:
149162
raise ValueError, ("unsupported operand type", op)
150163

Lib/sre_constants.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20030419
16+
MAGIC = 20031017
1717

1818
# max code word in this release
1919

@@ -42,6 +42,7 @@ class error(Exception):
4242
CHARSET = "charset"
4343
GROUPREF = "groupref"
4444
GROUPREF_IGNORE = "groupref_ignore"
45+
GROUPREF_EXISTS = "groupref_exists"
4546
IN = "in"
4647
IN_IGNORE = "in_ignore"
4748
INFO = "info"
@@ -108,7 +109,7 @@ class error(Exception):
108109
CALL,
109110
CATEGORY,
110111
CHARSET, BIGCHARSET,
111-
GROUPREF, GROUPREF_IGNORE,
112+
GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
112113
IN, IN_IGNORE,
113114
INFO,
114115
JUMP,

Lib/sre_parse.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
364364
subpattern.append((BRANCH, (None, items)))
365365
return subpattern
366366

367+
def _parse_sub_cond(source, state, condgroup):
368+
item_yes = _parse(source, state)
369+
if source.match("|"):
370+
item_no = _parse(source, state)
371+
if source.match("|"):
372+
raise error, "conditional backref with more than two branches"
373+
else:
374+
item_no = None
375+
if source.next and not source.match(")", 0):
376+
raise error, "pattern not properly closed"
377+
subpattern = SubPattern(state)
378+
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
379+
return subpattern
380+
367381
def _parse(source, state):
368382
# parse a simple pattern
369383

@@ -499,6 +513,7 @@ def _parse(source, state):
499513
elif this == "(":
500514
group = 1
501515
name = None
516+
condgroup = None
502517
if source.match("?"):
503518
group = 0
504519
# options
@@ -568,6 +583,26 @@ def _parse(source, state):
568583
else:
569584
subpattern.append((ASSERT_NOT, (dir, p)))
570585
continue
586+
elif source.match("("):
587+
# conditional backreference group
588+
condname = ""
589+
while 1:
590+
char = source.get()
591+
if char is None:
592+
raise error, "unterminated name"
593+
if char == ")":
594+
break
595+
condname = condname + char
596+
group = 2
597+
if isname(condname):
598+
condgroup = state.groupdict.get(condname)
599+
if condgroup is None:
600+
raise error, "unknown group name"
601+
else:
602+
try:
603+
condgroup = atoi(condname)
604+
except ValueError:
605+
raise error, "bad character in group name"
571606
else:
572607
# flags
573608
if not source.next in FLAGS:
@@ -581,7 +616,10 @@ def _parse(source, state):
581616
group = None
582617
else:
583618
group = state.opengroup(name)
584-
p = _parse_sub(source, state)
619+
if condgroup:
620+
p = _parse_sub_cond(source, state, condgroup)
621+
else:
622+
p = _parse_sub(source, state)
585623
if not source.match(")"):
586624
raise error, "unbalanced parenthesis"
587625
if group is not None:

Lib/test/test_re.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,6 @@ def test_re_match(self):
169169
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
170170

171171
def test_re_groupref_exists(self):
172-
return # not yet
173172
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
174173
('(', 'a'))
175174
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
@@ -405,19 +404,20 @@ def test_bug_418626(self):
405404
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
406405
20003)
407406
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
408-
# non-simple '*?' still recurses and hits the recursion limit
409-
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd')
407+
# non-simple '*?' still used to hit the recursion limit, before the
408+
# non-recursive scheme was implemented.
409+
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
410410

411411
def test_bug_612074(self):
412412
pat=u"["+re.escape(u"\u2039")+u"]"
413413
self.assertEqual(re.compile(pat) and 1, 1)
414414

415415
def test_stack_overflow(self):
416-
# nasty case that overflows the straightforward recursive
416+
# nasty cases that used to overflow the straightforward recursive
417417
# implementation of repeated groups.
418-
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x')
419-
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y')
420-
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y')
418+
self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
419+
self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
420+
self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
421421

422422
def test_scanner(self):
423423
def s_ident(scanner, token): return token

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ Extension modules
6161

6262
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
6363

64+
- Implemented non-recursive SRE matching scheme (#757624).
65+
66+
- Implemented (?(id/name)yes|no) support in SRE (#572936).
67+
6468
Library
6569
-------
6670

0 commit comments

Comments
 (0)