Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 8a3ebf8

Browse files
author
Fredrik Lundh
committed
-- SRE 0.9.6 sync. This includes:
+ added "regs" attribute
+ fixed "pos" and "endpos" attributes
+ reset "lastindex" and "lastgroup" in scanner methods
+ removed (?P#id) syntax; the "lastindex" and "lastgroup" attributes are now always set
+ removed string module dependencies in sre_parse
+ better debugging support in sre_parse
+ various tweaks to build under 1.5.2
1 parent 4f1b208 commit 8a3ebf8

7 files changed

Lines changed: 1312 additions & 1185 deletions

File tree

Lib/sre.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,13 @@
1010
# other compatibility work.
1111
#
1212

13+
# FIXME: change all FIXME's to XXX ;-)
14+
1315
import sre_compile
1416
import sre_parse
1517

18+
import string
19+
1620
# flags
1721
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
1822
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
5357
def compile(pattern, flags=0):
5458
return _compile(pattern, flags)
5559

60+
def purge():
61+
_cache.clear()
62+
5663
def template(pattern, flags=0):
5764
return _compile(pattern, flags|T)
5865

@@ -65,18 +72,22 @@ def escape(pattern):
6572
s[i] = "\\000"
6673
else:
6774
s[i] = "\\" + c
68-
return pattern[:0].join(s)
75+
return _join(s, pattern)
6976

7077
# --------------------------------------------------------------------
7178
# internals
7279

7380
_cache = {}
7481
_MAXCACHE = 100
7582

83+
def _join(seq, sep):
84+
# internal: join into string having the same type as sep
85+
return string.join(seq, sep[:0])
86+
7687
def _compile(pattern, flags=0):
7788
# internal: compile pattern
7889
tp = type(pattern)
79-
if tp not in (type(""), type(u"")):
90+
if tp not in sre_compile.STRING_TYPES:
8091
return pattern
8192
key = (tp, pattern, flags)
8293
try:
@@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
89100
_cache[key] = p
90101
return p
91102

92-
def purge():
93-
# clear pattern cache
94-
_cache.clear()
95-
96103
def _sub(pattern, template, string, count=0):
97104
# internal: pattern.sub implementation hook
98105
return _subn(pattern, template, string, count)[0]
@@ -120,7 +127,7 @@ def filter(match, template=template):
120127
i = e
121128
n = n + 1
122129
append(string[i:])
123-
return string[:0].join(s), n
130+
return _join(s, string[:0]), n
124131

125132
def _split(pattern, string, maxsplit=0):
126133
# internal: pattern.split implementation hook
@@ -161,11 +168,19 @@ def _pickle(p):
161168

162169
class Scanner:
163170
def __init__(self, lexicon):
171+
from sre_constants import BRANCH, SUBPATTERN, INDEX
164172
self.lexicon = lexicon
173+
# combine phrases into a compound pattern
165174
p = []
175+
s = sre_parse.Pattern()
166176
for phrase, action in lexicon:
167-
p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
168-
self.scanner = _compile("|".join(p))
177+
p.append(sre_parse.SubPattern(s, [
178+
(SUBPATTERN, (None, sre_parse.parse(phrase))),
179+
(INDEX, len(p))
180+
]))
181+
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
182+
s.groups = len(p)
183+
self.scanner = sre_compile.compile(p)
169184
def scan(self, string):
170185
result = []
171186
append = result.append

Lib/sre_compile.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,10 +197,11 @@ def fixup(literal, flags=flags):
197197
else:
198198
emit(ATCODES[av])
199199
elif op is BRANCH:
200-
emit(OPCODES[op])
201200
tail = []
202201
for av in av[1]:
202+
emit(OPCODES[op])
203203
skip = len(code); emit(0)
204+
emit(MAXCODE) # save mark
204205
_compile(code, av, flags)
205206
emit(OPCODES[JUMP])
206207
tail.append(len(code)); emit(0)
@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
286287
emit(OPCODES[FAILURE])
287288
code[skip] = len(code) - skip
288289

290+
STRING_TYPES = [type("")]
291+
292+
try:
293+
STRING_TYPES.append(type(unicode("")))
294+
except NameError:
295+
pass
296+
289297
def compile(p, flags=0):
290298
# internal: convert pattern list to internal format
291299

292300
# compile, as necessary
293-
if type(p) in (type(""), type(u"")):
301+
if type(p) in STRING_TYPES:
294302
import sre_parse
295303
pattern = p
296304
p = sre_parse.parse(p, flags)
@@ -308,6 +316,8 @@ def compile(p, flags=0):
308316

309317
code.append(OPCODES[SUCCESS])
310318

319+
# print code
320+
311321
# FIXME: <fl> get rid of this limitation!
312322
assert p.pattern.groups <= 100,\
313323
"sorry, but this version only supports 100 named groups"

Lib/sre_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def makedict(list):
172172
# flags
173173
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
174174
SRE_FLAG_IGNORECASE = 2 # case insensitive
175-
SRE_FLAG_LOCALE = 4 # honor system locale
175+
SRE_FLAG_LOCALE = 4 # honour system locale
176176
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
177177
SRE_FLAG_DOTALL = 16 # treat target as a single string
178178
SRE_FLAG_UNICODE = 32 # use unicode locale

Lib/sre_parse.py

Lines changed: 79 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@
2525
SPECIAL_CHARS = ".\\[{()*+?^$|"
2626
REPEAT_CHARS = "*+?{"
2727

28-
DIGITS = tuple(string.digits)
28+
# all ten decimal digits, spelled out literally (no string module needed)
DIGITS = tuple("0123456789")
2929

3030
OCTDIGITS = tuple("01234567")
3131
HEXDIGITS = tuple("0123456789abcdefABCDEF")
3232

33-
WHITESPACE = tuple(string.whitespace)
33+
WHITESPACE = tuple(" \t\n\r\v\f")
3434

3535
ESCAPES = {
3636
r"\a": (LITERAL, 7),
@@ -68,7 +68,8 @@
6868
"u": SRE_FLAG_UNICODE,
6969
}
7070

71-
class State:
71+
class Pattern:
72+
# master pattern object. keeps track of global attributes
7273
def __init__(self):
7374
self.flags = 0
7475
self.groups = 1
@@ -88,6 +89,33 @@ def __init__(self, pattern, data=None):
8889
data = []
8990
self.data = data
9091
self.width = None
92+
def dump(self, level=0):
93+
nl = 1
94+
for op, av in self.data:
95+
print level*" " + op,; nl = 0
96+
if op == "in":
97+
# member sublanguage
98+
print; nl = 1
99+
for op, a in av:
100+
print (level+1)*" " + op, a
101+
elif op == "branch":
102+
print; nl = 1
103+
i = 0
104+
for a in av[1]:
105+
if i > 0:
106+
print level*" " + "or"
107+
a.dump(level+1); nl = 1
108+
i = i + 1
109+
elif type(av) in (type(()), type([])):
110+
for a in av:
111+
if isinstance(a, SubPattern):
112+
if not nl: print
113+
a.dump(level+1); nl = 1
114+
else:
115+
print a, ; nl = 0
116+
else:
117+
print av, ; nl = 0
118+
if not nl: print
91119
def __repr__(self):
92120
return repr(self.data)
93121
def __len__(self):
@@ -255,10 +283,25 @@ def _escape(source, escape, state):
255283
pass
256284
raise error, "bogus escape: %s" % repr(escape)
257285

258-
def _branch(pattern, items):
259-
# form a branch operator from a set of items
286+
def _parse_sub(source, state, nested=1):
287+
# parse an alternation: a|b|c
260288

261-
subpattern = SubPattern(pattern)
289+
items = []
290+
while 1:
291+
items.append(_parse(source, state))
292+
if source.match("|"):
293+
continue
294+
if not nested:
295+
break
296+
if not source.next or source.match(")"):
297+
break
298+
else:
299+
raise error, "pattern not properly closed"
300+
301+
if len(items) == 1:
302+
return items[0]
303+
304+
subpattern = SubPattern(state)
262305

263306
# check if all items share a common prefix
264307
while 1:
@@ -285,7 +328,7 @@ def _branch(pattern, items):
285328
break
286329
else:
287330
# we can store this as a character set instead of a
288-
# branch (FIXME: use a range if possible)
331+
# branch (the compiler may optimize this even more)
289332
set = []
290333
for item in items:
291334
set.append(item[0])
@@ -296,8 +339,7 @@ def _branch(pattern, items):
296339
return subpattern
297340

298341
def _parse(source, state):
299-
300-
# parse regular expression pattern into an operator list.
342+
# parse a simple pattern
301343

302344
subpattern = SubPattern(state)
303345

@@ -451,22 +493,6 @@ def _parse(source, state):
451493
if gid is None:
452494
raise error, "unknown group name"
453495
subpattern.append((GROUPREF, gid))
454-
elif source.match("#"):
455-
index = ""
456-
while 1:
457-
char = source.get()
458-
if char is None:
459-
raise error, "unterminated index"
460-
if char == ")":
461-
break
462-
index = index + char
463-
try:
464-
index = int(index)
465-
if index < 0 or index > MAXREPEAT:
466-
raise ValueError
467-
except ValueError:
468-
raise error, "illegal index"
469-
subpattern.append((INDEX, index))
470496
continue
471497
else:
472498
char = source.get()
@@ -491,48 +517,27 @@ def _parse(source, state):
491517
raise error, "syntax error"
492518
dir = -1 # lookbehind
493519
char = source.get()
494-
b = []
495-
while 1:
496-
p = _parse(source, state)
497-
if source.next == ")":
498-
if b:
499-
b.append(p)
500-
p = _branch(state, b)
501-
if char == "=":
502-
subpattern.append((ASSERT, (dir, p)))
503-
else:
504-
subpattern.append((ASSERT_NOT, (dir, p)))
505-
break
506-
elif source.match("|"):
507-
b.append(p)
508-
else:
509-
raise error, "pattern not properly closed"
520+
p = _parse_sub(source, state)
521+
if char == "=":
522+
subpattern.append((ASSERT, (dir, p)))
523+
else:
524+
subpattern.append((ASSERT_NOT, (dir, p)))
525+
continue
510526
else:
511527
# flags
512528
while FLAGS.has_key(source.next):
513529
state.flags = state.flags | FLAGS[source.get()]
514530
if group:
515531
# parse group contents
516-
b = []
517532
if group == 2:
518533
# anonymous group
519534
group = None
520535
else:
521536
group = state.getgroup(name)
522-
while 1:
523-
p = _parse(source, state)
524-
if group is not None:
525-
p.append((INDEX, group))
526-
if source.match(")"):
527-
if b:
528-
b.append(p)
529-
p = _branch(state, b)
530-
subpattern.append((SUBPATTERN, (group, p)))
531-
break
532-
elif source.match("|"):
533-
b.append(p)
534-
else:
535-
raise error, "group not properly closed"
537+
p = _parse_sub(source, state)
538+
subpattern.append((SUBPATTERN, (group, p)))
539+
if group is not None:
540+
p.append((INDEX, group))
536541
else:
537542
while 1:
538543
char = source.get()
@@ -555,26 +560,24 @@ def _parse(source, state):
555560

556561
return subpattern
557562

558-
def parse(pattern, flags=0):
563+
def parse(str, flags=0):
559564
# parse 're' pattern into list of (opcode, argument) tuples
560-
source = Tokenizer(pattern)
561-
state = State()
562-
state.flags = flags
563-
b = []
564-
while 1:
565-
p = _parse(source, state)
566-
tail = source.get()
567-
if tail == "|":
568-
b.append(p)
569-
elif tail == ")":
570-
raise error, "unbalanced parenthesis"
571-
elif tail is None:
572-
if b:
573-
b.append(p)
574-
p = _branch(state, b)
575-
break
576-
else:
577-
raise error, "bogus characters at end of regular expression"
565+
566+
source = Tokenizer(str)
567+
568+
pattern = Pattern()
569+
pattern.flags = flags
570+
571+
p = _parse_sub(source, pattern, 0)
572+
573+
tail = source.get()
574+
if tail == ")":
575+
raise error, "unbalanced parenthesis"
576+
elif tail:
577+
raise error, "bogus characters at end of regular expression"
578+
579+
# p.dump()
580+
578581
return p
579582

580583
def parse_template(source, pattern):
@@ -656,4 +659,4 @@ def expand_template(template, match):
656659
if s is None:
657660
raise error, "empty group"
658661
a(s)
659-
return sep.join(p)
662+
return string.join(p, sep)

Lib/test/output/test_sre

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
test_sre
22
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
33
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
4+
=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
5+
=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
46
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')

0 commit comments

Comments
 (0)