# Character classes used by the parser.
# NOTE(review): the blanks inside the SPECIAL_CHARS and WHITESPACE literals
# are reproduced exactly as they appear in this view; they may be extraction
# artifacts -- confirm against the real file before relying on them.
SPECIAL_CHARS = ".\\ [{()*+?^$|"
REPEAT_CHARS = "*+?{"

# All ten decimal digits, spelled out literally instead of being taken from
# string.digits.  The literal must contain every digit from 0 through 9
# (a nine-character literal that skips "7" silently breaks digit checks).
DIGITS = tuple("0123456789")

OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")

# Spelled out literally instead of being taken from string.whitespace.
WHITESPACE = tuple(" \t \n \r \v \f ")
3434
3535ESCAPES = {
3636 r"\a" : (LITERAL , 7 ),
6868 "u" : SRE_FLAG_UNICODE ,
6969}
7070
71- class State :
71+ class Pattern :
72+ # master pattern object. keeps track of global attributes
7273 def __init__ (self ):
7374 self .flags = 0
7475 self .groups = 1
@@ -88,6 +89,33 @@ def __init__(self, pattern, data=None):
8889 data = []
8990 self .data = data
9091 self .width = None
def dump(self, level=0):
    # debugging aid: print the (op, av) entries in self.data as an
    # indented listing on stdout.  'level' scales the leading indent
    # and is increased by one for every nested dump() call.
    #
    # 'nl' records whether the output is currently at the start of a
    # line (1) or in the middle of one after a trailing-comma print (0),
    # so that a pending line is terminated before nested output starts.
    nl = 1
    for op, av in self.data:
        print level * " " + op,; nl = 0
        if op == "in":
            # member sublanguage
            # av is iterated as (op, a) pairs, printed one per line at
            # level+1.  note that this inner loop rebinds 'op'.
            print; nl = 1
            for op, a in av:
                print (level + 1) * " " + op, a
        elif op == "branch":
            # the alternatives are taken from av[1]; each one is dumped
            # recursively, with an "or" line between consecutive ones
            print; nl = 1
            i = 0
            for a in av[1]:
                if i > 0:
                    print level * " " + "or"
                a.dump(level + 1); nl = 1
                i = i + 1
        elif type(av) in (type(()), type([])):
            # tuple/list argument: nested SubPattern items are dumped
            # recursively on their own lines, anything else is printed
            # inline on the current line
            for a in av:
                if isinstance(a, SubPattern):
                    if not nl: print
                    a.dump(level + 1); nl = 1
                else:
                    print a, ; nl = 0
        else:
            # any other argument is printed inline after the opcode
            print av, ; nl = 0
        # terminate the current line if something is still pending
        if not nl: print
def __repr__(self):
    # the representation of the object is the repr of self.data
    return repr(self.data)
93121 def __len__ (self ):
@@ -255,10 +283,25 @@ def _escape(source, escape, state):
255283 pass
256284 raise error , "bogus escape: %s" % repr (escape )
257285
258- def _branch ( pattern , items ):
259- # form a branch operator from a set of items
286+ def _parse_sub ( source , state , nested = 1 ):
287+ # parse an alternation: a|b|c
260288
261- subpattern = SubPattern (pattern )
289+ items = []
290+ while 1 :
291+ items .append (_parse (source , state ))
292+ if source .match ("|" ):
293+ continue
294+ if not nested :
295+ break
296+ if not source .next or source .match (")" ):
297+ break
298+ else :
299+ raise error , "pattern not properly closed"
300+
301+ if len (items ) == 1 :
302+ return items [0 ]
303+
304+ subpattern = SubPattern (state )
262305
263306 # check if all items share a common prefix
264307 while 1 :
@@ -285,7 +328,7 @@ def _branch(pattern, items):
285328 break
286329 else :
287330 # we can store this as a character set instead of a
288- # branch (FIXME: use a range if possible )
331+ # branch (the compiler may optimize this even more )
289332 set = []
290333 for item in items :
291334 set .append (item [0 ])
@@ -296,8 +339,7 @@ def _branch(pattern, items):
296339 return subpattern
297340
298341def _parse (source , state ):
299-
300- # parse regular expression pattern into an operator list.
342+ # parse a simple pattern
301343
302344 subpattern = SubPattern (state )
303345
@@ -451,22 +493,6 @@ def _parse(source, state):
451493 if gid is None :
452494 raise error , "unknown group name"
453495 subpattern .append ((GROUPREF , gid ))
454- elif source .match ("#" ):
455- index = ""
456- while 1 :
457- char = source .get ()
458- if char is None :
459- raise error , "unterminated index"
460- if char == ")" :
461- break
462- index = index + char
463- try :
464- index = int (index )
465- if index < 0 or index > MAXREPEAT :
466- raise ValueError
467- except ValueError :
468- raise error , "illegal index"
469- subpattern .append ((INDEX , index ))
470496 continue
471497 else :
472498 char = source .get ()
@@ -491,48 +517,27 @@ def _parse(source, state):
491517 raise error , "syntax error"
492518 dir = - 1 # lookbehind
493519 char = source .get ()
494- b = []
495- while 1 :
496- p = _parse (source , state )
497- if source .next == ")" :
498- if b :
499- b .append (p )
500- p = _branch (state , b )
501- if char == "=" :
502- subpattern .append ((ASSERT , (dir , p )))
503- else :
504- subpattern .append ((ASSERT_NOT , (dir , p )))
505- break
506- elif source .match ("|" ):
507- b .append (p )
508- else :
509- raise error , "pattern not properly closed"
520+ p = _parse_sub (source , state )
521+ if char == "=" :
522+ subpattern .append ((ASSERT , (dir , p )))
523+ else :
524+ subpattern .append ((ASSERT_NOT , (dir , p )))
525+ continue
510526 else :
511527 # flags
512528 while FLAGS .has_key (source .next ):
513529 state .flags = state .flags | FLAGS [source .get ()]
514530 if group :
515531 # parse group contents
516- b = []
517532 if group == 2 :
518533 # anonymous group
519534 group = None
520535 else :
521536 group = state .getgroup (name )
522- while 1 :
523- p = _parse (source , state )
524- if group is not None :
525- p .append ((INDEX , group ))
526- if source .match (")" ):
527- if b :
528- b .append (p )
529- p = _branch (state , b )
530- subpattern .append ((SUBPATTERN , (group , p )))
531- break
532- elif source .match ("|" ):
533- b .append (p )
534- else :
535- raise error , "group not properly closed"
537+ p = _parse_sub (source , state )
538+ subpattern .append ((SUBPATTERN , (group , p )))
539+ if group is not None :
540+ p .append ((INDEX , group ))
536541 else :
537542 while 1 :
538543 char = source .get ()
@@ -555,26 +560,24 @@ def _parse(source, state):
555560
556561 return subpattern
557562
558- def parse (pattern , flags = 0 ):
563+ def parse (str , flags = 0 ):
559564 # parse 're' pattern into list of (opcode, argument) tuples
560- source = Tokenizer (pattern )
561- state = State ()
562- state .flags = flags
563- b = []
564- while 1 :
565- p = _parse (source , state )
566- tail = source .get ()
567- if tail == "|" :
568- b .append (p )
569- elif tail == ")" :
570- raise error , "unbalanced parenthesis"
571- elif tail is None :
572- if b :
573- b .append (p )
574- p = _branch (state , b )
575- break
576- else :
577- raise error , "bogus characters at end of regular expression"
565+
566+ source = Tokenizer (str )
567+
568+ pattern = Pattern ()
569+ pattern .flags = flags
570+
571+ p = _parse_sub (source , pattern , 0 )
572+
573+ tail = source .get ()
574+ if tail == ")" :
575+ raise error , "unbalanced parenthesis"
576+ elif tail :
577+ raise error , "bogus characters at end of regular expression"
578+
579+ # p.dump()
580+
578581 return p
579582
580583def parse_template (source , pattern ):
@@ -656,4 +659,4 @@ def expand_template(template, match):
656659 if s is None :
657660 raise error , "empty group"
658661 a (s )
659- return sep .join (p )
662+ return string .join (p , sep )
0 commit comments