Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b25e1ad

Browse files
author
Fredrik Lundh
committed
sre 2.1b2 update:
- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn
1 parent 8e9972c commit b25e1ad

8 files changed

Lines changed: 165 additions & 49 deletions

File tree

Lib/sre.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
"U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
2424
"UNICODE", "error" ]
2525

26+
__version__ = "2.1b2"
27+
2628
# this module works under 1.5.2 and later. don't use string methods
2729
import string
2830

@@ -90,6 +92,7 @@ def compile(pattern, flags=0):
9092
def purge():
9193
"Clear the regular expression cache"
9294
_cache.clear()
95+
_cache_repl.clear()
9396

9497
def template(pattern, flags=0):
9598
"Compile a template pattern, returning a pattern object"
@@ -111,6 +114,8 @@ def escape(pattern):
111114
# internals
112115

113116
_cache = {}
117+
_cache_repl = {}
118+
114119
_MAXCACHE = 100
115120

116121
def _join(seq, sep):
@@ -134,6 +139,21 @@ def _compile(*key):
134139
_cache[key] = p
135140
return p
136141

142+
def _compile_repl(*key):
143+
# internal: compile replacement pattern
144+
p = _cache_repl.get(key)
145+
if p is not None:
146+
return p
147+
repl, pattern = key
148+
try:
149+
p = sre_parse.parse_template(repl, pattern)
150+
except error, v:
151+
raise error, v # invalid expression
152+
if len(_cache_repl) >= _MAXCACHE:
153+
_cache_repl.clear()
154+
_cache_repl[key] = p
155+
return p
156+
137157
def _expand(pattern, match, template):
138158
# internal: match.expand implementation hook
139159
template = sre_parse.parse_template(template, pattern)
@@ -148,7 +168,7 @@ def _subn(pattern, template, string, count=0):
148168
if callable(template):
149169
filter = template
150170
else:
151-
template = sre_parse.parse_template(template, pattern)
171+
template = _compile_repl(template, pattern)
152172
def filter(match, template=template):
153173
return sre_parse.expand_template(template, match)
154174
n = i = 0

Lib/sre_compile.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,12 @@ def fixup(literal, flags=flags):
105105
elif op is AT:
106106
emit(OPCODES[op])
107107
if flags & SRE_FLAG_MULTILINE:
108-
emit(ATCODES[AT_MULTILINE.get(av, av)])
109-
else:
110-
emit(ATCODES[av])
108+
av = AT_MULTILINE.get(av, av)
109+
if flags & SRE_FLAG_LOCALE:
110+
av = AT_LOCALE.get(av, av)
111+
elif flags & SRE_FLAG_UNICODE:
112+
av = AT_UNICODE.get(av, av)
113+
emit(ATCODES[av])
111114
elif op is BRANCH:
112115
emit(OPCODES[op])
113116
tail = []
@@ -124,11 +127,10 @@ def fixup(literal, flags=flags):
124127
elif op is CATEGORY:
125128
emit(OPCODES[op])
126129
if flags & SRE_FLAG_LOCALE:
127-
emit(CHCODES[CH_LOCALE[av]])
130+
av = CH_LOCALE[av]
128131
elif flags & SRE_FLAG_UNICODE:
129-
emit(CHCODES[CH_UNICODE[av]])
130-
else:
131-
emit(CHCODES[av])
132+
av = CH_UNICODE[av]
133+
emit(CHCODES[av])
132134
elif op is GROUPREF:
133135
if flags & SRE_FLAG_IGNORECASE:
134136
emit(OPCODES[OP_IGNORE[op]])

Lib/sre_constants.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
# update when constants are added or removed
1313

14-
MAGIC = 20010115
14+
MAGIC = 20010320
1515

1616
# max code word in this release
1717

@@ -67,6 +67,10 @@ class error(Exception):
6767
AT_END = "at_end"
6868
AT_END_LINE = "at_end_line"
6969
AT_END_STRING = "at_end_string"
70+
AT_LOC_BOUNDARY = "at_loc_boundary"
71+
AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
72+
AT_UNI_BOUNDARY = "at_uni_boundary"
73+
AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
7074

7175
# categories
7276
CATEGORY_DIGIT = "category_digit"
@@ -119,7 +123,9 @@ class error(Exception):
119123

120124
ATCODES = [
121125
AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
122-
AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING
126+
AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
127+
AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
128+
AT_UNI_NON_BOUNDARY
123129
]
124130

125131
CHCODES = [
@@ -157,6 +163,16 @@ def makedict(list):
157163
AT_END: AT_END_LINE
158164
}
159165

166+
AT_LOCALE = {
167+
AT_BOUNDARY: AT_LOC_BOUNDARY,
168+
AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
169+
}
170+
171+
AT_UNICODE = {
172+
AT_BOUNDARY: AT_UNI_BOUNDARY,
173+
AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
174+
}
175+
160176
CH_LOCALE = {
161177
CATEGORY_DIGIT: CATEGORY_DIGIT,
162178
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,

Lib/sre_parse.py

Lines changed: 41 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,16 @@ def parse_template(source, pattern):
638638
s = Tokenizer(source)
639639
p = []
640640
a = p.append
641+
def literal(literal, p=p):
642+
if p and p[-1][0] is LITERAL:
643+
p[-1] = LITERAL, p[-1][1] + literal
644+
else:
645+
p.append((LITERAL, literal))
646+
sep = source[:0]
647+
if type(sep) is type(""):
648+
char = chr
649+
else:
650+
char = unichr
641651
while 1:
642652
this = s.get()
643653
if this is None:
@@ -681,33 +691,42 @@ def parse_template(source, pattern):
681691
break
682692
if not code:
683693
this = this[1:]
684-
code = LITERAL, atoi(this[-6:], 8) & 0xff
685-
a(code)
694+
code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
695+
if code[0] is LITERAL:
696+
literal(code[1])
697+
else:
698+
a(code)
686699
else:
687700
try:
688-
a(ESCAPES[this])
701+
this = char(ESCAPES[this][1])
689702
except KeyError:
690-
for c in this:
691-
a((LITERAL, ord(c)))
703+
pass
704+
literal(this)
692705
else:
693-
a((LITERAL, ord(this)))
694-
return p
706+
literal(this)
707+
# convert template to groups and literals lists
708+
i = 0
709+
groups = []
710+
literals = []
711+
for c, s in p:
712+
if c is MARK:
713+
groups.append((i, s))
714+
literals.append(None)
715+
else:
716+
literals.append(s)
717+
i = i + 1
718+
return groups, literals
695719

696720
def expand_template(template, match):
697-
# XXX: <fl> this is sooooo slow. drop in the slicelist code instead
698-
p = []
699-
a = p.append
721+
g = match.group
700722
sep = match.string[:0]
701-
if type(sep) is type(""):
702-
char = chr
703-
else:
704-
char = unichr
705-
for c, s in template:
706-
if c is LITERAL:
707-
a(char(s))
708-
elif c is MARK:
709-
s = match.group(s)
723+
groups, literals = template
724+
literals = literals[:]
725+
try:
726+
for index, group in groups:
727+
literals[index] = s = g(group)
710728
if s is None:
711-
raise error, "empty group"
712-
a(s)
713-
return string.join(p, sep)
729+
raise IndexError
730+
except IndexError:
731+
raise error, "empty group"
732+
return string.join(literals, sep)

Lib/test/re_tests.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,3 +639,14 @@
639639
# bug 130748: ^* should be an error (nothing to repeat)
640640
(r'^*', '', SYNTAX_ERROR),
641641
]
642+
643+
try:
644+
u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'")
645+
except SyntaxError:
646+
pass
647+
else:
648+
tests.extend([
649+
# bug 410271: \b broken under locales
650+
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
651+
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
652+
])

Lib/test/test_sre.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ def bump_num(matchobj):
329329
u = unicode(s, "latin-1")
330330
except NameError:
331331
pass
332+
except TypeError:
333+
continue # skip unicode test strings
332334
else:
333335
result=obj.search(u)
334336
if result==None:

Modules/_sre.c

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
* 2000-10-24 fl really fixed assert_not; reset groups in findall
2525
* 2000-12-21 fl fixed memory leak in groupdict
2626
* 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL
27-
* 2001-01-15 fl avoid recursion for MIN_UTIL; fixed uppercase literal bug
27+
* 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug
2828
* 2001-01-16 fl fixed memory leak in pattern destructor
29+
* 2001-03-20 fl lots of fixes for 2.1b2
2930
*
3031
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
3132
*
@@ -40,7 +41,7 @@
4041

4142
#ifndef SRE_RECURSIVE
4243

43-
char copyright[] = " SRE 2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
44+
char copyright[] = " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
4445

4546
#include "Python.h"
4647

@@ -141,11 +142,6 @@ static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
141142
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
142143
120, 121, 122, 123, 124, 125, 126, 127 };
143144

144-
static unsigned int sre_lower(unsigned int ch)
145-
{
146-
return ((ch) < 128 ? sre_char_lower[ch] : ch);
147-
}
148-
149145
#define SRE_IS_DIGIT(ch)\
150146
((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
151147
#define SRE_IS_SPACE(ch)\
@@ -157,30 +153,39 @@ static unsigned int sre_lower(unsigned int ch)
157153
#define SRE_IS_WORD(ch)\
158154
((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
159155

160-
/* locale-specific character predicates */
161-
162-
static unsigned int sre_lower_locale(unsigned int ch)
156+
static unsigned int sre_lower(unsigned int ch)
163157
{
164-
return ((ch) < 256 ? tolower((ch)) : ch);
158+
return ((ch) < 128 ? sre_char_lower[ch] : ch);
165159
}
160+
161+
/* locale-specific character predicates */
162+
166163
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
167164
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
168165
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
169166
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
170167
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
171168

169+
static unsigned int sre_lower_locale(unsigned int ch)
170+
{
171+
return ((ch) < 256 ? tolower((ch)) : ch);
172+
}
173+
172174
/* unicode-specific character predicates */
173175

174176
#if defined(HAVE_UNICODE)
175-
static unsigned int sre_lower_unicode(unsigned int ch)
176-
{
177-
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
178-
}
177+
179178
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
180179
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
181180
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
182181
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
183182
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
183+
184+
static unsigned int sre_lower_unicode(unsigned int ch)
185+
{
186+
return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
187+
}
188+
184189
#endif
185190

186191
LOCAL(int)
@@ -418,6 +423,42 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
418423
this = ((void*) ptr < state->end) ?
419424
SRE_IS_WORD((int) ptr[0]) : 0;
420425
return this == that;
426+
427+
case SRE_AT_LOC_BOUNDARY:
428+
if (state->beginning == state->end)
429+
return 0;
430+
that = ((void*) ptr > state->beginning) ?
431+
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
432+
this = ((void*) ptr < state->end) ?
433+
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
434+
return this != that;
435+
436+
case SRE_AT_LOC_NON_BOUNDARY:
437+
if (state->beginning == state->end)
438+
return 0;
439+
that = ((void*) ptr > state->beginning) ?
440+
SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
441+
this = ((void*) ptr < state->end) ?
442+
SRE_LOC_IS_WORD((int) ptr[0]) : 0;
443+
return this == that;
444+
445+
case SRE_AT_UNI_BOUNDARY:
446+
if (state->beginning == state->end)
447+
return 0;
448+
that = ((void*) ptr > state->beginning) ?
449+
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
450+
this = ((void*) ptr < state->end) ?
451+
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
452+
return this != that;
453+
454+
case SRE_AT_UNI_NON_BOUNDARY:
455+
if (state->beginning == state->end)
456+
return 0;
457+
that = ((void*) ptr > state->beginning) ?
458+
SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
459+
this = ((void*) ptr < state->end) ?
460+
SRE_UNI_IS_WORD((int) ptr[0]) : 0;
461+
return this == that;
421462
}
422463

423464
return 0;
@@ -1037,7 +1078,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
10371078

10381079
/* see if the tail matches */
10391080
state->repeat = rp->prev;
1040-
if (rp->pattern[2] == 65535) {
1081+
/* FIXME: the following fix doesn't always work (#133283) */
1082+
if (0 && rp->pattern[2] == 65535) {
10411083
/* unbounded repeat */
10421084
for (;;) {
10431085
i = SRE_MATCH(state, pattern, level + 1);

Modules/sre_constants.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* See the _sre.c file for information on usage and redistribution.
1212
*/
1313

14-
#define SRE_MAGIC 20010115
14+
#define SRE_MAGIC 20010320
1515
#define SRE_OP_FAILURE 0
1616
#define SRE_OP_SUCCESS 1
1717
#define SRE_OP_ANY 2
@@ -49,6 +49,10 @@
4949
#define SRE_AT_END 5
5050
#define SRE_AT_END_LINE 6
5151
#define SRE_AT_END_STRING 7
52+
#define SRE_AT_LOC_BOUNDARY 8
53+
#define SRE_AT_LOC_NON_BOUNDARY 9
54+
#define SRE_AT_UNI_BOUNDARY 10
55+
#define SRE_AT_UNI_NON_BOUNDARY 11
5256
#define SRE_CATEGORY_DIGIT 0
5357
#define SRE_CATEGORY_NOT_DIGIT 1
5458
#define SRE_CATEGORY_SPACE 2

0 commit comments

Comments
 (0)