Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3562f11

Browse files
author
Fredrik Lundh
committed
-- use charset bitmaps where appropriate. this gives a 5-10%
   speedup for some tests, including the python tokenizer.
-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).
-- removed workaround for array module bug.
1 parent c13222c commit 3562f11

5 files changed

Lines changed: 182 additions & 63 deletions

File tree

Lib/sre_compile.py

Lines changed: 93 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,71 @@
1616
from sre_constants import *
1717

1818
# find an array type code that matches the engine's code size
19-
for WORDSIZE in "BHil":
19+
for WORDSIZE in "Hil":
2020
if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
2121
break
2222
else:
2323
raise RuntimeError, "cannot find a useable array type"
2424

25+
MAXCODE = 65535
26+
27+
def _charset(charset, fixup):
28+
# internal: optimize character set
29+
out = []
30+
charmap = [0]*256
31+
try:
32+
for op, av in charset:
33+
if op is NEGATE:
34+
out.append((op, av))
35+
elif op is LITERAL:
36+
charmap[fixup(av)] = 1
37+
elif op is RANGE:
38+
for i in range(fixup(av[0]), fixup(av[1])+1):
39+
charmap[i] = 1
40+
elif op is CATEGORY:
41+
# FIXME: could append to charmap tail
42+
return charset # cannot compress
43+
except IndexError:
44+
# unicode
45+
return charset
46+
# compress character map
47+
i = p = n = 0
48+
runs = []
49+
for c in charmap:
50+
if c:
51+
if n == 0:
52+
p = i
53+
n = n + 1
54+
elif n:
55+
runs.append((p, n))
56+
n = 0
57+
i = i + 1
58+
if n:
59+
runs.append((p, n))
60+
if len(runs) <= 2:
61+
# use literal/range
62+
for p, n in runs:
63+
if n == 1:
64+
out.append((LITERAL, p))
65+
else:
66+
out.append((RANGE, (p, p+n-1)))
67+
if len(out) < len(charset):
68+
return out
69+
else:
70+
# use bitmap
71+
data = []
72+
m = 1; v = 0
73+
for c in charmap:
74+
if c:
75+
v = v + m
76+
m = m << 1
77+
if m > MAXCODE:
78+
data.append(v)
79+
m = 1; v = 0
80+
out.append((CHARSET, data))
81+
return out
82+
return charset
83+
2584
def _compile(code, pattern, flags):
2685
# internal: compile a (sub)pattern
2786
emit = code.append
@@ -41,7 +100,7 @@ def fixup(literal, flags=flags):
41100
emit(OPCODES[op])
42101
fixup = lambda x: x
43102
skip = len(code); emit(0)
44-
for op, av in av:
103+
for op, av in _charset(av, fixup):
45104
emit(OPCODES[op])
46105
if op is NEGATE:
47106
pass
@@ -50,6 +109,8 @@ def fixup(literal, flags=flags):
50109
elif op is RANGE:
51110
emit(fixup(av[0]))
52111
emit(fixup(av[1]))
112+
elif op is CHARSET:
113+
code.extend(av)
53114
elif op is CATEGORY:
54115
if flags & SRE_FLAG_LOCALE:
55116
emit(CHCODES[CH_LOCALE[av]])
@@ -155,13 +216,14 @@ def fixup(literal, flags=flags):
155216

156217
def _compile_info(code, pattern, flags):
157218
# internal: compile an info block. in the current version,
158-
# this contains min/max pattern width and a literal prefix,
159-
# if any
219+
# this contains min/max pattern width, and an optional literal
220+
# prefix or a character map
160221
lo, hi = pattern.getwidth()
161222
if lo == 0:
162223
return # not worth it
163224
# look for a literal prefix
164225
prefix = []
226+
charset = [] # not used
165227
if not (flags & SRE_FLAG_IGNORECASE):
166228
for op, av in pattern.data:
167229
if op is LITERAL:
@@ -174,26 +236,40 @@ def _compile_info(code, pattern, flags):
174236
skip = len(code); emit(0)
175237
# literal flag
176238
mask = 0
177-
if len(prefix) == len(pattern.data):
178-
mask = 1
239+
if prefix:
240+
mask = SRE_INFO_PREFIX
241+
if len(prefix) == len(pattern.data):
242+
mask = mask + SRE_INFO_LITERAL
243+
elif charset:
244+
mask = mask + SRE_INFO_CHARSET
179245
emit(mask)
180246
# pattern length
181-
emit(lo)
182-
if hi < 32768:
247+
if lo < MAXCODE:
248+
emit(lo)
249+
else:
250+
emit(MAXCODE)
251+
prefix = prefix[:MAXCODE]
252+
if hi < MAXCODE:
183253
emit(hi)
184254
else:
185255
emit(0)
186256
# add literal prefix
187-
emit(len(prefix))
188257
if prefix:
189-
code.extend(prefix)
190-
# generate overlap table
191-
table = [-1] + ([0]*len(prefix))
192-
for i in range(len(prefix)):
193-
table[i+1] = table[i]+1
194-
while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
195-
table[i+1] = table[table[i+1]-1]+1
196-
code.extend(table[1:]) # don't store first entry
258+
emit(len(prefix))
259+
if prefix:
260+
code.extend(prefix)
261+
# generate overlap table
262+
table = [-1] + ([0]*len(prefix))
263+
for i in range(len(prefix)):
264+
table[i+1] = table[i]+1
265+
while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
266+
table[i+1] = table[table[i+1]-1]+1
267+
code.extend(table[1:]) # don't store first entry
268+
elif charset:
269+
for char in charset:
270+
emit(OPCODES[LITERAL])
271+
emit(char)
272+
emit(OPCODES[FAILURE])
197273
code[skip] = len(code) - skip
198274

199275
def compile(p, flags=0):

Lib/sre_constants.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class error(Exception):
2828
BRANCH = "branch"
2929
CALL = "call"
3030
CATEGORY = "category"
31+
CHARSET = "charset"
3132
GROUP = "group"
3233
GROUP_IGNORE = "group_ignore"
3334
IN = "in"
@@ -87,6 +88,7 @@ class error(Exception):
8788
BRANCH,
8889
CALL,
8990
CATEGORY,
91+
CHARSET,
9092
GROUP, GROUP_IGNORE,
9193
IN, IN_IGNORE,
9294
INFO,
@@ -166,13 +168,18 @@ def makedict(list):
166168
}
167169

168170
# flags
169-
SRE_FLAG_TEMPLATE = 1
170-
SRE_FLAG_IGNORECASE = 2
171-
SRE_FLAG_LOCALE = 4
172-
SRE_FLAG_MULTILINE = 8
173-
SRE_FLAG_DOTALL = 16
174-
SRE_FLAG_UNICODE = 32
175-
SRE_FLAG_VERBOSE = 64
171+
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
172+
SRE_FLAG_IGNORECASE = 2 # case insensitive
173+
SRE_FLAG_LOCALE = 4 # honour system locale
174+
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
175+
SRE_FLAG_DOTALL = 16 # treat target as a single string
176+
SRE_FLAG_UNICODE = 32 # use unicode locale
177+
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
178+
179+
# flags for INFO primitive
180+
SRE_INFO_PREFIX = 1 # has prefix
181+
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
182+
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
176183

177184
if __name__ == "__main__":
178185
import string
@@ -201,12 +208,18 @@ def dump(f, d, prefix):
201208
dump(f, OPCODES, "SRE_OP")
202209
dump(f, ATCODES, "SRE")
203210
dump(f, CHCODES, "SRE")
211+
204212
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
205213
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
206214
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
207215
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
208216
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
209217
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
210218
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
219+
220+
f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
221+
f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
222+
f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
223+
211224
f.close()
212225
print "done"

Lib/sre_parse.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,10 @@
1616

1717
from sre_constants import *
1818

19-
# FIXME: should be 65535, but the arraymodule is still broken
20-
MAXREPEAT = 32767
19+
MAXREPEAT = 65535
2120

22-
# FIXME: might change in 2.0 final. but for now, this seems
23-
# to be the best way to be compatible with 1.5.2
21+
# FIXME: the following might change in 2.0 final. but for now, this
22+
# seems to be the best way to be compatible with 1.5.2
2423
CHARMASK = 0xff
2524

2625
SPECIAL_CHARS = ".\\[{()*+?^$|"

Modules/_sre.c

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,13 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
378378
set += 2;
379379
break;
380380

381+
case SRE_OP_CHARSET:
382+
/* args: <bitmap> (16 bits per code word) */
383+
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
384+
return ok;
385+
set += 16;
386+
break;
387+
381388
case SRE_OP_CATEGORY:
382389
/* args: <category> */
383390
if (sre_category(set[0], (int) ch))
@@ -952,35 +959,38 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
952959
SRE_CHAR* ptr = state->start;
953960
SRE_CHAR* end = state->end;
954961
int status = 0;
955-
int prefix_len = 0;
956-
SRE_CODE* prefix;
957-
SRE_CODE* overlap;
958-
int literal = 0;
962+
int prefix_len;
963+
SRE_CODE* prefix = NULL;
964+
SRE_CODE* charset = NULL;
965+
SRE_CODE* overlap = NULL;
966+
int flags = 0;
959967

960968
if (pattern[0] == SRE_OP_INFO) {
961969
/* optimization info block */
962-
/* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix> <6=data...> */
970+
/* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
971+
972+
flags = pattern[2];
963973

964974
if (pattern[3] > 0) {
965975
/* adjust end point (but make sure we leave at least one
966-
character in there) */
976+
character in there, so literal search will work) */
967977
end -= pattern[3]-1;
968978
if (end <= ptr)
969979
end = ptr+1;
970980
}
971981

972-
literal = pattern[2];
973-
974-
prefix = pattern + 6;
975-
prefix_len = pattern[5];
976-
977-
overlap = prefix + prefix_len - 1;
982+
if (flags & SRE_INFO_PREFIX) {
983+
prefix_len = pattern[5];
984+
prefix = pattern + 6;
985+
overlap = prefix + prefix_len - 1;
986+
} else if (flags & SRE_INFO_CHARSET)
987+
charset = pattern + 5;
978988

979989
pattern += 1 + pattern[1];
980990
}
981991

982992
#if defined(USE_FAST_SEARCH)
983-
if (prefix_len > 1) {
993+
if (prefix && overlap && prefix_len > 1) {
984994
/* pattern starts with a known prefix. use the overlap
985995
table to skip forward as fast as we possibly can */
986996
int i = 0;
@@ -998,8 +1008,8 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
9981008
TRACE(("%8d: === SEARCH === hit\n", PTR(ptr)));
9991009
state->start = ptr - prefix_len + 1;
10001010
state->ptr = ptr + 1;
1001-
if (literal)
1002-
return 1; /* all of it */
1011+
if (flags & SRE_INFO_LITERAL)
1012+
return 1; /* we got all of it */
10031013
status = SRE_MATCH(state, pattern + 2*prefix_len);
10041014
if (status != 0)
10051015
return status;
@@ -1016,9 +1026,9 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
10161026
}
10171027
#endif
10181028

1019-
if (pattern[0] == SRE_OP_LITERAL) {
1020-
/* pattern starts with a literal character. this is used for
1021-
short prefixes, and if fast search is disabled*/
1029+
if (pattern[0] == SRE_OP_LITERAL) {
1030+
/* pattern starts with a literal character. this is used
1031+
for short prefixes, and if fast search is disabled */
10221032
SRE_CODE chr = pattern[1];
10231033
for (;;) {
10241034
while (ptr < end && (SRE_CODE) ptr[0] != chr)
@@ -1032,6 +1042,22 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
10321042
if (status != 0)
10331043
break;
10341044
}
1045+
#if 0
1046+
} else if (charset) {
1047+
/* pattern starts with a character from a known set */
1048+
for (;;) {
1049+
while (ptr < end && !SRE_MEMBER(charset, ptr[0]))
1050+
ptr++;
1051+
if (ptr == end)
1052+
return 0;
1053+
TRACE(("%8d: === SEARCH === charset\n", PTR(ptr)));
1054+
state->start = ptr;
1055+
state->ptr = ptr;
1056+
status = SRE_MATCH(state, pattern);
1057+
if (status != 0)
1058+
break;
1059+
}
1060+
#endif
10351061
} else
10361062
/* general case */
10371063
while (ptr <= end) {
@@ -1044,6 +1070,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
10441070

10451071
return status;
10461072
}
1073+
10471074

10481075
#if !defined(SRE_RECURSIVE)
10491076

0 commit comments

Comments
 (0)