Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 19af43d

Browse files
author
Fredrik Lundh
committed
added martin's BIGCHARSET patch to SRE 2.1.1. martin reports 2x
speedups for certain unicode character ranges.
1 parent 1fb5ce0 commit 19af43d

4 files changed

Lines changed: 107 additions & 31 deletions

File tree

Lib/sre_compile.py

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,8 @@ def _compile_charset(charset, flags, code, fixup=None):
156156
emit(fixup(av[1]))
157157
elif op is CHARSET:
158158
code.extend(av)
159+
elif op is BIGCHARSET:
160+
code.extend(av)
159161
elif op is CATEGORY:
160162
if flags & SRE_FLAG_LOCALE:
161163
emit(CHCODES[CH_LOCALE[av]])
@@ -185,7 +187,7 @@ def _optimize_charset(charset, fixup):
185187
return charset # cannot compress
186188
except IndexError:
187189
# character set contains unicode characters
188-
return charset
190+
return _optimize_unicode(charset, fixup)
189191
# compress character map
190192
i = p = n = 0
191193
runs = []
@@ -211,19 +213,78 @@ def _optimize_charset(charset, fixup):
211213
return out
212214
else:
213215
# use bitmap
214-
data = []
215-
m = 1; v = 0
216-
for c in charmap:
217-
if c:
218-
v = v + m
219-
m = m << 1
220-
if m > MAXCODE:
221-
data.append(v)
222-
m = 1; v = 0
216+
data = _mk_bitmap(charmap)
223217
out.append((CHARSET, data))
224218
return out
225219
return charset
226220

221+
def _mk_bitmap(bits):
    # Pack a sequence of truth values into a list of code words.  The
    # first item of each group lands in the least significant bit.  A
    # word is emitted every time the bit mask grows past MAXCODE; items
    # left over in an unfinished word are not emitted.
    words = []
    mask = 1
    word = 0
    for flag in bits:
        if flag:
            # this bit is still clear in word, so OR equals addition
            word |= mask
        mask <<= 1
        if mask > MAXCODE:
            words.append(word)
            mask = 1
            word = 0
    return words
232+
233+
# To represent a big charset, first a bitmap of all characters in the
234+
# set is constructed. Then, this bitmap is sliced into chunks of 256
235+
# characters, duplicate chunks are eliminated, and each chunk is
236+
# given a number. In the compiled expression, the charset is
237+
# represented by a 16-bit word sequence, consisting of one word for
238+
# the number of different chunks, a sequence of 256 bytes (128 words)
239+
# of chunk numbers indexed by their original chunk position, and a
240+
# sequence of chunks (16 words each).
241+
242+
# Compression is normally good: in a typical charset, large ranges of
243+
# Unicode will be either completely excluded (e.g. if only cyrillic
244+
# letters are to be matched), or completely included (e.g. if large
245+
# subranges of Kanji match). These ranges will be represented by
246+
# chunks of all one-bits or all zero-bits.
247+
248+
# Matching can also be done efficiently: the more significant byte of
249+
# the Unicode character is an index into the chunk number, and the
250+
# less significant byte is a bit index in the chunk (just like the
251+
# CHARSET matching).
252+
253+
def _optimize_unicode(charset, fixup):
    # Compress a charset that contains characters outside the 8-bit
    # range into a single BIGCHARSET item.
    #
    # charset is a sequence of (op, av) pairs.  fixup is applied to
    # every literal and to both bounds of every range before the value
    # is used as an index into the character map.
    #
    # Returns [(BIGCHARSET, data)], where data is: the number of
    # distinct 256-character chunks, 128 words holding the 256 one-byte
    # chunk numbers, and then the bitmap of each distinct chunk.  The
    # charset is returned unchanged when it cannot be compressed
    # (category items, character codes above 0xFFFF, or code words that
    # are not 16 bits wide).
    import sys  # local import: only needed for the byte order check
    if MAXCODE != 65535:
        # The header layout below assumes 16-bit code words.  Checked
        # explicitly because an assert would be stripped under -O.
        return charset
    charmap = [0]*65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # a character code above 0xFFFF does not fit the 65536-entry
        # map; leave the charset uncompressed instead of failing
        return charset
    if negate:
        charmap = [not bit for bit in charmap]
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in range(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: it gets the next number
            # and its bitmap is appended to the chunk table
            block += 1
            data += _mk_bitmap(chunk)
    header = [block]
    # The matcher indexes the chunk numbers as consecutive bytes of the
    # code words, so each pair has to be packed in the byte order of the
    # host: the first chunk number must end up in the first byte.
    big_endian = sys.byteorder == "big"
    for i in range(0, 256, 2):
        first = mapping[i]
        second = mapping[i+1]
        if big_endian:
            header.append(256*first + second)
        else:
            header.append(first + 256*second)
    data[0:0] = header
    return [(BIGCHARSET, data)]
287+
227288
def _simple(av):
228289
# check if av is a "simple" operator
229290
lo, hi = av[2].getwidth()

Lib/sre_constants.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
# update when constants are added or removed
1313

14-
MAGIC = 20010320
14+
MAGIC = 20010701
1515

1616
# max code word in this release
1717

@@ -33,6 +33,7 @@ class error(Exception):
3333
ASSERT = "assert"
3434
ASSERT_NOT = "assert_not"
3535
AT = "at"
36+
BIGCHARSET = "bigcharset"
3637
BRANCH = "branch"
3738
CALL = "call"
3839
CATEGORY = "category"
@@ -103,7 +104,7 @@ class error(Exception):
103104
BRANCH,
104105
CALL,
105106
CATEGORY,
106-
CHARSET,
107+
CHARSET, BIGCHARSET,
107108
GROUPREF, GROUPREF_IGNORE,
108109
IN, IN_IGNORE,
109110
INFO,

Modules/_sre.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,19 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
506506
set += 16;
507507
break;
508508

509+
case SRE_OP_BIGCHARSET:
510+
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
511+
{
512+
int count, block;
513+
count = *(set++);
514+
block = ((unsigned char*)set)[ch >> 8];
515+
set += 128;
516+
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
517+
return ok;
518+
set += count*16;
519+
break;
520+
}
521+
509522
case SRE_OP_CATEGORY:
510523
/* <CATEGORY> <code> */
511524
if (sre_category(set[0], (int) ch))

Modules/sre_constants.h

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* See the _sre.c file for information on usage and redistribution.
1212
*/
1313

14-
#define SRE_MAGIC 20010320
14+
#define SRE_MAGIC 20010701
1515
#define SRE_OP_FAILURE 0
1616
#define SRE_OP_SUCCESS 1
1717
#define SRE_OP_ANY 2
@@ -23,24 +23,25 @@
2323
#define SRE_OP_CALL 8
2424
#define SRE_OP_CATEGORY 9
2525
#define SRE_OP_CHARSET 10
26-
#define SRE_OP_GROUPREF 11
27-
#define SRE_OP_GROUPREF_IGNORE 12
28-
#define SRE_OP_IN 13
29-
#define SRE_OP_IN_IGNORE 14
30-
#define SRE_OP_INFO 15
31-
#define SRE_OP_JUMP 16
32-
#define SRE_OP_LITERAL 17
33-
#define SRE_OP_LITERAL_IGNORE 18
34-
#define SRE_OP_MARK 19
35-
#define SRE_OP_MAX_UNTIL 20
36-
#define SRE_OP_MIN_UNTIL 21
37-
#define SRE_OP_NOT_LITERAL 22
38-
#define SRE_OP_NOT_LITERAL_IGNORE 23
39-
#define SRE_OP_NEGATE 24
40-
#define SRE_OP_RANGE 25
41-
#define SRE_OP_REPEAT 26
42-
#define SRE_OP_REPEAT_ONE 27
43-
#define SRE_OP_SUBPATTERN 28
26+
#define SRE_OP_BIGCHARSET 11
27+
#define SRE_OP_GROUPREF 12
28+
#define SRE_OP_GROUPREF_IGNORE 13
29+
#define SRE_OP_IN 14
30+
#define SRE_OP_IN_IGNORE 15
31+
#define SRE_OP_INFO 16
32+
#define SRE_OP_JUMP 17
33+
#define SRE_OP_LITERAL 18
34+
#define SRE_OP_LITERAL_IGNORE 19
35+
#define SRE_OP_MARK 20
36+
#define SRE_OP_MAX_UNTIL 21
37+
#define SRE_OP_MIN_UNTIL 22
38+
#define SRE_OP_NOT_LITERAL 23
39+
#define SRE_OP_NOT_LITERAL_IGNORE 24
40+
#define SRE_OP_NEGATE 25
41+
#define SRE_OP_RANGE 26
42+
#define SRE_OP_REPEAT 27
43+
#define SRE_OP_REPEAT_ONE 28
44+
#define SRE_OP_SUBPATTERN 29
4445
#define SRE_AT_BEGINNING 0
4546
#define SRE_AT_BEGINNING_LINE 1
4647
#define SRE_AT_BEGINNING_STRING 2

0 commit comments

Comments
 (0)