Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 78e2f06

Browse files
committed
Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.
1 parent 53d93ad commit 78e2f06

4 files changed

Lines changed: 90 additions & 35 deletions

File tree

Lib/sre_compile.py

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616

1717
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
1818

19-
MAXCODE = 65535
19+
if _sre.CODESIZE == 2:
20+
MAXCODE = 65535
21+
else:
22+
MAXCODE = 0xFFFFFFFFL
2023

2124
def _compile(code, pattern, flags):
2225
# internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
191194
# XXX: could append to charmap tail
192195
return charset # cannot compress
193196
except IndexError:
194-
if sys.maxunicode != 65535:
195-
# XXX: big charsets don't work in UCS-4 builds
196-
return charset
197197
# character set contains unicode characters
198198
return _optimize_unicode(charset, fixup)
199199
# compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
228228

229229
def _mk_bitmap(bits):
230230
data = []
231-
m = 1; v = 0
231+
if _sre.CODESIZE == 2:
232+
start = (1, 0)
233+
else:
234+
start = (1L, 0L)
235+
m, v = start
232236
for c in bits:
233237
if c:
234238
v = v + m
235239
m = m << 1
236240
if m > MAXCODE:
237241
data.append(v)
238-
m = 1; v = 0
242+
m, v = start
239243
return data
240244

241245
# To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
258262
# less significant byte is a bit index in the chunk (just like the
259263
# CHARSET matching).
260264

265+
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
266+
# of the basic multilingual plane; an efficient representation
267+
# for the full Unicode range has not yet been developed. This means,
268+
# in particular, that negated charsets cannot be represented as
269+
# bigcharsets.
270+
261271
def _optimize_unicode(charset, fixup):
272+
try:
273+
import array
274+
except ImportError:
275+
return charset
262276
charmap = [0]*65536
263277
negate = 0
264-
for op, av in charset:
265-
if op is NEGATE:
266-
negate = 1
267-
elif op is LITERAL:
268-
charmap[fixup(av)] = 1
269-
elif op is RANGE:
270-
for i in range(fixup(av[0]), fixup(av[1])+1):
271-
charmap[i] = 1
272-
elif op is CATEGORY:
273-
# XXX: could expand category
274-
return charset # cannot compress
278+
try:
279+
for op, av in charset:
280+
if op is NEGATE:
281+
negate = 1
282+
elif op is LITERAL:
283+
charmap[fixup(av)] = 1
284+
elif op is RANGE:
285+
for i in range(fixup(av[0]), fixup(av[1])+1):
286+
charmap[i] = 1
287+
elif op is CATEGORY:
288+
# XXX: could expand category
289+
return charset # cannot compress
290+
except IndexError:
291+
# non-BMP characters
292+
return charset
275293
if negate:
294+
if sys.maxunicode != 65535:
295+
# XXX: negation does not work with big charsets
296+
return charset
276297
for i in range(65536):
277298
charmap[i] = not charmap[i]
278299
comps = {}
@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
287308
block = block + 1
288309
data = data + _mk_bitmap(chunk)
289310
header = [block]
290-
assert MAXCODE == 65535
291-
for i in range(128):
292-
if sys.byteorder == 'big':
293-
header.append(256*mapping[2*i]+mapping[2*i+1])
294-
else:
295-
header.append(mapping[2*i]+256*mapping[2*i+1])
311+
if MAXCODE == 65535:
312+
code = 'H'
313+
else:
314+
code = 'L'
315+
# Convert block indices to byte array of 256 bytes
316+
mapping = array.array('b', mapping).tostring()
317+
# Convert byte array to word array
318+
header = header + array.array(code, mapping).tolist()
296319
data[0:0] = header
297320
return [(BIGCHARSET, data)]
298321

Lib/sre_constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20010701
16+
MAGIC = 20030419
1717

1818
# max code word in this release
1919

Modules/_sre.c

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
* 2001-10-24 fl added finditer primitive (for 2.2 only)
2121
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
2222
* 2002-11-09 fl fixed empty sub/subn return type
23+
* 2003-04-18 mvl fully support 4-byte codes
2324
*
2425
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
2526
*
@@ -510,22 +511,44 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
510511
break;
511512

512513
case SRE_OP_CHARSET:
513-
/* <CHARSET> <bitmap> (16 bits per code word) */
514-
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
515-
return ok;
516-
set += 16;
514+
if (sizeof(SRE_CODE) == 2) {
515+
/* <CHARSET> <bitmap> (16 bits per code word) */
516+
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
517+
return ok;
518+
set += 16;
519+
}
520+
else {
521+
/* <CHARSET> <bitmap> (32 bits per code word) */
522+
if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
523+
return ok;
524+
set += 8;
525+
}
517526
break;
518527

519528
case SRE_OP_BIGCHARSET:
520529
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
521530
{
522531
int count, block;
523532
count = *(set++);
524-
block = ((unsigned char*)set)[ch >> 8];
525-
set += 128;
526-
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
527-
return ok;
528-
set += count*16;
533+
534+
if (sizeof(SRE_CODE) == 2) {
535+
block = ((unsigned char*)set)[ch >> 8];
536+
set += 128;
537+
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
538+
return ok;
539+
set += count*16;
540+
}
541+
else {
542+
if (ch < 65536)
543+
block = ((unsigned char*)set)[ch >> 8];
544+
else
545+
block = -1;
546+
set += 64;
547+
if (block >=0 &&
548+
(set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
549+
return ok;
550+
set += count*8;
551+
}
529552
break;
530553
}
531554

@@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args)
13711394

13721395
for (i = 0; i < n; i++) {
13731396
PyObject *o = PyList_GET_ITEM(code, i);
1374-
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1397+
if (PyInt_Check(o))
1398+
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
1399+
else
1400+
self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
13751401
}
13761402

13771403
if (PyErr_Occurred()) {
@@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void)
30453071
Py_DECREF(x);
30463072
}
30473073

3074+
x = PyInt_FromLong(sizeof(SRE_CODE));
3075+
if (x) {
3076+
PyDict_SetItemString(d, "CODESIZE", x);
3077+
Py_DECREF(x);
3078+
}
3079+
30483080
x = PyString_FromString(copyright);
30493081
if (x) {
30503082
PyDict_SetItemString(d, "copyright", x);

Modules/sre_constants.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* See the _sre.c file for information on usage and redistribution.
1212
*/
1313

14-
#define SRE_MAGIC 20010701
14+
#define SRE_MAGIC 20030419
1515
#define SRE_OP_FAILURE 0
1616
#define SRE_OP_SUCCESS 1
1717
#define SRE_OP_ANY 2

0 commit comments

Comments
 (0)