1616
1717assert _sre .MAGIC == MAGIC , "SRE module mismatch"
1818
19- MAXCODE = 65535
19+ if _sre .CODESIZE == 2 :
20+ MAXCODE = 65535
21+ else :
22+ MAXCODE = 0xFFFFFFFFL
2023
2124def _compile (code , pattern , flags ):
2225 # internal: compile a (sub)pattern
@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
191194 # XXX: could append to charmap tail
192195 return charset # cannot compress
193196 except IndexError :
194- if sys .maxunicode != 65535 :
195- # XXX: big charsets don't work in UCS-4 builds
196- return charset
197197 # character set contains unicode characters
198198 return _optimize_unicode (charset , fixup )
199199 # compress character map
@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
228228
229229def _mk_bitmap (bits ):
230230 data = []
231- m = 1 ; v = 0
231+ if _sre .CODESIZE == 2 :
232+ start = (1 , 0 )
233+ else :
234+ start = (1L , 0L )
235+ m , v = start
232236 for c in bits :
233237 if c :
234238 v = v + m
235239 m = m << 1
236240 if m > MAXCODE :
237241 data .append (v )
238- m = 1 ; v = 0
242+ m , v = start
239243 return data
240244
241245# To represent a big charset, first a bitmap of all characters in the
@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
258262# less significant byte is a bit index in the chunk (just like the
259263# CHARSET matching).
260264
265+ # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
266+ # of the basic multilingual plane (code points 0..0xFFFF); an efficient
267+ # representation for the full Unicode range has not yet been developed.
268+ # This means, in particular, that negated charsets cannot be represented
269+ # as bigcharsets: their complement would include non-BMP characters.
270+
261271def _optimize_unicode (charset , fixup ):
272+ try :
273+ import array
274+ except ImportError :
275+ return charset
262276 charmap = [0 ]* 65536
263277 negate = 0
264- for op , av in charset :
265- if op is NEGATE :
266- negate = 1
267- elif op is LITERAL :
268- charmap [fixup (av )] = 1
269- elif op is RANGE :
270- for i in range (fixup (av [0 ]), fixup (av [1 ])+ 1 ):
271- charmap [i ] = 1
272- elif op is CATEGORY :
273- # XXX: could expand category
274- return charset # cannot compress
278+ try :
279+ for op , av in charset :
280+ if op is NEGATE :
281+ negate = 1
282+ elif op is LITERAL :
283+ charmap [fixup (av )] = 1
284+ elif op is RANGE :
285+ for i in range (fixup (av [0 ]), fixup (av [1 ])+ 1 ):
286+ charmap [i ] = 1
287+ elif op is CATEGORY :
288+ # XXX: could expand category
289+ return charset # cannot compress
290+ except IndexError :
291+ # non-BMP characters
292+ return charset
275293 if negate :
294+ if sys .maxunicode != 65535 :
295+ # XXX: negation does not work with big charsets in UCS-4 builds
296+ return charset
276297 for i in range (65536 ):
277298 charmap [i ] = not charmap [i ]
278299 comps = {}
@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
287308 block = block + 1
288309 data = data + _mk_bitmap (chunk )
289310 header = [block ]
290- assert MAXCODE == 65535
291- for i in range (128 ):
292- if sys .byteorder == 'big' :
293- header .append (256 * mapping [2 * i ]+ mapping [2 * i + 1 ])
294- else :
295- header .append (mapping [2 * i ]+ 256 * mapping [2 * i + 1 ])
311+ if MAXCODE == 65535 :
312+ code = 'H'
313+ else :
314+ code = 'L'
315+ # Convert block indices to byte array of 256 bytes
316+ mapping = array .array ('b' , mapping ).tostring ()
317+ # Convert byte array to word array
318+ header = header + array .array (code , mapping ).tolist ()
296319 data [0 :0 ] = header
297320 return [(BIGCHARSET , data )]
298321
0 commit comments