2222else :
2323 MAXCODE = 0xFFFFFFFF
2424
25- def _identityfunction (x ):
26- return x
27-
2825_LITERAL_CODES = set ([LITERAL , NOT_LITERAL ])
2926_REPEATING_CODES = set ([REPEAT , MIN_REPEAT , MAX_REPEAT ])
3027_SUCCESS_CODES = set ([SUCCESS , FAILURE ])
@@ -53,7 +50,7 @@ def fixup(literal, flags=flags):
5350 return _sre .getlower (literal , flags )
5451 else :
5552 emit (OPCODES [op ])
56- fixup = _identityfunction
53+ fixup = None
5754 skip = _len (code ); emit (0 )
5855 _compile_charset (av , flags , code , fixup )
5956 code [skip ] = _len (code ) - skip
@@ -172,17 +169,15 @@ def fixup(literal, flags=flags):
172169def _compile_charset (charset , flags , code , fixup = None ):
173170 # compile charset subprogram
174171 emit = code .append
175- if fixup is None :
176- fixup = _identityfunction
177- for op , av in _optimize_charset (charset , fixup ):
172+ for op , av in _optimize_charset (charset , fixup , flags & SRE_FLAG_UNICODE ):
178173 emit (OPCODES [op ])
179174 if op is NEGATE :
180175 pass
181176 elif op is LITERAL :
182- emit (fixup ( av ) )
177+ emit (av )
183178 elif op is RANGE :
184- emit (fixup ( av [0 ]) )
185- emit (fixup ( av [1 ]) )
179+ emit (av [0 ])
180+ emit (av [1 ])
186181 elif op is CHARSET :
187182 code .extend (av )
188183 elif op is BIGCHARSET :
@@ -198,7 +193,7 @@ def _compile_charset(charset, flags, code, fixup=None):
198193 raise error ("internal: unsupported set operator" )
199194 emit (OPCODES [FAILURE ])
200195
201- def _optimize_charset (charset , fixup ):
196+ def _optimize_charset (charset , fixup , isunicode ):
202197 # internal: optimize character set
203198 out = []
204199 tail = []
@@ -207,9 +202,15 @@ def _optimize_charset(charset, fixup):
207202 while True :
208203 try :
209204 if op is LITERAL :
210- charmap [fixup (av )] = 1
205+ i = av
206+ if fixup :
207+ i = fixup (i )
208+ charmap [i ] = 1
211209 elif op is RANGE :
212- for i in range (fixup (av [0 ]), fixup (av [1 ])+ 1 ):
210+ r = range (av [0 ], av [1 ]+ 1 )
211+ if fixup :
212+ r = map (fixup , r )
213+ for i in r :
213214 charmap [i ] = 1
214215 elif op is NEGATE :
215216 out .append ((op , av ))
@@ -221,7 +222,20 @@ def _optimize_charset(charset, fixup):
221222 charmap += b'\0 ' * 0xff00
222223 continue
223224 # character set contains non-BMP character codes
224- tail .append ((op , av ))
225+ if fixup and isunicode and op is RANGE :
226+ lo , hi = av
227+ ranges = [av ]
228+ # There are only two ranges of cased astral characters:
229+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
230+ _fixup_range (max (0x10000 , lo ), min (0x11fff , hi ),
231+ ranges , fixup )
232+ for lo , hi in ranges :
233+ if lo == hi :
234+ tail .append ((LITERAL , hi ))
235+ else :
236+ tail .append ((RANGE , (lo , hi )))
237+ else :
238+ tail .append ((op , av ))
225239 break
226240
227241 # compress character map
@@ -247,8 +261,10 @@ def _optimize_charset(charset, fixup):
247261 else :
248262 out .append ((RANGE , (p , q - 1 )))
249263 out += tail
250- if len (out ) < len (charset ):
264+ # if the case was changed or new representation is more compact
265+ if fixup or len (out ) < len (charset ):
251266 return out
267+ # else original character set is good enough
252268 return charset
253269
254270 # use bitmap
@@ -297,6 +313,24 @@ def _optimize_charset(charset, fixup):
297313 out += tail
298314 return out
299315
316+ def _fixup_range (lo , hi , ranges , fixup ):
317+ for i in map (fixup , range (lo , hi + 1 )):
318+ for k , (lo , hi ) in enumerate (ranges ):
319+ if i < lo :
320+ if l == lo - 1 :
321+ ranges [k ] = (i , hi )
322+ else :
323+ ranges .insert (k , (i , i ))
324+ break
325+ elif i > hi :
326+ if i == hi + 1 :
327+ ranges [k ] = (lo , i )
328+ break
329+ else :
330+ break
331+ else :
332+ ranges .append ((i , i ))
333+
300334_CODEBITS = _sre .CODESIZE * 8
301335_BITS_TRANS = b'0' + b'1' * 255
302336def _mk_bitmap (bits , _CODEBITS = _CODEBITS , _int = int ):
0 commit comments