Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b1847e7

Browse files
Issue #17381: Fixed handling of case-insensitive ranges in regular expressions.
1 parent 36ac510 commit b1847e7

3 files changed

Lines changed: 71 additions & 15 deletions

File tree

Lib/sre_compile.py

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,6 @@
2222
else:
2323
MAXCODE = 0xFFFFFFFF
2424

25-
def _identityfunction(x):
26-
return x
27-
2825
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
2926
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
3027
_SUCCESS_CODES = set([SUCCESS, FAILURE])
@@ -53,7 +50,7 @@ def fixup(literal, flags=flags):
5350
return _sre.getlower(literal, flags)
5451
else:
5552
emit(OPCODES[op])
56-
fixup = _identityfunction
53+
fixup = None
5754
skip = _len(code); emit(0)
5855
_compile_charset(av, flags, code, fixup)
5956
code[skip] = _len(code) - skip
@@ -172,17 +169,15 @@ def fixup(literal, flags=flags):
172169
def _compile_charset(charset, flags, code, fixup=None):
173170
# compile charset subprogram
174171
emit = code.append
175-
if fixup is None:
176-
fixup = _identityfunction
177-
for op, av in _optimize_charset(charset, fixup):
172+
for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE):
178173
emit(OPCODES[op])
179174
if op is NEGATE:
180175
pass
181176
elif op is LITERAL:
182-
emit(fixup(av))
177+
emit(av)
183178
elif op is RANGE:
184-
emit(fixup(av[0]))
185-
emit(fixup(av[1]))
179+
emit(av[0])
180+
emit(av[1])
186181
elif op is CHARSET:
187182
code.extend(av)
188183
elif op is BIGCHARSET:
@@ -198,7 +193,7 @@ def _compile_charset(charset, flags, code, fixup=None):
198193
raise error("internal: unsupported set operator")
199194
emit(OPCODES[FAILURE])
200195

201-
def _optimize_charset(charset, fixup):
196+
def _optimize_charset(charset, fixup, isunicode):
202197
# internal: optimize character set
203198
out = []
204199
tail = []
@@ -207,9 +202,15 @@ def _optimize_charset(charset, fixup):
207202
while True:
208203
try:
209204
if op is LITERAL:
210-
charmap[fixup(av)] = 1
205+
i = av
206+
if fixup:
207+
i = fixup(i)
208+
charmap[i] = 1
211209
elif op is RANGE:
212-
for i in range(fixup(av[0]), fixup(av[1])+1):
210+
r = range(av[0], av[1]+1)
211+
if fixup:
212+
r = map(fixup, r)
213+
for i in r:
213214
charmap[i] = 1
214215
elif op is NEGATE:
215216
out.append((op, av))
@@ -221,7 +222,20 @@ def _optimize_charset(charset, fixup):
221222
charmap += b'\0' * 0xff00
222223
continue
223224
# character set contains non-BMP character codes
224-
tail.append((op, av))
225+
if fixup and isunicode and op is RANGE:
226+
lo, hi = av
227+
ranges = [av]
228+
# There are only two ranges of cased astral characters:
229+
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
230+
_fixup_range(max(0x10000, lo), min(0x11fff, hi),
231+
ranges, fixup)
232+
for lo, hi in ranges:
233+
if lo == hi:
234+
tail.append((LITERAL, hi))
235+
else:
236+
tail.append((RANGE, (lo, hi)))
237+
else:
238+
tail.append((op, av))
225239
break
226240

227241
# compress character map
@@ -247,8 +261,10 @@ def _optimize_charset(charset, fixup):
247261
else:
248262
out.append((RANGE, (p, q - 1)))
249263
out += tail
250-
if len(out) < len(charset):
264+
# if the case was changed or new representation is more compact
265+
if fixup or len(out) < len(charset):
251266
return out
267+
# else original character set is good enough
252268
return charset
253269

254270
# use bitmap
@@ -297,6 +313,24 @@ def _optimize_charset(charset, fixup):
297313
out += tail
298314
return out
299315

316+
def _fixup_range(lo, hi, ranges, fixup):
317+
for i in map(fixup, range(lo, hi+1)):
318+
for k, (lo, hi) in enumerate(ranges):
319+
if i < lo:
320+
if l == lo - 1:
321+
ranges[k] = (i, hi)
322+
else:
323+
ranges.insert(k, (i, i))
324+
break
325+
elif i > hi:
326+
if i == hi + 1:
327+
ranges[k] = (lo, i)
328+
break
329+
else:
330+
break
331+
else:
332+
ranges.append((i, i))
333+
300334
_CODEBITS = _sre.CODESIZE * 8
301335
_BITS_TRANS = b'0' + b'1' * 255
302336
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):

Lib/test/test_re.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,25 @@ def test_ignore_case(self):
583583
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
584584
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
585585

586+
def test_ignore_case_range(self):
587+
# Issues #3511, #17381.
588+
self.assertTrue(re.match(r'[9-a]', '_', re.I))
589+
self.assertIsNone(re.match(r'[9-A]', '_', re.I))
590+
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
591+
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
592+
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
593+
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
594+
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
595+
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
596+
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
597+
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
598+
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
599+
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
600+
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
601+
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
602+
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
603+
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
604+
586605
def test_category(self):
587606
self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
588607

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ Core and Builtins
3333
Library
3434
-------
3535

36+
- Issue #17381: Fixed handling of case-insensitive ranges in regular
37+
expressions.
38+
3639
- Issue #22410: Module level functions in the re module now cache compiled
3740
locale-dependent regular expressions taking into account the locale.
3841

0 commit comments

Comments
 (0)