Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 463badf

Browse files
committed
Issue #3665: \u and \U escapes are now supported in unicode regular expressions.
Patch by Serhiy Storchaka.
1 parent c9aa842 commit 463badf

4 files changed

Lines changed: 144 additions & 34 deletions

File tree

Doc/library/re.rst

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -414,17 +414,24 @@ Most of the standard escapes supported by Python string literals are also
414414
accepted by the regular expression parser::
415415

416416
\a \b \f \n
417-
\r \t \v \x
418-
\\
417+
\r \t \u \U
418+
\v \x \\
419419

420420
(Note that ``\b`` is used to represent word boundaries, and means "backspace"
421421
only inside character classes.)
422422

423+
``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode
424+
patterns. In bytes patterns they are not treated specially.
425+
423426
Octal escapes are included in a limited form. If the first digit is a 0, or if
424427
there are three octal digits, it is considered an octal escape. Otherwise, it is
425428
a group reference. As for string literals, octal escapes are always at most
426429
three digits in length.
427430

431+
.. versionchanged:: 3.3
432+
The ``'\u'`` and ``'\U'`` escape sequences have been added.
433+
434+
428435

429436
.. _contents-of-module-re:
430437

Lib/sre_parse.py

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ def getwidth(self):
177177

178178
class Tokenizer:
179179
def __init__(self, string):
180+
self.istext = isinstance(string, str)
180181
self.string = string
181182
self.index = 0
182183
self.__next()
@@ -187,14 +188,14 @@ def __next(self):
187188
char = self.string[self.index:self.index+1]
188189
# Special case for the str8, since indexing returns an integer
189190
# XXX This is only needed for test_bug_926075 in test_re.py
190-
if char and isinstance(char, bytes):
191+
if char and not self.istext:
191192
char = chr(char[0])
192193
if char == "\\":
193194
try:
194195
c = self.string[self.index + 1]
195196
except IndexError:
196197
raise error("bogus escape (end of line)")
197-
if isinstance(self.string, bytes):
198+
if not self.istext:
198199
c = chr(c)
199200
char = char + c
200201
self.index = self.index + len(char)
@@ -209,6 +210,15 @@ def get(self):
209210
this = self.next
210211
self.__next()
211212
return this
213+
def getwhile(self, n, charset):
214+
result = ''
215+
for _ in range(n):
216+
c = self.next
217+
if c not in charset:
218+
break
219+
result += c
220+
self.__next()
221+
return result
212222
def tell(self):
213223
return self.index, self.next
214224
def seek(self, index):
@@ -241,20 +251,30 @@ def _class_escape(source, escape):
241251
c = escape[1:2]
242252
if c == "x":
243253
# hexadecimal escape (exactly two digits)
244-
while source.next in HEXDIGITS and len(escape) < 4:
245-
escape = escape + source.get()
246-
escape = escape[2:]
247-
if len(escape) != 2:
248-
raise error("bogus escape: %s" % repr("\\" + escape))
249-
return LITERAL, int(escape, 16) & 0xff
254+
escape += source.getwhile(2, HEXDIGITS)
255+
if len(escape) != 4:
256+
raise ValueError
257+
return LITERAL, int(escape[2:], 16) & 0xff
258+
elif c == "u" and source.istext:
259+
# unicode escape (exactly four digits)
260+
escape += source.getwhile(4, HEXDIGITS)
261+
if len(escape) != 6:
262+
raise ValueError
263+
return LITERAL, int(escape[2:], 16)
264+
elif c == "U" and source.istext:
265+
# unicode escape (exactly eight digits)
266+
escape += source.getwhile(8, HEXDIGITS)
267+
if len(escape) != 10:
268+
raise ValueError
269+
c = int(escape[2:], 16)
270+
chr(c) # raise ValueError for invalid code
271+
return LITERAL, c
250272
elif c in OCTDIGITS:
251273
# octal escape (up to three digits)
252-
while source.next in OCTDIGITS and len(escape) < 4:
253-
escape = escape + source.get()
254-
escape = escape[1:]
255-
return LITERAL, int(escape, 8) & 0xff
274+
escape += source.getwhile(2, OCTDIGITS)
275+
return LITERAL, int(escape[1:], 8) & 0xff
256276
elif c in DIGITS:
257-
raise error("bogus escape: %s" % repr(escape))
277+
raise ValueError
258278
if len(escape) == 2:
259279
return LITERAL, ord(escape[1])
260280
except ValueError:
@@ -273,15 +293,27 @@ def _escape(source, escape, state):
273293
c = escape[1:2]
274294
if c == "x":
275295
# hexadecimal escape
276-
while source.next in HEXDIGITS and len(escape) < 4:
277-
escape = escape + source.get()
296+
escape += source.getwhile(2, HEXDIGITS)
278297
if len(escape) != 4:
279298
raise ValueError
280299
return LITERAL, int(escape[2:], 16) & 0xff
300+
elif c == "u" and source.istext:
301+
# unicode escape (exactly four digits)
302+
escape += source.getwhile(4, HEXDIGITS)
303+
if len(escape) != 6:
304+
raise ValueError
305+
return LITERAL, int(escape[2:], 16)
306+
elif c == "U" and source.istext:
307+
# unicode escape (exactly eight digits)
308+
escape += source.getwhile(8, HEXDIGITS)
309+
if len(escape) != 10:
310+
raise ValueError
311+
c = int(escape[2:], 16)
312+
chr(c) # raise ValueError for invalid code
313+
return LITERAL, c
281314
elif c == "0":
282315
# octal escape
283-
while source.next in OCTDIGITS and len(escape) < 4:
284-
escape = escape + source.get()
316+
escape += source.getwhile(2, OCTDIGITS)
285317
return LITERAL, int(escape[1:], 8) & 0xff
286318
elif c in DIGITS:
287319
# octal escape *or* decimal group reference (sigh)

Lib/test/test_re.py

Lines changed: 83 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -526,24 +526,92 @@ def test_flags(self):
526526
self.assertNotEqual(re.compile('^pattern$', flag), None)
527527

528528
def test_sre_character_literals(self):
529-
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
530-
self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None)
531-
self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None)
532-
self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None)
533-
self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None)
534-
self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None)
535-
self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
536-
self.assertRaises(re.error, re.match, "\911", "")
529+
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
530+
if i < 256:
531+
self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
532+
self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
533+
self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
534+
self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
535+
self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
536+
self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
537+
if i < 0x10000:
538+
self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
539+
self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
540+
self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
541+
self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
542+
self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
543+
self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
544+
self.assertIsNotNone(re.match(r"\0", "\000"))
545+
self.assertIsNotNone(re.match(r"\08", "\0008"))
546+
self.assertIsNotNone(re.match(r"\01", "\001"))
547+
self.assertIsNotNone(re.match(r"\018", "\0018"))
548+
self.assertIsNotNone(re.match(r"\567", chr(0o167)))
549+
self.assertRaises(re.error, re.match, r"\911", "")
550+
self.assertRaises(re.error, re.match, r"\x1", "")
551+
self.assertRaises(re.error, re.match, r"\x1z", "")
552+
self.assertRaises(re.error, re.match, r"\u123", "")
553+
self.assertRaises(re.error, re.match, r"\u123z", "")
554+
self.assertRaises(re.error, re.match, r"\U0001234", "")
555+
self.assertRaises(re.error, re.match, r"\U0001234z", "")
556+
self.assertRaises(re.error, re.match, r"\U00110000", "")
537557

538558
def test_sre_character_class_literals(self):
559+
for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
560+
if i < 256:
561+
self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
562+
self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
563+
self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
564+
self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
565+
self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
566+
self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
567+
self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
568+
self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
569+
if i < 0x10000:
570+
self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
571+
self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
572+
self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
573+
self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
574+
self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
575+
self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
576+
self.assertRaises(re.error, re.match, r"[\911]", "")
577+
self.assertRaises(re.error, re.match, r"[\x1z]", "")
578+
self.assertRaises(re.error, re.match, r"[\u123z]", "")
579+
self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
580+
self.assertRaises(re.error, re.match, r"[\U00110000]", "")
581+
582+
def test_sre_byte_literals(self):
583+
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
584+
self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
585+
self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
586+
self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
587+
self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
588+
self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
589+
self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
590+
self.assertIsNotNone(re.match(br"\u", b'u'))
591+
self.assertIsNotNone(re.match(br"\U", b'U'))
592+
self.assertIsNotNone(re.match(br"\0", b"\000"))
593+
self.assertIsNotNone(re.match(br"\08", b"\0008"))
594+
self.assertIsNotNone(re.match(br"\01", b"\001"))
595+
self.assertIsNotNone(re.match(br"\018", b"\0018"))
596+
self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
597+
self.assertRaises(re.error, re.match, br"\911", b"")
598+
self.assertRaises(re.error, re.match, br"\x1", b"")
599+
self.assertRaises(re.error, re.match, br"\x1z", b"")
600+
601+
def test_sre_byte_class_literals(self):
539602
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
540-
self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
541-
self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
542-
self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
543-
self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
544-
self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
545-
self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
546-
self.assertRaises(re.error, re.match, "[\911]", "")
603+
self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
604+
self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
605+
self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
606+
self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
607+
self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
608+
self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
609+
self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
610+
self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
611+
self.assertIsNotNone(re.match(br"[\u]", b'u'))
612+
self.assertIsNotNone(re.match(br"[\U]", b'U'))
613+
self.assertRaises(re.error, re.match, br"[\911]", "")
614+
self.assertRaises(re.error, re.match, br"[\x1z]", "")
547615

548616
def test_bug_113254(self):
549617
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ Core and Builtins
4040
Library
4141
-------
4242

43+
- Issue #3665: \u and \U escapes are now supported in unicode regular
44+
expressions. Patch by Serhiy Storchaka.
45+
4346
- Issue #15153: Added inspect.getgeneratorlocals to simplify white box
4447
testing of generator state updates
4548

0 commit comments

Comments
 (0)