Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit f413b80

Browse files
committed
in narrow builds, make sure to test codepoints as identifier characters (closes #12732)
This fixes the use of Unicode identifiers outside the BMP in narrow builds.
1 parent 7bf4363 commit f413b80

4 files changed

Lines changed: 30 additions & 8 deletions

File tree

Lib/test/test_pep3131.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@ class T:
88
ä = 1
99
µ = 2 # this is a compatibility character
1010
蟒 = 3
11+
𝔘𝔫𝔦𝔠𝔬𝔡𝔢 = 4
1112
self.assertEqual(getattr(T, "\xe4"), 1)
1213
self.assertEqual(getattr(T, "\u03bc"), 2)
1314
self.assertEqual(getattr(T, '\u87d2'), 3)
15+
v = getattr(T, "\U0001d518\U0001d52b\U0001d526\U0001d520\U0001d52c\U0001d521\U0001d522")
16+
self.assertEqual(v, 4)
1417

1518
def test_invalid(self):
1619
try:

Lib/test/test_unicode.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,7 @@ def test_isidentifier(self):
404404
self.assertTrue("bc".isidentifier())
405405
self.assertTrue("b_".isidentifier())
406406
self.assertTrue("µ".isidentifier())
407+
self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier())
407408

408409
self.assertFalse(" ".isidentifier())
409410
self.assertFalse("[".isidentifier())

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ What's New in Python 3.2.2?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #12732: In narrow unicode builds, allow Unicode identifiers which fall
14+
outside the BMP.
15+
1316
- Issue #11603: Fix a crash when __str__ is rebound as __repr__. Patch by
1417
Andreas Stührk.
1518

Objects/unicodeobject.c

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7972,14 +7972,30 @@ unicode_isnumeric(PyUnicodeObject *self)
79727972
return PyBool_FromLong(1);
79737973
}
79747974

7975+
static Py_UCS4
7976+
decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
7977+
{
7978+
Py_UCS4 ch;
7979+
assert(*i < size);
7980+
ch = s[(*i)++];
7981+
#ifndef Py_UNICODE_WIDE
7982+
if ((ch & 0xfffffc00) == 0xd800 &&
7983+
*i < size
7984+
&& (s[*i] & 0xFFFFFC00) == 0xDC00)
7985+
ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
7986+
#endif
7987+
return ch;
7988+
}
7989+
79757990
int
79767991
PyUnicode_IsIdentifier(PyObject *self)
79777992
{
7978-
register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7979-
register const Py_UNICODE *e;
7993+
Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
7994+
Py_UCS4 first;
7995+
const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
79807996

79817997
/* Special case for empty strings */
7982-
if (PyUnicode_GET_SIZE(self) == 0)
7998+
if (!size)
79837999
return 0;
79848000

79858001
/* PEP 3131 says that the first character must be in
@@ -7990,14 +8006,13 @@ PyUnicode_IsIdentifier(PyObject *self)
79908006
definition of XID_Start and XID_Continue, it is sufficient
79918007
to check just for these, except that _ must be allowed
79928008
as starting an identifier. */
7993-
if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8009+
first = decode_ucs4(p, &i, size);
8010+
if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
79948011
return 0;
79958012

7996-
e = p + PyUnicode_GET_SIZE(self);
7997-
for (p++; p < e; p++) {
7998-
if (!_PyUnicode_IsXidContinue(*p))
8013+
while (i < size)
8014+
if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size)))
79998015
return 0;
8000-
}
80018016
return 1;
80028017
}
80038018

0 commit comments

Comments
 (0)