Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b2e796a

Browse files
committed
In wide builds, avoid storing high (non-BMP) Unicode characters from source code as surrogate pairs
This is accomplished by decoding with utf-32 instead of utf-16 on all builds. The patch is by Adam Olsen.
1 parent 7b1b094 commit b2e796a

3 files changed

Lines changed: 25 additions & 9 deletions

File tree

Lib/test/test_pep263.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,14 @@ def test_issue4626(self):
3636
exec(c, d)
3737
self.assertEquals(d['\xc6'], '\xc6')
3838

39+
def test_issue3297(self):
40+
c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
41+
d = {}
42+
exec(c, d)
43+
self.assertEqual(d['a'], d['b'])
44+
self.assertEqual(len(d['a']), len(d['b']))
45+
self.assertEqual(ascii(d['a']), ascii(d['b']))
46+
3947
def test_main():
4048
support.run_unittest(PEP263Test)
4149

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 3.2 Alpha 1?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #3297: On wide unicode builds, do not split unicode characters into
16+
surrogates.
17+
1518
- Remove length limitation when constructing a complex number from a string.
1619

1720
- Issue #1087418: Boost performance of bitwise operations for longs.

Python/ast.c

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3246,10 +3246,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
32463246
u = NULL;
32473247
} else {
32483248
/* check for integer overflow */
3249-
if (len > PY_SIZE_MAX / 4)
3249+
if (len > PY_SIZE_MAX / 6)
32503250
return NULL;
3251-
/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
3252-
u = PyBytes_FromStringAndSize((char *)NULL, len * 4);
3251+
/* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
3252+
"\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
3253+
u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
32533254
if (u == NULL)
32543255
return NULL;
32553256
p = buf = PyBytes_AsString(u);
@@ -3266,20 +3267,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons
32663267
PyObject *w;
32673268
char *r;
32683269
Py_ssize_t rn, i;
3269-
w = decode_utf8(c, &s, end, "utf-16-be");
3270+
w = decode_utf8(c, &s, end, "utf-32-be");
32703271
if (w == NULL) {
32713272
Py_DECREF(u);
32723273
return NULL;
32733274
}
32743275
r = PyBytes_AS_STRING(w);
32753276
rn = Py_SIZE(w);
3276-
assert(rn % 2 == 0);
3277-
for (i = 0; i < rn; i += 2) {
3278-
sprintf(p, "\\u%02x%02x",
3277+
assert(rn % 4 == 0);
3278+
for (i = 0; i < rn; i += 4) {
3279+
sprintf(p, "\\U%02x%02x%02x%02x",
32793280
r[i + 0] & 0xFF,
3280-
r[i + 1] & 0xFF);
3281-
p += 6;
3281+
r[i + 1] & 0xFF,
3282+
r[i + 2] & 0xFF,
3283+
r[i + 3] & 0xFF);
3284+
p += 10;
32823285
}
3286+
/* Should be impossible to overflow */
3287+
assert(p - buf <= Py_SIZE(u));
32833288
Py_DECREF(w);
32843289
} else {
32853290
*p++ = *s++;

0 commit comments

Comments
 (0)