Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit bd3be8f

Browse files
committed
Fix to the UTF-8 encoder: it failed on 0-length input strings.
Fix for the UTF-8 decoder: it now accepts isolated surrogates (previously it raised an exception, which caused round-trips to fail). Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for marshalling Unicode objects, so we had better make sure it works for all Unicode code points, including isolated surrogates). Bumped the PYC magic in a non-standard way -- please review. This was needed because the old PYC format used illegal UTF-8 sequences for isolated high surrogates, which now raise an exception.
1 parent 9273ec7 commit bd3be8f

4 files changed

Lines changed: 71 additions & 31 deletions

File tree

Lib/test/output/test_unicodedata

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
test_unicodedata
22
Testing Unicode Database...
3-
Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
3+
Methods: 84b72943b1d4320bc1e64a4888f7cdf62eea219a
44
Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
55
API: ok

Lib/test/test_unicode.py

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,23 @@
2323
verify(repr(u"'\"") == """u'\\'"'""")
2424
verify(repr(u"'") == '''u"'"''')
2525
verify(repr(u'"') == """u'"'""")
26-
verify(repr(u''.join(map(unichr, range(256)))) ==
27-
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
28-
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
29-
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
30-
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
31-
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
32-
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
33-
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
34-
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
35-
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
36-
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
37-
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
38-
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
39-
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
40-
"\\xfe\\xff'")
26+
latin1repr = (
27+
"u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
28+
"\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
29+
"\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
30+
"JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
31+
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
32+
"\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
33+
"\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
34+
"\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
35+
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
36+
"\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
37+
"\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
38+
"\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
39+
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
40+
"\\xfe\\xff'")
41+
testrepr = repr(u''.join(map(unichr, range(256))))
42+
verify(testrepr == latin1repr)
4143

4244
def test(method, input, output, *args):
4345
if verbose:
@@ -495,6 +497,7 @@ def __str__(self):
495497
verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
496498

497499
# UTF-8 specific encoding tests:
500+
verify(u''.encode('utf-8') == '')
498501
verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
499502
verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
500503
verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96')
@@ -552,14 +555,7 @@ def __str__(self):
552555
'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
553556
verify(unicode(u.encode(encoding),encoding) == u)
554557

555-
# Roundtrip safety for non-BMP (just a few chars)
556-
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
557-
for encoding in ('utf-8',
558-
'utf-16', 'utf-16-le', 'utf-16-be',
559-
#'raw_unicode_escape',
560-
'unicode_escape', 'unicode_internal'):
561-
verify(unicode(u.encode(encoding),encoding) == u)
562-
558+
# Roundtrip safety for BMP (just the first 256 chars)
563559
u = u''.join(map(unichr, range(256)))
564560
for encoding in (
565561
'latin-1',
@@ -571,6 +567,7 @@ def __str__(self):
571567
except ValueError,why:
572568
print '*** codec for "%s" failed: %s' % (encoding, why)
573569

570+
# Roundtrip safety for BMP (just the first 128 chars)
574571
u = u''.join(map(unichr, range(128)))
575572
for encoding in (
576573
'ascii',
@@ -582,6 +579,19 @@ def __str__(self):
582579
except ValueError,why:
583580
print '*** codec for "%s" failed: %s' % (encoding, why)
584581

582+
# Roundtrip safety for non-BMP (just a few chars)
583+
u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
584+
for encoding in ('utf-8',
585+
'utf-16', 'utf-16-le', 'utf-16-be',
586+
#'raw_unicode_escape',
587+
'unicode_escape', 'unicode_internal'):
588+
verify(unicode(u.encode(encoding),encoding) == u)
589+
590+
# UTF-8 must be roundtrip safe for all UCS-2 code points
591+
u = u''.join(map(unichr, range(0x10000)))
592+
for encoding in ('utf-8',):
593+
verify(unicode(u.encode(encoding),encoding) == u)
594+
585595
print 'done.'
586596

587597
print 'Testing standard mapping codecs...',

Objects/unicodeobject.c

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,12 +1065,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
10651065
goto utf8Error;
10661066
}
10671067
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068-
if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1068+
if (ch < 0x0800) {
1069+
/* Note: UTF-8 encodings of surrogates are considered
1070+
legal UTF-8 sequences;
1071+
1072+
XXX For wide builds (UCS-4) we should probably try
1073+
to recombine the surrogates into a single code
1074+
unit.
1075+
*/
10691076
errmsg = "illegal encoding";
10701077
goto utf8Error;
10711078
}
10721079
else
1073-
*p++ = (Py_UNICODE)ch;
1080+
*p++ = (Py_UNICODE)ch;
10741081
break;
10751082

10761083
case 4:
@@ -1084,9 +1091,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
10841091
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
10851092
/* validate and convert to UTF-16 */
10861093
if ((ch < 0x10000) /* minimum value allowed for 4
1087-
byte encoding */
1094+
byte encoding */
10881095
|| (ch > 0x10ffff)) /* maximum value allowed for
1089-
UTF-16 */
1096+
UTF-16 */
10901097
{
10911098
errmsg = "illegal encoding";
10921099
goto utf8Error;
@@ -1175,11 +1182,15 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
11751182
unsigned int cbWritten = 0;
11761183
int i = 0;
11771184

1185+
/* Short-cut for empty strings */
1186+
if (size == 0)
1187+
return PyString_FromStringAndSize(NULL, 0);
1188+
1189+
/* We allocate 4 more bytes to have room for at least one full
1190+
UTF-8 sequence; saves a few cycles in the loop below */
11781191
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
11791192
if (v == NULL)
11801193
return NULL;
1181-
if (size == 0)
1182-
return v;
11831194

11841195
p = PyString_AS_STRING(v);
11851196
while (i < size) {

Python/import.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,27 @@ extern time_t PyOS_GetLastModificationTime(char *, FILE *);
4141
the Unicode -U option is in use. IMO (Tim's), that's a Bad Idea
4242
(quite apart from that the -U option doesn't work so isn't used
4343
anyway).
44+
45+
XXX MAL, 2002-02-07: I had to modify the MAGIC due to a fix of the
46+
UTF-8 encoder (it previously produced invalid UTF-8 for unpaired
47+
high surrogates), so I simply bumped the month value to 20 (invalid
48+
month) and set the day to 1. This should be recognizable by any
49+
algorithm relying on the above scheme. Perhaps we should simply
50+
start counting in increments of 10 from now on ?!
51+
52+
Known values:
53+
Python 1.5: 20121
54+
Python 1.5.1: 20121
55+
Python 1.5.2: 20121
56+
Python 2.0: 50823
57+
Python 2.0.1: 50823
58+
Python 2.1: 60202
59+
Python 2.1.1: 60202
60+
Python 2.1.2: 60202
61+
Python 2.2: 60717
62+
Python 2.3a0: 62001
4463
*/
45-
#define MAGIC (60717 | ((long)'\r'<<16) | ((long)'\n'<<24))
64+
#define MAGIC (62001 | ((long)'\r'<<16) | ((long)'\n'<<24))
4665

4766
/* Magic word as global; note that _PyImport_Init() can change the
4867
value of this global to accommodate for alterations of how the

0 commit comments

Comments
 (0)