Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 158701d

Browse files
author
Victor Stinner
committed
Merged revisions 80382 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r80382 | victor.stinner | 2010-04-22 21:38:16 +0200 (jeu., 22 avril 2010) | 3 lines Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handler producing unicode string (eg. backslashreplace) ........
1 parent 754b98c commit 158701d

3 files changed

Lines changed: 93 additions & 47 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,16 @@ def test_decoder_state(self):
571571
def test_lone_surrogates(self):
572572
self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
573573
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
574+
self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
575+
b'[\\udc80]')
576+
self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
577+
b'[&#56448;]')
578+
self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
579+
b'[\x80]')
580+
self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
581+
b'[]')
582+
self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
583+
b'[?]')
574584

575585
def test_surrogatepass_handler(self):
576586
self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"),

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 3.1.3?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #8092: Fix PyUnicode_EncodeUTF8() to support error handler producing
16+
unicode string (eg. backslashreplace)
17+
1518
- Issue #8014: Setting a T_UINT or T_PYSSIZET attribute of an object with
1619
PyMemberDefs could produce an internal error; raise TypeError instead.
1720

Objects/unicodeobject.c

Lines changed: 80 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
159159
const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
160160
Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
161161

162+
static void raise_encode_exception(PyObject **exceptionObject,
163+
const char *encoding,
164+
const Py_UNICODE *unicode, Py_ssize_t size,
165+
Py_ssize_t startpos, Py_ssize_t endpos,
166+
const char *reason);
167+
162168
/* Same for linebreaks */
163169
static unsigned char ascii_linebreak[] = {
164170
0, 0, 0, 0, 0, 0, 0, 0,
@@ -2461,61 +2467,88 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
24612467
/* Encode Latin-1 */
24622468
*p++ = (char)(0xc0 | (ch >> 6));
24632469
*p++ = (char)(0x80 | (ch & 0x3f));
2464-
}
2465-
else {
2466-
/* Encode UCS2 Unicode ordinals */
2467-
if (ch < 0x10000) {
2470+
} else if (0xD800 <= ch && ch <= 0xDFFF) {
24682471
#ifndef Py_UNICODE_WIDE
2469-
/* Special case: check for high surrogate */
2470-
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2471-
Py_UCS4 ch2 = s[i];
2472-
/* Check for low surrogate and combine the two to
2473-
form a UCS4 value */
2474-
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2475-
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2476-
i++;
2477-
goto encodeUCS4;
2478-
}
2479-
/* Fall through: handles isolated high surrogates */
2480-
}
2472+
/* Special case: check for high and low surrogate */
2473+
if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2474+
Py_UCS4 ch2 = s[i];
2475+
/* Combine the two surrogates to form a UCS4 value */
2476+
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2477+
i++;
2478+
2479+
/* Encode UCS4 Unicode ordinals */
2480+
*p++ = (char)(0xf0 | (ch >> 18));
2481+
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2482+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2483+
*p++ = (char)(0x80 | (ch & 0x3f));
2484+
24812485
#endif
2482-
if (ch >= 0xd800 && ch <= 0xdfff) {
2483-
Py_ssize_t newpos;
2484-
PyObject *rep;
2485-
char *prep;
2486-
int k;
2487-
rep = unicode_encode_call_errorhandler
2488-
(errors, &errorHandler, "utf-8", "surrogates not allowed",
2489-
s, size, &exc, i-1, i, &newpos);
2490-
if (!rep)
2491-
goto error;
2492-
/* Implementation limitations: only support error handler that return
2493-
bytes, and only support up to four replacement bytes. */
2494-
if (!PyBytes_Check(rep)) {
2495-
PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
2496-
Py_DECREF(rep);
2486+
} else {
2487+
Py_ssize_t newpos;
2488+
PyObject *rep;
2489+
Py_ssize_t repsize, k;
2490+
rep = unicode_encode_call_errorhandler
2491+
(errors, &errorHandler, "utf-8", "surrogates not allowed",
2492+
s, size, &exc, i-1, i, &newpos);
2493+
if (!rep)
2494+
goto error;
2495+
2496+
if (PyBytes_Check(rep))
2497+
repsize = PyBytes_GET_SIZE(rep);
2498+
else
2499+
repsize = PyUnicode_GET_SIZE(rep);
2500+
2501+
if (repsize > 4) {
2502+
Py_ssize_t offset;
2503+
2504+
if (result == NULL)
2505+
offset = p - stackbuf;
2506+
else
2507+
offset = p - PyBytes_AS_STRING(result);
2508+
2509+
if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2510+
/* integer overflow */
2511+
PyErr_NoMemory();
24972512
goto error;
24982513
}
2499-
if (PyBytes_Size(rep) > 4) {
2500-
PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
2501-
Py_DECREF(rep);
2502-
goto error;
2514+
nallocated += repsize - 4;
2515+
if (result != NULL) {
2516+
if (_PyBytes_Resize(&result, nallocated) < 0)
2517+
goto error;
2518+
} else {
2519+
result = PyBytes_FromStringAndSize(NULL, nallocated);
2520+
if (result == NULL)
2521+
goto error;
2522+
Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
25032523
}
2504-
prep = PyBytes_AsString(rep);
2505-
for(k = PyBytes_Size(rep); k > 0; k--)
2524+
p = PyBytes_AS_STRING(result) + offset;
2525+
}
2526+
2527+
if (PyBytes_Check(rep)) {
2528+
char *prep = PyBytes_AS_STRING(rep);
2529+
for(k = repsize; k > 0; k--)
25062530
*p++ = *prep++;
2507-
Py_DECREF(rep);
2508-
continue;
2509-
2531+
} else /* rep is unicode */ {
2532+
Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2533+
Py_UNICODE c;
2534+
2535+
for(k=0; k<repsize; k++) {
2536+
c = prep[k];
2537+
if (0x80 <= c) {
2538+
raise_encode_exception(&exc, "utf-8", s, size,
2539+
i-1, i, "surrogates not allowed");
2540+
goto error;
2541+
}
2542+
*p++ = (char)prep[k];
2543+
}
25102544
}
2511-
*p++ = (char)(0xe0 | (ch >> 12));
2512-
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2513-
*p++ = (char)(0x80 | (ch & 0x3f));
2514-
continue;
2545+
Py_DECREF(rep);
25152546
}
2516-
#ifndef Py_UNICODE_WIDE
2517-
encodeUCS4:
2518-
#endif
2547+
} else if (ch < 0x10000) {
2548+
*p++ = (char)(0xe0 | (ch >> 12));
2549+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2550+
*p++ = (char)(0x80 | (ch & 0x3f));
2551+
} else /* ch >= 0x10000 */ {
25192552
/* Encode UCS4 Unicode ordinals */
25202553
*p++ = (char)(0xf0 | (ch >> 18));
25212554
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));

0 commit comments

Comments
 (0)