@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
159159 const Py_UNICODE * unicode , Py_ssize_t size , PyObject * * exceptionObject ,
160160 Py_ssize_t startpos , Py_ssize_t endpos , Py_ssize_t * newpos );
161161
/* Forward declaration of the file-local helper raise_encode_exception().
   The UTF-8 encoding code later in this file calls it when the unicode
   replacement returned by an error handler contains a code unit >= 0x80,
   passing the encoding name, the input buffer and its size, the start/end
   positions of the offending character and a reason string; the caller
   then jumps straight to its error exit.
   NOTE(review): presumably this sets a UnicodeEncodeError through
   *exceptionObject -- the definition is not visible here; confirm. */
162+ static void raise_encode_exception (PyObject * * exceptionObject ,
163+ const char * encoding ,
164+ const Py_UNICODE * unicode , Py_ssize_t size ,
165+ Py_ssize_t startpos , Py_ssize_t endpos ,
166+ const char * reason );
167+
162168/* Same for linebreaks */
163169static unsigned char ascii_linebreak [] = {
164170 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
@@ -2461,61 +2467,88 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
24612467 /* Encode Latin-1 */
24622468 * p ++ = (char )(0xc0 | (ch >> 6 ));
24632469 * p ++ = (char )(0x80 | (ch & 0x3f ));
2464- }
2465- else {
2466- /* Encode UCS2 Unicode ordinals */
2467- if (ch < 0x10000 ) {
2470+ } else if (0xD800 <= ch && ch <= 0xDFFF ) {
24682471#ifndef Py_UNICODE_WIDE
2469- /* Special case: check for high surrogate */
2470- if (0xD800 <= ch && ch <= 0xDBFF && i != size ) {
2471- Py_UCS4 ch2 = s [i ];
2472- /* Check for low surrogate and combine the two to
2473- form a UCS4 value */
2474- if (0xDC00 <= ch2 && ch2 <= 0xDFFF ) {
2475- ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x10000 ;
2476- i ++ ;
2477- goto encodeUCS4 ;
2478- }
2479- /* Fall through: handles isolated high surrogates */
2480- }
2472+ /* Special case: check for high and low surrogate */
2473+ if (ch <= 0xDBFF && i != size && 0xDC00 <= s [i ] && s [i ] <= 0xDFFF ) {
2474+ Py_UCS4 ch2 = s [i ];
2475+ /* Combine the two surrogates to form a UCS4 value */
2476+ ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x10000 ;
2477+ i ++ ;
2478+
2479+ /* Encode UCS4 Unicode ordinals */
2480+ * p ++ = (char )(0xf0 | (ch >> 18 ));
2481+ * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
2482+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2483+ * p ++ = (char )(0x80 | (ch & 0x3f ));
2484+
24812485#endif
2482- if (ch >= 0xd800 && ch <= 0xdfff ) {
2483- Py_ssize_t newpos ;
2484- PyObject * rep ;
2485- char * prep ;
2486- int k ;
2487- rep = unicode_encode_call_errorhandler
2488- (errors , & errorHandler , "utf-8" , "surrogates not allowed" ,
2489- s , size , & exc , i - 1 , i , & newpos );
2490- if (!rep )
2491- goto error ;
2492- /* Implementation limitations: only support error handler that return
2493- bytes, and only support up to four replacement bytes. */
2494- if (!PyBytes_Check (rep )) {
2495- PyErr_SetString (PyExc_TypeError , "error handler should have returned bytes" );
2496- Py_DECREF (rep );
2486+ } else {
2487+ Py_ssize_t newpos ;
2488+ PyObject * rep ;
2489+ Py_ssize_t repsize , k ;
2490+ rep = unicode_encode_call_errorhandler
2491+ (errors , & errorHandler , "utf-8" , "surrogates not allowed" ,
2492+ s , size , & exc , i - 1 , i , & newpos );
2493+ if (!rep )
2494+ goto error ;
2495+
2496+ if (PyBytes_Check (rep ))
2497+ repsize = PyBytes_GET_SIZE (rep );
2498+ else
2499+ repsize = PyUnicode_GET_SIZE (rep );
2500+
2501+ if (repsize > 4 ) {
2502+ Py_ssize_t offset ;
2503+
2504+ if (result == NULL )
2505+ offset = p - stackbuf ;
2506+ else
2507+ offset = p - PyBytes_AS_STRING (result );
2508+
2509+ if (nallocated > PY_SSIZE_T_MAX - repsize + 4 ) {
2510+ /* integer overflow */
2511+ PyErr_NoMemory ();
24972512 goto error ;
24982513 }
2499- if (PyBytes_Size (rep ) > 4 ) {
2500- PyErr_SetString (PyExc_TypeError , "error handler returned too many bytes" );
2501- Py_DECREF (rep );
2502- goto error ;
2514+ nallocated += repsize - 4 ;
2515+ if (result != NULL ) {
2516+ if (_PyBytes_Resize (& result , nallocated ) < 0 )
2517+ goto error ;
2518+ } else {
2519+ result = PyBytes_FromStringAndSize (NULL , nallocated );
2520+ if (result == NULL )
2521+ goto error ;
2522+ Py_MEMCPY (PyBytes_AS_STRING (result ), stackbuf , offset );
25032523 }
2504- prep = PyBytes_AsString (rep );
2505- for (k = PyBytes_Size (rep ); k > 0 ; k -- )
2524+ p = PyBytes_AS_STRING (result ) + offset ;
2525+ }
2526+
2527+ if (PyBytes_Check (rep )) {
2528+ char * prep = PyBytes_AS_STRING (rep );
2529+ for (k = repsize ; k > 0 ; k -- )
25062530 * p ++ = * prep ++ ;
2507- Py_DECREF (rep );
2508- continue ;
2509-
2531+ } else /* rep is unicode */ {
2532+ Py_UNICODE * prep = PyUnicode_AS_UNICODE (rep );
2533+ Py_UNICODE c ;
2534+
2535+ for (k = 0 ; k < repsize ; k ++ ) {
2536+ c = prep [k ];
2537+ if (0x80 <= c ) {
2538+ raise_encode_exception (& exc , "utf-8" , s , size ,
2539+ i - 1 , i , "surrogates not allowed" );
2540+ goto error ;
2541+ }
2542+ * p ++ = (char )prep [k ];
2543+ }
25102544 }
2511- * p ++ = (char )(0xe0 | (ch >> 12 ));
2512- * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2513- * p ++ = (char )(0x80 | (ch & 0x3f ));
2514- continue ;
2545+ Py_DECREF (rep );
25152546 }
2516- #ifndef Py_UNICODE_WIDE
2517- encodeUCS4 :
2518- #endif
2547+ } else if (ch < 0x10000 ) {
2548+ * p ++ = (char )(0xe0 | (ch >> 12 ));
2549+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2550+ * p ++ = (char )(0x80 | (ch & 0x3f ));
2551+ } else /* ch >= 0x10000 */ {
25192552 /* Encode UCS4 Unicode ordinals */
25202553 * p ++ = (char )(0xf0 | (ch >> 18 ));
25212554 * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
0 commit comments