@@ -159,6 +159,12 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
159159 const Py_UNICODE * unicode , Py_ssize_t size , PyObject * * exceptionObject ,
160160 Py_ssize_t startpos , Py_ssize_t endpos , Py_ssize_t * newpos );
161161
/* Forward declaration of raise_encode_exception(); its definition is not
   part of this excerpt.  The UTF-8 encoder modified further down calls it
   as raise_encode_exception(&exc, "utf-8", s, size, i - 1, i, <reason>)
   and then jumps to its error label, in the case where a unicode
   replacement returned by the error handler contains a character >= 0x80.
   NOTE(review): the parameters presumably mirror those of
   unicode_encode_call_errorhandler() (exception object slot, encoding
   name, input buffer and its length, start/end of the offending range,
   reason text) -- confirm against the definition. */
162+ static void raise_encode_exception (PyObject * * exceptionObject ,
163+ const char * encoding ,
164+ const Py_UNICODE * unicode , Py_ssize_t size ,
165+ Py_ssize_t startpos , Py_ssize_t endpos ,
166+ const char * reason );
167+
162168/* Same for linebreaks */
163169static unsigned char ascii_linebreak [] = {
164170 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
@@ -2542,61 +2548,88 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
25422548 /* Encode Latin-1 */
25432549 * p ++ = (char )(0xc0 | (ch >> 6 ));
25442550 * p ++ = (char )(0x80 | (ch & 0x3f ));
2545- }
2546- else {
2547- /* Encode UCS2 Unicode ordinals */
2548- if (ch < 0x10000 ) {
2551+ } else if (0xD800 <= ch && ch <= 0xDFFF ) {
25492552#ifndef Py_UNICODE_WIDE
2550- /* Special case: check for high surrogate */
2551- if (0xD800 <= ch && ch <= 0xDBFF && i != size ) {
2552- Py_UCS4 ch2 = s [i ];
2553- /* Check for low surrogate and combine the two to
2554- form a UCS4 value */
2555- if (0xDC00 <= ch2 && ch2 <= 0xDFFF ) {
2556- ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x10000 ;
2557- i ++ ;
2558- goto encodeUCS4 ;
2559- }
2560- /* Fall through: handles isolated high surrogates */
2561- }
2553+ /* Special case: check for high and low surrogate */
2554+ if (ch <= 0xDBFF && i != size && 0xDC00 <= s [i ] && s [i ] <= 0xDFFF ) {
2555+ Py_UCS4 ch2 = s [i ];
2556+ /* Combine the two surrogates to form a UCS4 value */
2557+ ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x10000 ;
2558+ i ++ ;
2559+
2560+ /* Encode UCS4 Unicode ordinals */
2561+ * p ++ = (char )(0xf0 | (ch >> 18 ));
2562+ * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
2563+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2564+ * p ++ = (char )(0x80 | (ch & 0x3f ));
2565+
25622566#endif
2563- if (ch >= 0xd800 && ch <= 0xdfff ) {
2564- Py_ssize_t newpos ;
2565- PyObject * rep ;
2566- char * prep ;
2567- int k ;
2568- rep = unicode_encode_call_errorhandler
2569- (errors , & errorHandler , "utf-8" , "surrogates not allowed" ,
2570- s , size , & exc , i - 1 , i , & newpos );
2571- if (!rep )
2572- goto error ;
2573- /* Implementation limitations: only support error handler that return
2574- bytes, and only support up to four replacement bytes. */
2575- if (!PyBytes_Check (rep )) {
2576- PyErr_SetString (PyExc_TypeError , "error handler should have returned bytes" );
2577- Py_DECREF (rep );
2567+ } else {
2568+ Py_ssize_t newpos ;
2569+ PyObject * rep ;
2570+ Py_ssize_t repsize , k ;
2571+ rep = unicode_encode_call_errorhandler
2572+ (errors , & errorHandler , "utf-8" , "surrogates not allowed" ,
2573+ s , size , & exc , i - 1 , i , & newpos );
2574+ if (!rep )
2575+ goto error ;
2576+
2577+ if (PyBytes_Check (rep ))
2578+ repsize = PyBytes_GET_SIZE (rep );
2579+ else
2580+ repsize = PyUnicode_GET_SIZE (rep );
2581+
2582+ if (repsize > 4 ) {
2583+ Py_ssize_t offset ;
2584+
2585+ if (result == NULL )
2586+ offset = p - stackbuf ;
2587+ else
2588+ offset = p - PyBytes_AS_STRING (result );
2589+
2590+ if (nallocated > PY_SSIZE_T_MAX - repsize + 4 ) {
2591+ /* integer overflow */
2592+ PyErr_NoMemory ();
25782593 goto error ;
25792594 }
2580- if (PyBytes_Size (rep ) > 4 ) {
2581- PyErr_SetString (PyExc_TypeError , "error handler returned too many bytes" );
2582- Py_DECREF (rep );
2583- goto error ;
2595+ nallocated += repsize - 4 ;
2596+ if (result != NULL ) {
2597+ if (_PyBytes_Resize (& result , nallocated ) < 0 )
2598+ goto error ;
2599+ } else {
2600+ result = PyBytes_FromStringAndSize (NULL , nallocated );
2601+ if (result == NULL )
2602+ goto error ;
2603+ Py_MEMCPY (PyBytes_AS_STRING (result ), stackbuf , offset );
25842604 }
2585- prep = PyBytes_AsString (rep );
2586- for (k = PyBytes_Size (rep ); k > 0 ; k -- )
2605+ p = PyBytes_AS_STRING (result ) + offset ;
2606+ }
2607+
2608+ if (PyBytes_Check (rep )) {
2609+ char * prep = PyBytes_AS_STRING (rep );
2610+ for (k = repsize ; k > 0 ; k -- )
25872611 * p ++ = * prep ++ ;
2588- Py_DECREF (rep );
2589- continue ;
2590-
2612+ } else /* rep is unicode */ {
2613+ Py_UNICODE * prep = PyUnicode_AS_UNICODE (rep );
2614+ Py_UNICODE c ;
2615+
2616+ for (k = 0 ; k < repsize ; k ++ ) {
2617+ c = prep [k ];
2618+ if (0x80 <= c ) {
2619+ raise_encode_exception (& exc , "utf-8" , s , size ,
2620+ i - 1 , i , "surrogates not allowed" );
2621+ goto error ;
2622+ }
2623+ * p ++ = (char )prep [k ];
2624+ }
25912625 }
2592- * p ++ = (char )(0xe0 | (ch >> 12 ));
2593- * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2594- * p ++ = (char )(0x80 | (ch & 0x3f ));
2595- continue ;
2626+ Py_DECREF (rep );
25962627 }
2597- #ifndef Py_UNICODE_WIDE
2598- encodeUCS4 :
2599- #endif
2628+ } else if (ch < 0x10000 ) {
2629+ * p ++ = (char )(0xe0 | (ch >> 12 ));
2630+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
2631+ * p ++ = (char )(0x80 | (ch & 0x3f ));
2632+ } else /* ch >= 0x10000 */ {
26002633 /* Encode UCS4 Unicode ordinals */
26012634 * p ++ = (char )(0xf0 | (ch >> 18 ));
26022635 * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
0 commit comments