@@ -1172,85 +1172,92 @@ int utf8_encoding_error(const Py_UNICODE **source,
11721172}
11731173#endif
11741174
1175+ /* Allocation strategy: we start with 2 bytes per input character
1176+ (enough for ordinals below 0x800), then grow the buffer to 3 or 4
1177+ bytes per character the first time an ordinal of that order is hit.
1178+ The assumption is that characters from higher orders usually occur
1179+ often enough to warrant this. */
1180+
11751181PyObject * PyUnicode_EncodeUTF8 (const Py_UNICODE * s ,
11761182 int size ,
11771183 const char * errors )
11781184{
11791185 PyObject * v ;
11801186 char * p ;
1181- unsigned int cbAllocated = 2 * size ;
1182- unsigned int cbWritten = 0 ;
11831187 int i = 0 ;
1184-
1188+ int overalloc = 2 ;
1189+ int len ;
1190+
11851191 /* Short-cut for empty strings */
11861192 if (size == 0 )
11871193 return PyString_FromStringAndSize (NULL , 0 );
11881194
1189- /* We allocate 4 more bytes to have room for at least one full
1190- UTF-8 sequence; saves a few cycles in the loop below */
1191- v = PyString_FromStringAndSize (NULL , cbAllocated + 4 );
1195+ v = PyString_FromStringAndSize (NULL , overalloc * size );
11921196 if (v == NULL )
11931197 return NULL ;
11941198
11951199 p = PyString_AS_STRING (v );
1200+
11961201 while (i < size ) {
11971202 Py_UCS4 ch = s [i ++ ];
11981203
1199- if (ch < 0x80 ) {
1204+ if (ch < 0x80 )
1205+ /* Encode ASCII */
12001206 * p ++ = (char ) ch ;
1201- cbWritten ++ ;
1202- }
12031207
12041208 else if (ch < 0x0800 ) {
1209+ /* Encode two-byte sequences: ordinals 0x80-0x7FF (includes Latin-1) */
12051210 * p ++ = (char )(0xc0 | (ch >> 6 ));
12061211 * p ++ = (char )(0x80 | (ch & 0x3f ));
1207- cbWritten += 2 ;
12081212 }
12091213
12101214 else {
1211-
1212- /* Assure that we have enough room for high order Unicode
1213- ordinals */
1214- if (cbWritten >= cbAllocated ) {
1215- cbAllocated += 4 * 10 ;
1216- if (_PyString_Resize (& v , cbAllocated + 4 ))
1217- goto onError ;
1218- p = PyString_AS_STRING (v ) + cbWritten ;
1219- }
1220-
1215+ /* Encode UCS2 Unicode ordinals */
12211216 if (ch < 0x10000 ) {
1222- /* Check for high surrogate */
1217+
1218+ /* Special case: check for high surrogate */
12231219 if (0xD800 <= ch && ch <= 0xDBFF && i != size ) {
12241220 Py_UCS4 ch2 = s [i ];
1225- /* Check for low surrogate */
1221+ /* Check for low surrogate and combine the two to
1222+ form a UCS4 value */
12261223 if (0xDC00 <= ch2 && ch2 <= 0xDFFF ) {
1227- ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x00010000 ;
1228- * p ++ = (char )(0xf0 | (ch >> 18 ));
1229- * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
1230- * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
1231- * p ++ = (char )(0x80 | (ch & 0x3f ));
1232- i ++ ;
1233- cbWritten += 4 ;
1234- continue ;
1224+ ch = ((ch - 0xD800 ) << 10 | (ch2 - 0xDC00 )) + 0x10000 ;
1225+ i ++ ;
1226+ goto encodeUCS4 ;
12351227 }
12361228 /* Fall through: handles isolated high surrogates */
12371229 }
1230+
1231+ if (overalloc < 3 ) {
1232+ len = (int )(p - PyString_AS_STRING (v ));
1233+ overalloc = 3 ;
1234+ if (_PyString_Resize (& v , overalloc * size ))
1235+ goto onError ;
1236+ p = PyString_AS_STRING (v ) + len ;
1237+ }
12381238 * p ++ = (char )(0xe0 | (ch >> 12 ));
12391239 * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
12401240 * p ++ = (char )(0x80 | (ch & 0x3f ));
1241- cbWritten += 3 ;
1242-
1243- } else {
1244- * p ++ = (char )(0xf0 | (ch >>18 ));
1245- * p ++ = (char )(0x80 | ((ch >>12 ) & 0x3f ));
1246- * p ++ = (char )(0x80 | ((ch >>6 ) & 0x3f ));
1247- * p ++ = (char )(0x80 | (ch & 0x3f ));
1248- cbWritten += 4 ;
1241+ continue ;
1242+ }
1243+
1244+ /* Encode UCS4 Unicode ordinals */
1245+ encodeUCS4 :
1246+ if (overalloc < 4 ) {
1247+ len = (int )(p - PyString_AS_STRING (v ));
1248+ overalloc = 4 ;
1249+ if (_PyString_Resize (& v , overalloc * size ))
1250+ goto onError ;
1251+ p = PyString_AS_STRING (v ) + len ;
12491252 }
1253+ * p ++ = (char )(0xf0 | (ch >> 18 ));
1254+ * p ++ = (char )(0x80 | ((ch >> 12 ) & 0x3f ));
1255+ * p ++ = (char )(0x80 | ((ch >> 6 ) & 0x3f ));
1256+ * p ++ = (char )(0x80 | (ch & 0x3f ));
12501257 }
12511258 }
12521259 * p = '\0' ;
1253- if (_PyString_Resize (& v , cbWritten ))
1260+ if (_PyString_Resize (& v , ( int )( p - PyString_AS_STRING ( v )) ))
12541261 goto onError ;
12551262 return v ;
12561263
0 commit comments