@@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11631163 PyUnicodeObject * v ;
11641164 Py_UNICODE * p = NULL , * buf = NULL ;
11651165 const char * end ;
1166+ Py_UCS4 chr ;
11661167
11671168 /* Escaped strings will always be longer than the resulting
11681169 Unicode string, so we start with size here and then reduce the
@@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12141215 * p ++ = x ;
12151216 break ;
12161217
1217- /* \xXXXX escape with 1-n hex digits. for compatibility
1218- with 8-bit strings, this code ignores all but the last
1219- two digits */
1218+ /* \xXX with two hex digits */
12201219 case 'x' :
1221- x = 0 ;
1222- c = (unsigned char )* s ;
1223- if (isxdigit (c )) {
1224- do {
1225- x = (x <<4 ) & 0xF0 ;
1226- if ('0' <= c && c <= '9' )
1227- x += c - '0' ;
1228- else if ('a' <= c && c <= 'f' )
1229- x += 10 + c - 'a' ;
1230- else
1231- x += 10 + c - 'A' ;
1232- c = (unsigned char )* ++ s ;
1233- } while (isxdigit (c ));
1234- * p ++ = (unsigned char ) x ;
1235- } else {
1236- * p ++ = '\\' ;
1237- * p ++ = (unsigned char )s [-1 ];
1220+ for (x = 0 , i = 0 ; i < 2 ; i ++ ) {
1221+ c = (unsigned char )s [i ];
1222+ if (!isxdigit (c )) {
1223+ if (unicodeescape_decoding_error (& s , & x , errors ,
1224+ "truncated \\xXX" ))
1225+ goto onError ;
1226+ i ++ ;
1227+ break ;
1228+ }
1229+ x = (x <<4 ) & ~0xF ;
1230+ if (c >= '0' && c <= '9' )
1231+ x += c - '0' ;
1232+ else if (c >= 'a' && c <= 'f' )
1233+ x += 10 + c - 'a' ;
1234+ else
1235+ x += 10 + c - 'A' ;
12381236 }
1237+ s += i ;
1238+ * p ++ = x ;
12391239 break ;
12401240
12411241 /* \uXXXX with 4 hex digits */
@@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12611261 * p ++ = x ;
12621262 break ;
12631263
1264+ /* \UXXXXXXXX with 8 hex digits */
1265+ case 'U' :
1266+ for (chr = 0 , i = 0 ; i < 8 ; i ++ ) {
1267+ c = (unsigned char )s [i ];
1268+ if (!isxdigit (c )) {
1269+ if (unicodeescape_decoding_error (& s , & x , errors ,
1270+ "truncated \\uXXXX" ))
1271+ goto onError ;
1272+ i ++ ;
1273+ break ;
1274+ }
1275+ chr = (chr <<4 ) & ~0xF ;
1276+ if (c >= '0' && c <= '9' )
1277+ chr += c - '0' ;
1278+ else if (c >= 'a' && c <= 'f' )
1279+ chr += 10 + c - 'a' ;
1280+ else
1281+ chr += 10 + c - 'A' ;
1282+ }
1283+ s += i ;
1284+ goto store ;
1285+
12641286 case 'N' :
12651287 /* Ok, we need to deal with Unicode Character Names now,
12661288 * make sure we've imported the hash table data...
12671289 */
1268- if (pucnHash == NULL )
1269- {
1290+ if (pucnHash == NULL ) {
12701291 PyObject * mod = 0 , * v = 0 ;
1271-
12721292 mod = PyImport_ImportModule ("ucnhash" );
12731293 if (mod == NULL )
12741294 goto onError ;
12751295 v = PyObject_GetAttrString (mod ,"ucnhashAPI" );
12761296 Py_DECREF (mod );
12771297 if (v == NULL )
1278- {
12791298 goto onError ;
1280- }
12811299 pucnHash = PyCObject_AsVoidPtr (v );
12821300 Py_DECREF (v );
12831301 if (pucnHash == NULL )
1284- {
12851302 goto onError ;
1286- }
12871303 }
12881304
1289- if (* s == '{' )
1290- {
1305+ if (* s == '{' ) {
12911306 const char * start = s + 1 ;
12921307 const char * endBrace = start ;
1293- Py_UCS4 value ;
12941308 unsigned long j ;
12951309
12961310 /* look for either the closing brace, or we
@@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13031317 {
13041318 endBrace ++ ;
13051319 }
1306- if (endBrace != end && * endBrace == '}' )
1307- {
1320+ if (endBrace != end && * endBrace == '}' ) {
13081321 j = pucnHash -> hash (start , endBrace - start );
13091322 if (j > pucnHash -> cKeys ||
13101323 mystrnicmp (
@@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13211334 }
13221335 goto ucnFallthrough ;
13231336 }
1324- value = ((_Py_UnicodeCharacterName * )
1325- (pucnHash -> getValue (j )))-> value ;
1326- if (value < 1 <<16 )
1327- {
1328- /* In UCS-2 range, easy solution.. */
1329- * p ++ = value ;
1330- }
1331- else
1332- {
1333- /* Oops, its in UCS-4 space, */
1334- /* compute and append the two surrogates: */
1335- /* translate from 10000..10FFFF to 0..FFFFF */
1336- value -= 0x10000 ;
1337-
1338- /* high surrogate = top 10 bits added to D800 */
1339- * p ++ = 0xD800 + (value >> 10 );
1340-
1341- /* low surrogate = bottom 10 bits added to DC00 */
1342- * p ++ = 0xDC00 + (value & ~0xFC00 );
1343- }
1337+ chr = ((_Py_UnicodeCharacterName * )
1338+ (pucnHash -> getValue (j )))-> value ;
13441339 s = endBrace + 1 ;
1345- }
1346- else
1347- {
1340+ goto store ;
1341+ } else {
13481342 if (unicodeescape_decoding_error (
13491343 & s , & x , errors ,
13501344 "Unicode name missing closing brace" ))
@@ -1363,6 +1357,23 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13631357 * p ++ = '\\' ;
13641358 * p ++ = (unsigned char )s [-1 ];
13651359 break ;
1360+ store :
1361+ /* when we get here, chr is a 32-bit unicode character */
1362+ if (chr <= 0xffff )
1363+ /* UCS-2 character */
1364+ * p ++ = (Py_UNICODE ) chr ;
1365+ else if (chr <= 0x10ffff ) {
1366+ /* UCS-4 character. store as two surrogate characters */
1367+ chr -= 0x10000L ;
1368+ * p ++ = 0xD800 + (Py_UNICODE ) (chr >> 10 );
1369+ * p ++ = 0xDC00 + (Py_UNICODE ) (chr & ~0xFC00 );
1370+ } else {
1371+ if (unicodeescape_decoding_error (
1372+ & s , & x , errors ,
1373+ "Illegal Unicode character" )
1374+ )
1375+ goto onError ;
1376+ }
13661377 }
13671378 }
13681379 if (_PyUnicode_Resize (v , (int )(p - buf )))
0 commit comments