@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11101110 const char * errors )
11111111{
11121112 PyUnicodeObject * v ;
1113- Py_UNICODE * p = NULL , * buf = NULL ;
1113+ Py_UNICODE * p , * buf ;
11141114 const char * end ;
1115- Py_UCS4 chr ;
1116-
1115+ char * message ;
1116+ Py_UCS4 chr = 0xffffffff ; /* in case 'getcode' messes up */
1117+
11171118 /* Escaped strings will always be longer than the resulting
11181119 Unicode string, so we start with size here and then reduce the
11191120 length after conversion to the true value. */
@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11221123 goto onError ;
11231124 if (size == 0 )
11241125 return (PyObject * )v ;
1126+
11251127 p = buf = PyUnicode_AS_UNICODE (v );
11261128 end = s + size ;
1129+
11271130 while (s < end ) {
11281131 unsigned char c ;
11291132 Py_UNICODE x ;
1130- int i ;
1133+ int i , digits ;
11311134
11321135 /* Non-escape characters are interpreted as Unicode ordinals */
11331136 if (* s != '\\' ) {
1134- * p ++ = (unsigned char )* s ++ ;
1137+ * p ++ = (unsigned char ) * s ++ ;
11351138 continue ;
11361139 }
11371140
@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11641167 * p ++ = x ;
11651168 break ;
11661169
1167- /* \xXX with two hex digits */
1170+ /* hex escapes */
1171+ /* \xXX */
11681172 case 'x' :
1169- for (x = 0 , i = 0 ; i < 2 ; i ++ ) {
1170- c = (unsigned char )s [i ];
1171- if (!isxdigit (c )) {
1172- if (unicodeescape_decoding_error (& s , & x , errors ,
1173- "truncated \\xXX" ))
1174- goto onError ;
1175- i ++ ;
1176- break ;
1177- }
1178- x = (x <<4 ) & ~0xF ;
1179- if (c >= '0' && c <= '9' )
1180- x += c - '0' ;
1181- else if (c >= 'a' && c <= 'f' )
1182- x += 10 + c - 'a' ;
1183- else
1184- x += 10 + c - 'A' ;
1185- }
1186- s += i ;
1187- * p ++ = x ;
1188- break ;
1173+ digits = 2 ;
1174+ message = "truncated \\xXX escape" ;
1175+ goto hexescape ;
11891176
1190- /* \uXXXX with 4 hex digits */
1177+ /* \uXXXX */
11911178 case 'u' :
1192- for (x = 0 , i = 0 ; i < 4 ; i ++ ) {
1193- c = (unsigned char )s [i ];
1194- if (!isxdigit (c )) {
1195- if (unicodeescape_decoding_error (& s , & x , errors ,
1196- "truncated \\uXXXX" ))
1197- goto onError ;
1198- i ++ ;
1199- break ;
1200- }
1201- x = (x <<4 ) & ~0xF ;
1202- if (c >= '0' && c <= '9' )
1203- x += c - '0' ;
1204- else if (c >= 'a' && c <= 'f' )
1205- x += 10 + c - 'a' ;
1206- else
1207- x += 10 + c - 'A' ;
1208- }
1209- s += i ;
1210- * p ++ = x ;
1211- break ;
1179+ digits = 4 ;
1180+ message = "truncated \\uXXXX escape" ;
1181+ goto hexescape ;
12121182
1213- /* \UXXXXXXXX with 8 hex digits */
1183+ /* \UXXXXXXXX */
12141184 case 'U' :
1215- for (chr = 0 , i = 0 ; i < 8 ; i ++ ) {
1216- c = (unsigned char )s [i ];
1185+ digits = 8 ;
1186+ message = "truncated \\UXXXXXXXX escape" ;
1187+ hexescape :
1188+ chr = 0 ;
1189+ for (i = 0 ; i < digits ; i ++ ) {
1190+ c = (unsigned char ) s [i ];
12171191 if (!isxdigit (c )) {
1218- if (unicodeescape_decoding_error (& s , & x , errors ,
1219- "truncated \\uXXXX" ))
1192+ if (unicodeescape_decoding_error (& s , & x , errors , message ))
12201193 goto onError ;
1194+ chr = x ;
12211195 i ++ ;
12221196 break ;
12231197 }
@@ -1230,95 +1204,80 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12301204 chr += 10 + c - 'A' ;
12311205 }
12321206 s += i ;
1233- goto store ;
1207+ store :
1208+ /* when we get here, chr is a 32-bit unicode character */
1209+ if (chr <= 0xffff )
1210+ /* UCS-2 character */
1211+ * p ++ = (Py_UNICODE ) chr ;
1212+ else if (chr <= 0x10ffff ) {
1213+ /* UCS-4 character. store as two surrogate characters */
1214+ chr -= 0x10000L ;
1215+ * p ++ = 0xD800 + (Py_UNICODE ) (chr >> 10 );
1216+ * p ++ = 0xDC00 + (Py_UNICODE ) (chr & ~0xFC00 );
1217+ } else {
1218+ if (unicodeescape_decoding_error (
1219+ & s , & x , errors ,
1220+ "illegal Unicode character" )
1221+ )
1222+ goto onError ;
1223+ * p ++ = x ; /* store replacement character */
1224+ }
1225+ break ;
12341226
1227+ /* \N{name} */
12351228 case 'N' :
1236- /* Ok, we need to deal with Unicode Character Names now,
1237- * make sure we've imported the hash table data...
1238- */
1229+ message = "malformed \\N character escape" ;
12391230 if (ucnhash_CAPI == NULL ) {
1240- PyObject * mod = 0 , * v = 0 ;
1241- mod = PyImport_ImportModule ("unicodedata" );
1242- if (mod == NULL )
1231+ /* load the unicode data module */
1232+ PyObject * m , * v ;
1233+ m = PyImport_ImportModule ("unicodedata" );
1234+ if (m == NULL )
12431235 goto ucnhashError ;
1244- v = PyObject_GetAttrString (mod , "ucnhash_CAPI" );
1245- Py_DECREF (mod );
1236+ v = PyObject_GetAttrString (m , "ucnhash_CAPI" );
1237+ Py_DECREF (m );
12461238 if (v == NULL )
12471239 goto ucnhashError ;
12481240 ucnhash_CAPI = PyCObject_AsVoidPtr (v );
12491241 Py_DECREF (v );
12501242 if (ucnhash_CAPI == NULL )
12511243 goto ucnhashError ;
12521244 }
1253-
12541245 if (* s == '{' ) {
1255- const char * start = s + 1 ;
1256- const char * endBrace = start ;
1257-
1246+ const char * start = s + 1 ;
12581247 /* look for the closing brace */
1259- while (* endBrace != '}' && endBrace < end )
1260- endBrace ++ ;
1261- if (endBrace != end && * endBrace == '}' ) {
1262- if (!ucnhash_CAPI -> getcode (start , endBrace - start , & chr )) {
1263- if (unicodeescape_decoding_error (
1264- & s , & x , errors ,
1265- "Invalid Unicode Character Name" )
1266- )
1267- goto onError ;
1268- goto ucnFallthrough ;
1269- }
1270- s = endBrace + 1 ;
1271- goto store ;
1272- } else {
1273- if (unicodeescape_decoding_error (
1274- & s , & x , errors ,
1275- "Unicode name missing closing brace" ))
1276- goto onError ;
1277- goto ucnFallthrough ;
1248+ while (* s != '}' && s < end )
1249+ s ++ ;
1250+ if (s > start && s < end && * s == '}' ) {
1251+ /* found a name. look it up in the unicode database */
1252+ message = "unknown Unicode character name" ;
1253+ s ++ ;
1254+ if (ucnhash_CAPI -> getcode (start , s - start - 1 , & chr ))
1255+ goto store ;
12781256 }
1279- break ;
12801257 }
1281- if (unicodeescape_decoding_error (
1282- & s , & x , errors ,
1283- "Missing opening brace for Unicode Character Name escape" ))
1258+ if (unicodeescape_decoding_error (& s , & x , errors , message ))
12841259 goto onError ;
1285- ucnFallthrough :
1286- /* fall through on purpose */
1287- default :
1260+ * p ++ = x ;
1261+ break ;
1262+
1263+ default :
12881264 * p ++ = '\\' ;
12891265 * p ++ = (unsigned char )s [-1 ];
12901266 break ;
1291- store :
1292- /* when we get here, chr is a 32-bit unicode character */
1293- if (chr <= 0xffff )
1294- /* UCS-2 character */
1295- * p ++ = (Py_UNICODE ) chr ;
1296- else if (chr <= 0x10ffff ) {
1297- /* UCS-4 character. store as two surrogate characters */
1298- chr -= 0x10000L ;
1299- * p ++ = 0xD800 + (Py_UNICODE ) (chr >> 10 );
1300- * p ++ = 0xDC00 + (Py_UNICODE ) (chr & ~0xFC00 );
1301- } else {
1302- if (unicodeescape_decoding_error (
1303- & s , & x , errors ,
1304- "Illegal Unicode character" )
1305- )
1306- goto onError ;
1307- }
13081267 }
13091268 }
13101269 if (_PyUnicode_Resize (v , (int )(p - buf )))
13111270 goto onError ;
13121271 return (PyObject * )v ;
13131272
1314- ucnhashError :
1273+ ucnhashError :
13151274 PyErr_SetString (
13161275 PyExc_UnicodeError ,
13171276 "\\N escapes not supported (can't load unicodedata module)"
13181277 );
13191278 return NULL ;
13201279
1321- onError :
1280+ onError :
13221281 Py_XDECREF (v );
13231282 return NULL ;
13241283}
0 commit comments