Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ccc7473

Browse files
author
Fredrik Lundh
committed
reorganized PyUnicode_DecodeUnicodeEscape a bit (in order to make it
less likely that bug #132817 ever appears again)
1 parent b95896b commit ccc7473

1 file changed

Lines changed: 69 additions & 110 deletions

File tree

Objects/unicodeobject.c

Lines changed: 69 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,10 +1110,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11101110
const char *errors)
11111111
{
11121112
PyUnicodeObject *v;
1113-
Py_UNICODE *p = NULL, *buf = NULL;
1113+
Py_UNICODE *p, *buf;
11141114
const char *end;
1115-
Py_UCS4 chr;
1116-
1115+
char* message;
1116+
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1117+
11171118
/* Escaped strings will always be longer than the resulting
11181119
Unicode string, so we start with size here and then reduce the
11191120
length after conversion to the true value. */
@@ -1122,16 +1123,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11221123
goto onError;
11231124
if (size == 0)
11241125
return (PyObject *)v;
1126+
11251127
p = buf = PyUnicode_AS_UNICODE(v);
11261128
end = s + size;
1129+
11271130
while (s < end) {
11281131
unsigned char c;
11291132
Py_UNICODE x;
1130-
int i;
1133+
int i, digits;
11311134

11321135
/* Non-escape characters are interpreted as Unicode ordinals */
11331136
if (*s != '\\') {
1134-
*p++ = (unsigned char)*s++;
1137+
*p++ = (unsigned char) *s++;
11351138
continue;
11361139
}
11371140

@@ -1164,60 +1167,31 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11641167
*p++ = x;
11651168
break;
11661169

1167-
/* \xXX with two hex digits */
1170+
/* hex escapes */
1171+
/* \xXX */
11681172
case 'x':
1169-
for (x = 0, i = 0; i < 2; i++) {
1170-
c = (unsigned char)s[i];
1171-
if (!isxdigit(c)) {
1172-
if (unicodeescape_decoding_error(&s, &x, errors,
1173-
"truncated \\xXX"))
1174-
goto onError;
1175-
i++;
1176-
break;
1177-
}
1178-
x = (x<<4) & ~0xF;
1179-
if (c >= '0' && c <= '9')
1180-
x += c - '0';
1181-
else if (c >= 'a' && c <= 'f')
1182-
x += 10 + c - 'a';
1183-
else
1184-
x += 10 + c - 'A';
1185-
}
1186-
s += i;
1187-
*p++ = x;
1188-
break;
1173+
digits = 2;
1174+
message = "truncated \\xXX escape";
1175+
goto hexescape;
11891176

1190-
/* \uXXXX with 4 hex digits */
1177+
/* \uXXXX */
11911178
case 'u':
1192-
for (x = 0, i = 0; i < 4; i++) {
1193-
c = (unsigned char)s[i];
1194-
if (!isxdigit(c)) {
1195-
if (unicodeescape_decoding_error(&s, &x, errors,
1196-
"truncated \\uXXXX"))
1197-
goto onError;
1198-
i++;
1199-
break;
1200-
}
1201-
x = (x<<4) & ~0xF;
1202-
if (c >= '0' && c <= '9')
1203-
x += c - '0';
1204-
else if (c >= 'a' && c <= 'f')
1205-
x += 10 + c - 'a';
1206-
else
1207-
x += 10 + c - 'A';
1208-
}
1209-
s += i;
1210-
*p++ = x;
1211-
break;
1179+
digits = 4;
1180+
message = "truncated \\uXXXX escape";
1181+
goto hexescape;
12121182

1213-
/* \UXXXXXXXX with 8 hex digits */
1183+
/* \UXXXXXXXX */
12141184
case 'U':
1215-
for (chr = 0, i = 0; i < 8; i++) {
1216-
c = (unsigned char)s[i];
1185+
digits = 8;
1186+
message = "truncated \\UXXXXXXXX escape";
1187+
hexescape:
1188+
chr = 0;
1189+
for (i = 0; i < digits; i++) {
1190+
c = (unsigned char) s[i];
12171191
if (!isxdigit(c)) {
1218-
if (unicodeescape_decoding_error(&s, &x, errors,
1219-
"truncated \\uXXXX"))
1192+
if (unicodeescape_decoding_error(&s, &x, errors, message))
12201193
goto onError;
1194+
chr = x;
12211195
i++;
12221196
break;
12231197
}
@@ -1230,95 +1204,80 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12301204
chr += 10 + c - 'A';
12311205
}
12321206
s += i;
1233-
goto store;
1207+
store:
1208+
/* when we get here, chr is a 32-bit unicode character */
1209+
if (chr <= 0xffff)
1210+
/* UCS-2 character */
1211+
*p++ = (Py_UNICODE) chr;
1212+
else if (chr <= 0x10ffff) {
1213+
/* UCS-4 character. store as two surrogate characters */
1214+
chr -= 0x10000L;
1215+
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1216+
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1217+
} else {
1218+
if (unicodeescape_decoding_error(
1219+
&s, &x, errors,
1220+
"illegal Unicode character")
1221+
)
1222+
goto onError;
1223+
*p++ = x; /* store replacement character */
1224+
}
1225+
break;
12341226

1227+
/* \N{name} */
12351228
case 'N':
1236-
/* Ok, we need to deal with Unicode Character Names now,
1237-
* make sure we've imported the hash table data...
1238-
*/
1229+
message = "malformed \\N character escape";
12391230
if (ucnhash_CAPI == NULL) {
1240-
PyObject *mod = 0, *v = 0;
1241-
mod = PyImport_ImportModule("unicodedata");
1242-
if (mod == NULL)
1231+
/* load the unicode data module */
1232+
PyObject *m, *v;
1233+
m = PyImport_ImportModule("unicodedata");
1234+
if (m == NULL)
12431235
goto ucnhashError;
1244-
v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
1245-
Py_DECREF(mod);
1236+
v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1237+
Py_DECREF(m);
12461238
if (v == NULL)
12471239
goto ucnhashError;
12481240
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
12491241
Py_DECREF(v);
12501242
if (ucnhash_CAPI == NULL)
12511243
goto ucnhashError;
12521244
}
1253-
12541245
if (*s == '{') {
1255-
const char *start = s + 1;
1256-
const char *endBrace = start;
1257-
1246+
const char *start = s+1;
12581247
/* look for the closing brace */
1259-
while (*endBrace != '}' && endBrace < end)
1260-
endBrace++;
1261-
if (endBrace != end && *endBrace == '}') {
1262-
if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
1263-
if (unicodeescape_decoding_error(
1264-
&s, &x, errors,
1265-
"Invalid Unicode Character Name")
1266-
)
1267-
goto onError;
1268-
goto ucnFallthrough;
1269-
}
1270-
s = endBrace + 1;
1271-
goto store;
1272-
} else {
1273-
if (unicodeescape_decoding_error(
1274-
&s, &x, errors,
1275-
"Unicode name missing closing brace"))
1276-
goto onError;
1277-
goto ucnFallthrough;
1248+
while (*s != '}' && s < end)
1249+
s++;
1250+
if (s > start && s < end && *s == '}') {
1251+
/* found a name. look it up in the unicode database */
1252+
message = "unknown Unicode character name";
1253+
s++;
1254+
if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1255+
goto store;
12781256
}
1279-
break;
12801257
}
1281-
if (unicodeescape_decoding_error(
1282-
&s, &x, errors,
1283-
"Missing opening brace for Unicode Character Name escape"))
1258+
if (unicodeescape_decoding_error(&s, &x, errors, message))
12841259
goto onError;
1285-
ucnFallthrough:
1286-
/* fall through on purpose */
1287-
default:
1260+
*p++ = x;
1261+
break;
1262+
1263+
default:
12881264
*p++ = '\\';
12891265
*p++ = (unsigned char)s[-1];
12901266
break;
1291-
store:
1292-
/* when we get here, chr is a 32-bit unicode character */
1293-
if (chr <= 0xffff)
1294-
/* UCS-2 character */
1295-
*p++ = (Py_UNICODE) chr;
1296-
else if (chr <= 0x10ffff) {
1297-
/* UCS-4 character. store as two surrogate characters */
1298-
chr -= 0x10000L;
1299-
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300-
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301-
} else {
1302-
if (unicodeescape_decoding_error(
1303-
&s, &x, errors,
1304-
"Illegal Unicode character")
1305-
)
1306-
goto onError;
1307-
}
13081267
}
13091268
}
13101269
if (_PyUnicode_Resize(v, (int)(p - buf)))
13111270
goto onError;
13121271
return (PyObject *)v;
13131272

1314-
ucnhashError:
1273+
ucnhashError:
13151274
PyErr_SetString(
13161275
PyExc_UnicodeError,
13171276
"\\N escapes not supported (can't load unicodedata module)"
13181277
);
13191278
return NULL;
13201279

1321-
onError:
1280+
onError:
13221281
Py_XDECREF(v);
13231282
return NULL;
13241283
}

0 commit comments

Comments
 (0)