Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit df84675

Browse files
author
Fredrik Lundh
committed
Changed \x to consume exactly two hex digits, also for Unicode
strings. Closes PEP 223. Also added the \U escape (eight hex digits).
1 parent 03dd010 commit df84675

1 file changed

Lines changed: 66 additions & 55 deletions

File tree

Objects/unicodeobject.c

Lines changed: 66 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
11631163
PyUnicodeObject *v;
11641164
Py_UNICODE *p = NULL, *buf = NULL;
11651165
const char *end;
1166+
Py_UCS4 chr;
11661167

11671168
/* Escaped strings will always be longer than the resulting
11681169
Unicode string, so we start with size here and then reduce the
@@ -1214,28 +1215,27 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12141215
*p++ = x;
12151216
break;
12161217

1217-
/* \xXXXX escape with 1-n hex digits. for compatibility
1218-
with 8-bit strings, this code ignores all but the last
1219-
two digits */
1218+
/* \xXX with two hex digits */
12201219
case 'x':
1221-
x = 0;
1222-
c = (unsigned char)*s;
1223-
if (isxdigit(c)) {
1224-
do {
1225-
x = (x<<4) & 0xF0;
1226-
if ('0' <= c && c <= '9')
1227-
x += c - '0';
1228-
else if ('a' <= c && c <= 'f')
1229-
x += 10 + c - 'a';
1230-
else
1231-
x += 10 + c - 'A';
1232-
c = (unsigned char)*++s;
1233-
} while (isxdigit(c));
1234-
*p++ = (unsigned char) x;
1235-
} else {
1236-
*p++ = '\\';
1237-
*p++ = (unsigned char)s[-1];
1220+
for (x = 0, i = 0; i < 2; i++) {
1221+
c = (unsigned char)s[i];
1222+
if (!isxdigit(c)) {
1223+
if (unicodeescape_decoding_error(&s, &x, errors,
1224+
"truncated \\xXX"))
1225+
goto onError;
1226+
i++;
1227+
break;
1228+
}
1229+
x = (x<<4) & ~0xF;
1230+
if (c >= '0' && c <= '9')
1231+
x += c - '0';
1232+
else if (c >= 'a' && c <= 'f')
1233+
x += 10 + c - 'a';
1234+
else
1235+
x += 10 + c - 'A';
12381236
}
1237+
s += i;
1238+
*p++ = x;
12391239
break;
12401240

12411241
/* \uXXXX with 4 hex digits */
@@ -1261,36 +1261,50 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
12611261
*p++ = x;
12621262
break;
12631263

1264+
/* \UXXXXXXXX with 8 hex digits */
1265+
case 'U':
1266+
for (chr = 0, i = 0; i < 8; i++) {
1267+
c = (unsigned char)s[i];
1268+
if (!isxdigit(c)) {
1269+
if (unicodeescape_decoding_error(&s, &x, errors,
1270+
"truncated \\uXXXX"))
1271+
goto onError;
1272+
i++;
1273+
break;
1274+
}
1275+
chr = (chr<<4) & ~0xF;
1276+
if (c >= '0' && c <= '9')
1277+
chr += c - '0';
1278+
else if (c >= 'a' && c <= 'f')
1279+
chr += 10 + c - 'a';
1280+
else
1281+
chr += 10 + c - 'A';
1282+
}
1283+
s += i;
1284+
goto store;
1285+
12641286
case 'N':
12651287
/* Ok, we need to deal with Unicode Character Names now,
12661288
* make sure we've imported the hash table data...
12671289
*/
1268-
if (pucnHash == NULL)
1269-
{
1290+
if (pucnHash == NULL) {
12701291
PyObject *mod = 0, *v = 0;
1271-
12721292
mod = PyImport_ImportModule("ucnhash");
12731293
if (mod == NULL)
12741294
goto onError;
12751295
v = PyObject_GetAttrString(mod,"ucnhashAPI");
12761296
Py_DECREF(mod);
12771297
if (v == NULL)
1278-
{
12791298
goto onError;
1280-
}
12811299
pucnHash = PyCObject_AsVoidPtr(v);
12821300
Py_DECREF(v);
12831301
if (pucnHash == NULL)
1284-
{
12851302
goto onError;
1286-
}
12871303
}
12881304

1289-
if (*s == '{')
1290-
{
1305+
if (*s == '{') {
12911306
const char *start = s + 1;
12921307
const char *endBrace = start;
1293-
Py_UCS4 value;
12941308
unsigned long j;
12951309

12961310
/* look for either the closing brace, or we
@@ -1303,8 +1317,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13031317
{
13041318
endBrace++;
13051319
}
1306-
if (endBrace != end && *endBrace == '}')
1307-
{
1320+
if (endBrace != end && *endBrace == '}') {
13081321
j = pucnHash->hash(start, endBrace - start);
13091322
if (j > pucnHash->cKeys ||
13101323
mystrnicmp(
@@ -1321,30 +1334,11 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13211334
}
13221335
goto ucnFallthrough;
13231336
}
1324-
value = ((_Py_UnicodeCharacterName *)
1325-
(pucnHash->getValue(j)))->value;
1326-
if (value < 1<<16)
1327-
{
1328-
/* In UCS-2 range, easy solution.. */
1329-
*p++ = value;
1330-
}
1331-
else
1332-
{
1333-
/* Oops, its in UCS-4 space, */
1334-
/* compute and append the two surrogates: */
1335-
/* translate from 10000..10FFFF to 0..FFFFF */
1336-
value -= 0x10000;
1337-
1338-
/* high surrogate = top 10 bits added to D800 */
1339-
*p++ = 0xD800 + (value >> 10);
1340-
1341-
/* low surrogate = bottom 10 bits added to DC00 */
1342-
*p++ = 0xDC00 + (value & ~0xFC00);
1343-
}
1337+
chr = ((_Py_UnicodeCharacterName *)
1338+
(pucnHash->getValue(j)))->value;
13441339
s = endBrace + 1;
1345-
}
1346-
else
1347-
{
1340+
goto store;
1341+
} else {
13481342
if (unicodeescape_decoding_error(
13491343
&s, &x, errors,
13501344
"Unicode name missing closing brace"))
@@ -1363,6 +1357,23 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
13631357
*p++ = '\\';
13641358
*p++ = (unsigned char)s[-1];
13651359
break;
1360+
store:
1361+
/* when we get here, chr is a 32-bit unicode character */
1362+
if (chr <= 0xffff)
1363+
/* UCS-2 character */
1364+
*p++ = (Py_UNICODE) chr;
1365+
else if (chr <= 0x10ffff) {
1366+
/* UCS-4 character. store as two surrogate characters */
1367+
chr -= 0x10000L;
1368+
*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1369+
*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1370+
} else {
1371+
if (unicodeescape_decoding_error(
1372+
&s, &x, errors,
1373+
"Illegal Unicode character")
1374+
)
1375+
goto onError;
1376+
}
13661377
}
13671378
}
13681379
if (_PyUnicode_Resize(v, (int)(p - buf)))

0 commit comments

Comments
 (0)