Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 68e6933

Browse files
committed
Bug fix for UTF-8 encoding bug (buffer overrun) #541828.
1 parent e3c764b commit 68e6933

1 file changed

Lines changed: 46 additions & 39 deletions

File tree

Objects/unicodeobject.c

Lines changed: 46 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,85 +1172,92 @@ int utf8_encoding_error(const Py_UNICODE **source,
11721172
}
11731173
#endif
11741174

1175+
/* Allocation strategy: we default to Latin-1, then do one resize
1176+
whenever we hit an order boundary. The assumption is that
1177+
characters from higher orders usually occur often enough to warrant
1178+
this.
1179+
*/
1180+
11751181
PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
11761182
int size,
11771183
const char *errors)
11781184
{
11791185
PyObject *v;
11801186
char *p;
1181-
unsigned int cbAllocated = 2 * size;
1182-
unsigned int cbWritten = 0;
11831187
int i = 0;
1184-
1188+
int overalloc = 2;
1189+
int len;
1190+
11851191
/* Short-cut for empty strings */
11861192
if (size == 0)
11871193
return PyString_FromStringAndSize(NULL, 0);
11881194

1189-
/* We allocate 4 more bytes to have room for at least one full
1190-
UTF-8 sequence; saves a few cycles in the loop below */
1191-
v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
1195+
v = PyString_FromStringAndSize(NULL, overalloc * size);
11921196
if (v == NULL)
11931197
return NULL;
11941198

11951199
p = PyString_AS_STRING(v);
1200+
11961201
while (i < size) {
11971202
Py_UCS4 ch = s[i++];
11981203

1199-
if (ch < 0x80) {
1204+
if (ch < 0x80)
1205+
/* Encode ASCII */
12001206
*p++ = (char) ch;
1201-
cbWritten++;
1202-
}
12031207

12041208
else if (ch < 0x0800) {
1209+
/* Encode 2-byte sequences (U+0080..U+07FF, which includes Latin-1) */
12051210
*p++ = (char)(0xc0 | (ch >> 6));
12061211
*p++ = (char)(0x80 | (ch & 0x3f));
1207-
cbWritten += 2;
12081212
}
12091213

12101214
else {
1211-
1212-
/* Assure that we have enough room for high order Unicode
1213-
ordinals */
1214-
if (cbWritten >= cbAllocated) {
1215-
cbAllocated += 4 * 10;
1216-
if (_PyString_Resize(&v, cbAllocated + 4))
1217-
goto onError;
1218-
p = PyString_AS_STRING(v) + cbWritten;
1219-
}
1220-
1215+
/* Encode UCS2 Unicode ordinals */
12211216
if (ch < 0x10000) {
1222-
/* Check for high surrogate */
1217+
1218+
/* Special case: check for high surrogate */
12231219
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
12241220
Py_UCS4 ch2 = s[i];
1225-
/* Check for low surrogate */
1221+
/* Check for low surrogate and combine the two to
1222+
form a UCS4 value */
12261223
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1227-
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x00010000;
1228-
*p++ = (char)(0xf0 | (ch >> 18));
1229-
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1230-
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1231-
*p++ = (char)(0x80 | (ch & 0x3f));
1232-
i++;
1233-
cbWritten += 4;
1234-
continue;
1224+
ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1225+
i++;
1226+
goto encodeUCS4;
12351227
}
12361228
/* Fall through: handles isolated high surrogates */
12371229
}
1230+
1231+
if (overalloc < 3) {
1232+
len = (int)(p - PyString_AS_STRING(v));
1233+
overalloc = 3;
1234+
if (_PyString_Resize(&v, overalloc * size))
1235+
goto onError;
1236+
p = PyString_AS_STRING(v) + len;
1237+
}
12381238
*p++ = (char)(0xe0 | (ch >> 12));
12391239
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
12401240
*p++ = (char)(0x80 | (ch & 0x3f));
1241-
cbWritten += 3;
1242-
1243-
} else {
1244-
*p++ = (char)(0xf0 | (ch>>18));
1245-
*p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1246-
*p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1247-
*p++ = (char)(0x80 | (ch & 0x3f));
1248-
cbWritten += 4;
1241+
continue;
1242+
}
1243+
1244+
/* Encode UCS4 Unicode ordinals */
1245+
encodeUCS4:
1246+
if (overalloc < 4) {
1247+
len = (int)(p - PyString_AS_STRING(v));
1248+
overalloc = 4;
1249+
if (_PyString_Resize(&v, overalloc * size))
1250+
goto onError;
1251+
p = PyString_AS_STRING(v) + len;
12491252
}
1253+
*p++ = (char)(0xf0 | (ch >> 18));
1254+
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1255+
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1256+
*p++ = (char)(0x80 | (ch & 0x3f));
12501257
}
12511258
}
12521259
*p = '\0';
1253-
if (_PyString_Resize(&v, cbWritten))
1260+
if (_PyString_Resize(&v, (int)(p - PyString_AS_STRING(v))))
12541261
goto onError;
12551262
return v;
12561263

0 commit comments

Comments
 (0)