Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e78178e

Browse files
committed
Bytes (which are the input for decoding) are mutable now. If a decoding
error callback changes the bytes object in the exception, the decoder might use memory that's no longer in use. Change unicode_decode_call_errorhandler() so that it fetches the addresses of the bytes array (start and end) from the exception object and passes them back to the caller.
1 parent 2dbde5e commit e78178e

2 files changed

Lines changed: 68 additions & 19 deletions

File tree

Lib/test/test_codeccallbacks.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,39 @@ def test_bug828737(self):
806806
text = 'abc<def>ghi'*n
807807
text.translate(charmap)
808808

809+
def test_mutatingdecodehandler(self):
810+
baddata = [
811+
("ascii", b"\xff"),
812+
("utf-7", b"++"),
813+
("utf-8", b"\xff"),
814+
("utf-16", b"\xff"),
815+
("unicode-escape", b"\\u123g"),
816+
("raw-unicode-escape", b"\\u123g"),
817+
("unicode-internal", b"\xff"),
818+
]
819+
820+
def replacing(exc):
821+
if isinstance(exc, UnicodeDecodeError):
822+
exc.object = 42
823+
return ("\u4242", 0)
824+
else:
825+
raise TypeError("don't know how to handle %r" % exc)
826+
codecs.register_error("test.replacing", replacing)
827+
for (encoding, data) in baddata:
828+
self.assertRaises(TypeError, data.decode, encoding, "test.replacing")
829+
830+
def mutating(exc):
831+
if isinstance(exc, UnicodeDecodeError):
832+
exc.object[:] = b""
833+
return ("\u4242", 0)
834+
else:
835+
raise TypeError("don't know how to handle %r" % exc)
836+
codecs.register_error("test.mutating", mutating)
837+
# If the decoder doesn't pick up the modified input the following
838+
# will lead to an endless loop
839+
for (encoding, data) in baddata:
840+
self.assertRaises(TypeError, data.decode, encoding, "test.replacing")
841+
809842
def test_main():
810843
test.test_support.run_unittest(CodecCallbackTest)
811844

Objects/unicodeobject.c

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,17 +1269,19 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
12691269
static
12701270
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
12711271
const char *encoding, const char *reason,
1272-
const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1272+
const char **input, const char **inend, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
12731273
PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
12741274
{
12751275
static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
12761276

12771277
PyObject *restuple = NULL;
12781278
PyObject *repunicode = NULL;
12791279
Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1280+
Py_ssize_t insize;
12801281
Py_ssize_t requiredsize;
12811282
Py_ssize_t newpos;
12821283
Py_UNICODE *repptr;
1284+
PyObject *inputobj = NULL;
12831285
Py_ssize_t repsize;
12841286
int res = -1;
12851287

@@ -1291,7 +1293,7 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
12911293

12921294
if (*exceptionObject == NULL) {
12931295
*exceptionObject = PyUnicodeDecodeError_Create(
1294-
encoding, input, insize, *startinpos, *endinpos, reason);
1296+
encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
12951297
if (*exceptionObject == NULL)
12961298
goto onError;
12971299
}
@@ -1313,6 +1315,19 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
13131315
}
13141316
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
13151317
goto onError;
1318+
1319+
/* Copy back the bytes variables, which might have been modified by the
1320+
callback */
1321+
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1322+
if (!inputobj)
1323+
goto onError;
1324+
if (!PyBytes_Check(inputobj)) {
1325+
PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1326+
}
1327+
*input = PyBytes_AS_STRING(inputobj);
1328+
insize = PyBytes_GET_SIZE(inputobj);
1329+
*inend = *input + insize;
1330+
13161331
if (newpos<0)
13171332
newpos = insize+newpos;
13181333
if (newpos<0 || newpos>insize) {
@@ -1335,10 +1350,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
13351350
*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
13361351
}
13371352
*endinpos = newpos;
1338-
*inptr = input + newpos;
1353+
*inptr = *input + newpos;
13391354
Py_UNICODE_COPY(*outptr, repptr, repsize);
13401355
*outptr += repsize;
13411356
*outpos += repsize;
1357+
13421358
/* we made it! */
13431359
res = 0;
13441360

@@ -1503,7 +1519,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
15031519
else if (SPECIAL(ch,0,0)) {
15041520
errmsg = "unexpected special character";
15051521
s++;
1506-
goto utf7Error;
1522+
goto utf7Error;
15071523
}
15081524
else {
15091525
*p++ = ch;
@@ -1516,7 +1532,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
15161532
if (unicode_decode_call_errorhandler(
15171533
errors, &errorHandler,
15181534
"utf7", errmsg,
1519-
starts, size, &startinpos, &endinpos, &exc, &s,
1535+
&starts, &e, &startinpos, &endinpos, &exc, &s,
15201536
(PyObject **)&unicode, &outpos, &p))
15211537
goto onError;
15221538
}
@@ -1527,7 +1543,7 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
15271543
if (unicode_decode_call_errorhandler(
15281544
errors, &errorHandler,
15291545
"utf7", "unterminated shift sequence",
1530-
starts, size, &startinpos, &endinpos, &exc, &s,
1546+
&starts, &e, &startinpos, &endinpos, &exc, &s,
15311547
(PyObject **)&unicode, &outpos, &p))
15321548
goto onError;
15331549
if (s < e)
@@ -1848,7 +1864,7 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
18481864
if (unicode_decode_call_errorhandler(
18491865
errors, &errorHandler,
18501866
"utf8", errmsg,
1851-
starts, size, &startinpos, &endinpos, &exc, &s,
1867+
&starts, &e, &startinpos, &endinpos, &exc, &s,
18521868
(PyObject **)&unicode, &outpos, &p))
18531869
goto onError;
18541870
}
@@ -2132,7 +2148,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
21322148
if (unicode_decode_call_errorhandler(
21332149
errors, &errorHandler,
21342150
"utf16", errmsg,
2135-
starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2151+
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
21362152
(PyObject **)&unicode, &outpos, &p))
21372153
goto onError;
21382154
}
@@ -2342,7 +2358,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
23422358
if (unicode_decode_call_errorhandler(
23432359
errors, &errorHandler,
23442360
"unicodeescape", "end of string in escape sequence",
2345-
starts, size, &startinpos, &endinpos, &exc, &s,
2361+
&starts, &end, &startinpos, &endinpos, &exc, &s,
23462362
(PyObject **)&v, &outpos, &p))
23472363
goto onError;
23482364
goto nextByte;
@@ -2354,7 +2370,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
23542370
if (unicode_decode_call_errorhandler(
23552371
errors, &errorHandler,
23562372
"unicodeescape", message,
2357-
starts, size, &startinpos, &endinpos, &exc, &s,
2373+
&starts, &end, &startinpos, &endinpos, &exc, &s,
23582374
(PyObject **)&v, &outpos, &p))
23592375
goto onError;
23602376
goto nextByte;
@@ -2393,7 +2409,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
23932409
if (unicode_decode_call_errorhandler(
23942410
errors, &errorHandler,
23952411
"unicodeescape", "illegal Unicode character",
2396-
starts, size, &startinpos, &endinpos, &exc, &s,
2412+
&starts, &end, &startinpos, &endinpos, &exc, &s,
23972413
(PyObject **)&v, &outpos, &p))
23982414
goto onError;
23992415
}
@@ -2435,7 +2451,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
24352451
if (unicode_decode_call_errorhandler(
24362452
errors, &errorHandler,
24372453
"unicodeescape", message,
2438-
starts, size, &startinpos, &endinpos, &exc, &s,
2454+
&starts, &end, &startinpos, &endinpos, &exc, &s,
24392455
(PyObject **)&v, &outpos, &p))
24402456
goto onError;
24412457
break;
@@ -2449,7 +2465,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
24492465
if (unicode_decode_call_errorhandler(
24502466
errors, &errorHandler,
24512467
"unicodeescape", message,
2452-
starts, size, &startinpos, &endinpos, &exc, &s,
2468+
&starts, &end, &startinpos, &endinpos, &exc, &s,
24532469
(PyObject **)&v, &outpos, &p))
24542470
goto onError;
24552471
}
@@ -2728,7 +2744,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
27282744
if (unicode_decode_call_errorhandler(
27292745
errors, &errorHandler,
27302746
"rawunicodeescape", "truncated \\uXXXX",
2731-
starts, size, &startinpos, &endinpos, &exc, &s,
2747+
&starts, &end, &startinpos, &endinpos, &exc, &s,
27322748
(PyObject **)&v, &outpos, &p))
27332749
goto onError;
27342750
goto nextByte;
@@ -2746,7 +2762,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
27462762
if (unicode_decode_call_errorhandler(
27472763
errors, &errorHandler,
27482764
"rawunicodeescape", "\\Uxxxxxxxx out of range",
2749-
starts, size, &startinpos, &endinpos, &exc, &s,
2765+
&starts, &end, &startinpos, &endinpos, &exc, &s,
27502766
(PyObject **)&v, &outpos, &p))
27512767
goto onError;
27522768
}
@@ -2897,7 +2913,7 @@ PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
28972913
if (unicode_decode_call_errorhandler(
28982914
errors, &errorHandler,
28992915
"unicode_internal", reason,
2900-
starts, size, &startinpos, &endinpos, &exc, &s,
2916+
&starts, &end, &startinpos, &endinpos, &exc, &s,
29012917
(PyObject **)&v, &outpos, &p)) {
29022918
goto onError;
29032919
}
@@ -3277,7 +3293,7 @@ PyObject *PyUnicode_DecodeASCII(const char *s,
32773293
if (unicode_decode_call_errorhandler(
32783294
errors, &errorHandler,
32793295
"ascii", "ordinal not in range(128)",
3280-
starts, size, &startinpos, &endinpos, &exc, &s,
3296+
&starts, &e, &startinpos, &endinpos, &exc, &s,
32813297
(PyObject **)&v, &outpos, &p))
32823298
goto onError;
32833299
}
@@ -3578,7 +3594,7 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
35783594
if (unicode_decode_call_errorhandler(
35793595
errors, &errorHandler,
35803596
"charmap", "character maps to <undefined>",
3581-
starts, size, &startinpos, &endinpos, &exc, &s,
3597+
&starts, &e, &startinpos, &endinpos, &exc, &s,
35823598
(PyObject **)&v, &outpos, &p)) {
35833599
goto onError;
35843600
}
@@ -3628,7 +3644,7 @@ PyObject *PyUnicode_DecodeCharmap(const char *s,
36283644
if (unicode_decode_call_errorhandler(
36293645
errors, &errorHandler,
36303646
"charmap", "character maps to <undefined>",
3631-
starts, size, &startinpos, &endinpos, &exc, &s,
3647+
&starts, &e, &startinpos, &endinpos, &exc, &s,
36323648
(PyObject **)&v, &outpos, &p)) {
36333649
Py_DECREF(x);
36343650
goto onError;

0 commit comments

Comments
 (0)