Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b4bbee2

Browse files
committed
Issue #14579: Fix CVE-2012-2135: vulnerability in the utf-16 decoder after error handling.
Patch by Serhiy Storchaka.
1 parent ca9652e commit b4bbee2

3 files changed

Lines changed: 50 additions & 35 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -540,8 +540,19 @@ def test_partial(self):
540540
)
541541

542542
def test_errors(self):
543-
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
544-
b"\xff", "strict", True)
543+
tests = [
544+
(b'\xff', '\ufffd'),
545+
(b'A\x00Z', 'A\ufffd'),
546+
(b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
547+
(b'\x00\xd8', '\ufffd'),
548+
(b'\x00\xd8A', '\ufffd'),
549+
(b'\x00\xd8A\x00', '\ufffdA'),
550+
(b'\x00\xdcA\x00', '\ufffdA'),
551+
]
552+
for raw, expected in tests:
553+
self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
554+
raw, 'strict', True)
555+
self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
545556

546557
def test_nonbmp(self):
547558
self.assertEqual("\U00010203".encode(self.encoding),
@@ -568,8 +579,19 @@ def test_partial(self):
568579
)
569580

570581
def test_errors(self):
571-
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
572-
b"\xff", "strict", True)
582+
tests = [
583+
(b'\xff', '\ufffd'),
584+
(b'\x00A\xff', 'A\ufffd'),
585+
(b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
586+
(b'\xd8\x00', '\ufffd'),
587+
(b'\xd8\x00\xdc', '\ufffd'),
588+
(b'\xd8\x00\x00A', '\ufffdA'),
589+
(b'\xdc\x00\x00A', '\ufffdA'),
590+
]
591+
for raw, expected in tests:
592+
self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
593+
raw, 'strict', True)
594+
self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
573595

574596
def test_nonbmp(self):
575597
self.assertEqual("\U00010203".encode(self.encoding),

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,9 @@ What's New in Python 3.2.4
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #14579: Fix CVE-2012-2135: vulnerability in the utf-16 decoder after
14+
error handling. Patch by Serhiy Storchaka.
15+
1316
- Issue #15404: Refleak in PyMethodObject repr.
1417

1518
- Issue #15394: An issue in PyModule_Create that caused references to

Objects/unicodeobject.c

Lines changed: 21 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3425,7 +3425,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
34253425
/* Unpack UTF-16 encoded data */
34263426
p = unicode->str;
34273427
q = (unsigned char *)s;
3428-
e = q + size - 1;
3428+
e = q + size;
34293429

34303430
if (byteorder)
34313431
bo = *byteorder;
@@ -3476,8 +3476,20 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
34763476
#endif
34773477

34783478
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3479-
while (q < e) {
3479+
while (1) {
34803480
Py_UNICODE ch;
3481+
if (e - q < 2) {
3482+
/* remaining byte at the end? (size should be even) */
3483+
if (q == e || consumed)
3484+
break;
3485+
errmsg = "truncated data";
3486+
startinpos = ((const char *)q) - starts;
3487+
endinpos = ((const char *)e) - starts;
3488+
outpos = p - PyUnicode_AS_UNICODE(unicode);
3489+
goto utf16Error;
3490+
/* The remaining input chars are ignored if the callback
3491+
chooses to skip the input */
3492+
}
34813493
/* First check for possible aligned read of a C 'long'. Unaligned
34823494
reads are more expensive, better to defer to another iteration. */
34833495
if (!((size_t) q & LONG_PTR_MASK)) {
@@ -3546,8 +3558,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
35463558
}
35473559
p = _p;
35483560
q = _q;
3549-
if (q >= e)
3550-
break;
3561+
if (e - q < 2)
3562+
continue;
35513563
}
35523564
ch = (q[ihi] << 8) | q[ilo];
35533565

@@ -3559,10 +3571,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
35593571
}
35603572

35613573
/* UTF-16 code pair: */
3562-
if (q > e) {
3574+
if (e - q < 2) {
35633575
errmsg = "unexpected end of data";
35643576
startinpos = (((const char *)q) - 2) - starts;
3565-
endinpos = ((const char *)e) + 1 - starts;
3577+
endinpos = ((const char *)e) - starts;
35663578
goto utf16Error;
35673579
}
35683580
if (0xD800 <= ch && ch <= 0xDBFF) {
@@ -3606,31 +3618,9 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
36063618
&outpos,
36073619
&p))
36083620
goto onError;
3609-
}
3610-
/* remaining byte at the end? (size should be even) */
3611-
if (e == q) {
3612-
if (!consumed) {
3613-
errmsg = "truncated data";
3614-
startinpos = ((const char *)q) - starts;
3615-
endinpos = ((const char *)e) + 1 - starts;
3616-
outpos = p - PyUnicode_AS_UNICODE(unicode);
3617-
if (unicode_decode_call_errorhandler(
3618-
errors,
3619-
&errorHandler,
3620-
"utf16", errmsg,
3621-
&starts,
3622-
(const char **)&e,
3623-
&startinpos,
3624-
&endinpos,
3625-
&exc,
3626-
(const char **)&q,
3627-
&unicode,
3628-
&outpos,
3629-
&p))
3630-
goto onError;
3631-
/* The remaining input chars are ignored if the callback
3632-
chooses to skip the input */
3633-
}
3621+
/* Update data because unicode_decode_call_errorhandler might have
3622+
changed the input object. */
3623+
aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
36343624
}
36353625

36363626
if (byteorder)

0 commit comments

Comments
 (0)