Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 48e188e

Browse files
Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters.
1 parent dec798e commit 48e188e

3 files changed

Lines changed: 47 additions & 9 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def test_badbom(self):
313313

314314
def test_partial(self):
315315
self.check_partial(
316-
"\x00\xff\u0100\uffff",
316+
"\x00\xff\u0100\uffff\U00010000",
317317
[
318318
"", # first byte of BOM read
319319
"", # second byte of BOM read
@@ -335,6 +335,10 @@ def test_partial(self):
335335
"\x00\xff\u0100",
336336
"\x00\xff\u0100",
337337
"\x00\xff\u0100\uffff",
338+
"\x00\xff\u0100\uffff",
339+
"\x00\xff\u0100\uffff",
340+
"\x00\xff\u0100\uffff",
341+
"\x00\xff\u0100\uffff\U00010000",
338342
]
339343
)
340344

@@ -369,7 +373,7 @@ class UTF32LETest(ReadTest):
369373

370374
def test_partial(self):
371375
self.check_partial(
372-
"\x00\xff\u0100\uffff",
376+
"\x00\xff\u0100\uffff\U00010000",
373377
[
374378
"",
375379
"",
@@ -387,6 +391,10 @@ def test_partial(self):
387391
"\x00\xff\u0100",
388392
"\x00\xff\u0100",
389393
"\x00\xff\u0100\uffff",
394+
"\x00\xff\u0100\uffff",
395+
"\x00\xff\u0100\uffff",
396+
"\x00\xff\u0100\uffff",
397+
"\x00\xff\u0100\uffff\U00010000",
390398
]
391399
)
392400

@@ -409,7 +417,7 @@ class UTF32BETest(ReadTest):
409417

410418
def test_partial(self):
411419
self.check_partial(
412-
"\x00\xff\u0100\uffff",
420+
"\x00\xff\u0100\uffff\U00010000",
413421
[
414422
"",
415423
"",
@@ -427,6 +435,10 @@ def test_partial(self):
427435
"\x00\xff\u0100",
428436
"\x00\xff\u0100",
429437
"\x00\xff\u0100\uffff",
438+
"\x00\xff\u0100\uffff",
439+
"\x00\xff\u0100\uffff",
440+
"\x00\xff\u0100\uffff",
441+
"\x00\xff\u0100\uffff\U00010000",
430442
]
431443
)
432444

@@ -477,7 +489,7 @@ def test_badbom(self):
477489

478490
def test_partial(self):
479491
self.check_partial(
480-
"\x00\xff\u0100\uffff",
492+
"\x00\xff\u0100\uffff\U00010000",
481493
[
482494
"", # first byte of BOM read
483495
"", # second byte of BOM read => byteorder known
@@ -489,6 +501,10 @@ def test_partial(self):
489501
"\x00\xff\u0100",
490502
"\x00\xff\u0100",
491503
"\x00\xff\u0100\uffff",
504+
"\x00\xff\u0100\uffff",
505+
"\x00\xff\u0100\uffff",
506+
"\x00\xff\u0100\uffff",
507+
"\x00\xff\u0100\uffff\U00010000",
492508
]
493509
)
494510

@@ -526,7 +542,7 @@ class UTF16LETest(ReadTest):
526542

527543
def test_partial(self):
528544
self.check_partial(
529-
"\x00\xff\u0100\uffff",
545+
"\x00\xff\u0100\uffff\U00010000",
530546
[
531547
"",
532548
"\x00",
@@ -536,6 +552,10 @@ def test_partial(self):
536552
"\x00\xff\u0100",
537553
"\x00\xff\u0100",
538554
"\x00\xff\u0100\uffff",
555+
"\x00\xff\u0100\uffff",
556+
"\x00\xff\u0100\uffff",
557+
"\x00\xff\u0100\uffff",
558+
"\x00\xff\u0100\uffff\U00010000",
539559
]
540560
)
541561

@@ -565,7 +585,7 @@ class UTF16BETest(ReadTest):
565585

566586
def test_partial(self):
567587
self.check_partial(
568-
"\x00\xff\u0100\uffff",
588+
"\x00\xff\u0100\uffff\U00010000",
569589
[
570590
"",
571591
"\x00",
@@ -575,6 +595,10 @@ def test_partial(self):
575595
"\x00\xff\u0100",
576596
"\x00\xff\u0100",
577597
"\x00\xff\u0100\uffff",
598+
"\x00\xff\u0100\uffff",
599+
"\x00\xff\u0100\uffff",
600+
"\x00\xff\u0100\uffff",
601+
"\x00\xff\u0100\uffff\U00010000",
578602
]
579603
)
580604

@@ -604,7 +628,7 @@ class UTF8Test(ReadTest):
604628

605629
def test_partial(self):
606630
self.check_partial(
607-
"\x00\xff\u07ff\u0800\uffff",
631+
"\x00\xff\u07ff\u0800\uffff\U00010000",
608632
[
609633
"\x00",
610634
"\x00",
@@ -617,6 +641,10 @@ def test_partial(self):
617641
"\x00\xff\u07ff\u0800",
618642
"\x00\xff\u07ff\u0800",
619643
"\x00\xff\u07ff\u0800\uffff",
644+
"\x00\xff\u07ff\u0800\uffff",
645+
"\x00\xff\u07ff\u0800\uffff",
646+
"\x00\xff\u07ff\u0800\uffff",
647+
"\x00\xff\u07ff\u0800\uffff\U00010000",
620648
]
621649
)
622650

@@ -694,7 +722,7 @@ class UTF8SigTest(ReadTest):
694722

695723
def test_partial(self):
696724
self.check_partial(
697-
"\ufeff\x00\xff\u07ff\u0800\uffff",
725+
"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
698726
[
699727
"",
700728
"",
@@ -713,6 +741,10 @@ def test_partial(self):
713741
"\ufeff\x00\xff\u07ff\u0800",
714742
"\ufeff\x00\xff\u07ff\u0800",
715743
"\ufeff\x00\xff\u07ff\u0800\uffff",
744+
"\ufeff\x00\xff\u07ff\u0800\uffff",
745+
"\ufeff\x00\xff\u07ff\u0800\uffff",
746+
"\ufeff\x00\xff\u07ff\u0800\uffff",
747+
"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
716748
]
717749
)
718750

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ What's New in Python 3.2.4
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
14+
Amaury Forgeot d'Arc.
15+
1316
- Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB.
1417

1518
- Issue #16455: On FreeBSD and Solaris, if the locale is C, the

Objects/unicodeobject.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3573,8 +3573,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
35733573

35743574
/* UTF-16 code pair: */
35753575
if (e - q < 2) {
3576+
q -= 2;
3577+
if (consumed)
3578+
break;
35763579
errmsg = "unexpected end of data";
3577-
startinpos = (((const char *)q) - 2) - starts;
3580+
startinpos = ((const char *)q) - starts;
35783581
endinpos = ((const char *)e) - starts;
35793582
goto utf16Error;
35803583
}

0 commit comments

Comments
 (0)