Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ae3b32a

Browse files
Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters.
2 parents 040c3c8 + 48e188e commit ae3b32a

3 files changed

Lines changed: 47 additions & 9 deletions

File tree

Lib/test/test_codecs.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ def test_badbom(self):
330330

331331
def test_partial(self):
332332
self.check_partial(
333-
"\x00\xff\u0100\uffff",
333+
"\x00\xff\u0100\uffff\U00010000",
334334
[
335335
"", # first byte of BOM read
336336
"", # second byte of BOM read
@@ -352,6 +352,10 @@ def test_partial(self):
352352
"\x00\xff\u0100",
353353
"\x00\xff\u0100",
354354
"\x00\xff\u0100\uffff",
355+
"\x00\xff\u0100\uffff",
356+
"\x00\xff\u0100\uffff",
357+
"\x00\xff\u0100\uffff",
358+
"\x00\xff\u0100\uffff\U00010000",
355359
]
356360
)
357361

@@ -386,7 +390,7 @@ class UTF32LETest(ReadTest):
386390

387391
def test_partial(self):
388392
self.check_partial(
389-
"\x00\xff\u0100\uffff",
393+
"\x00\xff\u0100\uffff\U00010000",
390394
[
391395
"",
392396
"",
@@ -404,6 +408,10 @@ def test_partial(self):
404408
"\x00\xff\u0100",
405409
"\x00\xff\u0100",
406410
"\x00\xff\u0100\uffff",
411+
"\x00\xff\u0100\uffff",
412+
"\x00\xff\u0100\uffff",
413+
"\x00\xff\u0100\uffff",
414+
"\x00\xff\u0100\uffff\U00010000",
407415
]
408416
)
409417

@@ -426,7 +434,7 @@ class UTF32BETest(ReadTest):
426434

427435
def test_partial(self):
428436
self.check_partial(
429-
"\x00\xff\u0100\uffff",
437+
"\x00\xff\u0100\uffff\U00010000",
430438
[
431439
"",
432440
"",
@@ -444,6 +452,10 @@ def test_partial(self):
444452
"\x00\xff\u0100",
445453
"\x00\xff\u0100",
446454
"\x00\xff\u0100\uffff",
455+
"\x00\xff\u0100\uffff",
456+
"\x00\xff\u0100\uffff",
457+
"\x00\xff\u0100\uffff",
458+
"\x00\xff\u0100\uffff\U00010000",
447459
]
448460
)
449461

@@ -494,7 +506,7 @@ def test_badbom(self):
494506

495507
def test_partial(self):
496508
self.check_partial(
497-
"\x00\xff\u0100\uffff",
509+
"\x00\xff\u0100\uffff\U00010000",
498510
[
499511
"", # first byte of BOM read
500512
"", # second byte of BOM read => byteorder known
@@ -506,6 +518,10 @@ def test_partial(self):
506518
"\x00\xff\u0100",
507519
"\x00\xff\u0100",
508520
"\x00\xff\u0100\uffff",
521+
"\x00\xff\u0100\uffff",
522+
"\x00\xff\u0100\uffff",
523+
"\x00\xff\u0100\uffff",
524+
"\x00\xff\u0100\uffff\U00010000",
509525
]
510526
)
511527

@@ -543,7 +559,7 @@ class UTF16LETest(ReadTest):
543559

544560
def test_partial(self):
545561
self.check_partial(
546-
"\x00\xff\u0100\uffff",
562+
"\x00\xff\u0100\uffff\U00010000",
547563
[
548564
"",
549565
"\x00",
@@ -553,6 +569,10 @@ def test_partial(self):
553569
"\x00\xff\u0100",
554570
"\x00\xff\u0100",
555571
"\x00\xff\u0100\uffff",
572+
"\x00\xff\u0100\uffff",
573+
"\x00\xff\u0100\uffff",
574+
"\x00\xff\u0100\uffff",
575+
"\x00\xff\u0100\uffff\U00010000",
556576
]
557577
)
558578

@@ -582,7 +602,7 @@ class UTF16BETest(ReadTest):
582602

583603
def test_partial(self):
584604
self.check_partial(
585-
"\x00\xff\u0100\uffff",
605+
"\x00\xff\u0100\uffff\U00010000",
586606
[
587607
"",
588608
"\x00",
@@ -592,6 +612,10 @@ def test_partial(self):
592612
"\x00\xff\u0100",
593613
"\x00\xff\u0100",
594614
"\x00\xff\u0100\uffff",
615+
"\x00\xff\u0100\uffff",
616+
"\x00\xff\u0100\uffff",
617+
"\x00\xff\u0100\uffff",
618+
"\x00\xff\u0100\uffff\U00010000",
595619
]
596620
)
597621

@@ -621,7 +645,7 @@ class UTF8Test(ReadTest):
621645

622646
def test_partial(self):
623647
self.check_partial(
624-
"\x00\xff\u07ff\u0800\uffff",
648+
"\x00\xff\u07ff\u0800\uffff\U00010000",
625649
[
626650
"\x00",
627651
"\x00",
@@ -634,6 +658,10 @@ def test_partial(self):
634658
"\x00\xff\u07ff\u0800",
635659
"\x00\xff\u07ff\u0800",
636660
"\x00\xff\u07ff\u0800\uffff",
661+
"\x00\xff\u07ff\u0800\uffff",
662+
"\x00\xff\u07ff\u0800\uffff",
663+
"\x00\xff\u07ff\u0800\uffff",
664+
"\x00\xff\u07ff\u0800\uffff\U00010000",
637665
]
638666
)
639667

@@ -816,7 +844,7 @@ class UTF8SigTest(ReadTest):
816844

817845
def test_partial(self):
818846
self.check_partial(
819-
"\ufeff\x00\xff\u07ff\u0800\uffff",
847+
"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
820848
[
821849
"",
822850
"",
@@ -835,6 +863,10 @@ def test_partial(self):
835863
"\ufeff\x00\xff\u07ff\u0800",
836864
"\ufeff\x00\xff\u07ff\u0800",
837865
"\ufeff\x00\xff\u07ff\u0800\uffff",
866+
"\ufeff\x00\xff\u07ff\u0800\uffff",
867+
"\ufeff\x00\xff\u07ff\u0800\uffff",
868+
"\ufeff\x00\xff\u07ff\u0800\uffff",
869+
"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
838870
]
839871
)
840872

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ What's New in Python 3.3.1?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #11461: Fix the incremental UTF-16 decoder. Original patch by
16+
Amaury Forgeot d'Arc.
17+
1518
- Issue #16881: Fix Py_ARRAY_LENGTH macro for GCC < 3.1.
1619

1720
- Issue #16856: Fix a segmentation fault from calling repr() on a dict with

Objects/unicodeobject.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5284,8 +5284,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
52845284
/* The remaining input chars are ignored if the callback
52855285
chooses to skip the input */
52865286
case 1:
5287+
q -= 2;
5288+
if (consumed)
5289+
goto End;
52875290
errmsg = "unexpected end of data";
5288-
startinpos = ((const char *)q) - 2 - starts;
5291+
startinpos = ((const char *)q) - starts;
52895292
endinpos = ((const char *)e) - starts;
52905293
break;
52915294
case 2:

0 commit comments

Comments
 (0)