Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 180a336

Browse files
committed
Issue #4574: reading a UTF-16-encoded text file crashes if \r falls on a 64-char boundary.
1 parent ff94552 commit 180a336

3 files changed

Lines changed: 87 additions & 55 deletions

File tree

Lib/io.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1282,25 +1282,23 @@ class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
12821282
"""
12831283
def __init__(self, decoder, translate, errors='strict'):
12841284
codecs.IncrementalDecoder.__init__(self, errors=errors)
1285-
self.buffer = b''
12861285
self.translate = translate
12871286
self.decoder = decoder
12881287
self.seennl = 0
1288+
self.pendingcr = False
12891289

12901290
def decode(self, input, final=False):
12911291
# decode input (with the eventual \r from a previous pass)
1292-
if self.buffer:
1293-
input = self.buffer + input
1294-
12951292
output = self.decoder.decode(input, final=final)
1293+
if self.pendingcr and (output or final):
1294+
output = "\r" + output
1295+
self.pendingcr = False
12961296

12971297
# retain last \r even when not translating data:
12981298
# then readline() is sure to get \r\n in one pass
12991299
if output.endswith("\r") and not final:
13001300
output = output[:-1]
1301-
self.buffer = b'\r'
1302-
else:
1303-
self.buffer = b''
1301+
self.pendingcr = True
13041302

13051303
# Record which newlines are read
13061304
crlf = output.count('\r\n')
@@ -1319,20 +1317,19 @@ def decode(self, input, final=False):
13191317

13201318
def getstate(self):
13211319
buf, flag = self.decoder.getstate()
1322-
return buf + self.buffer, flag
1320+
flag <<= 1
1321+
if self.pendingcr:
1322+
flag |= 1
1323+
return buf, flag
13231324

13241325
def setstate(self, state):
13251326
buf, flag = state
1326-
if buf.endswith(b'\r'):
1327-
self.buffer = b'\r'
1328-
buf = buf[:-1]
1329-
else:
1330-
self.buffer = b''
1331-
self.decoder.setstate((buf, flag))
1327+
self.pendingcr = bool(flag & 1)
1328+
self.decoder.setstate((buf, flag >> 1))
13321329

13331330
def reset(self):
13341331
self.seennl = 0
1335-
self.buffer = b''
1332+
self.pendingcr = False
13361333
self.decoder.reset()
13371334

13381335
_LF = 1

Lib/test/test_io.py

Lines changed: 72 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -679,8 +679,9 @@ def process_word(self):
679679
@classmethod
680680
def lookupTestDecoder(cls, name):
681681
if cls.codecEnabled and name == 'test_decoder':
682+
latin1 = codecs.lookup('latin-1')
682683
return codecs.CodecInfo(
683-
name='test_decoder', encode=None, decode=None,
684+
name='test_decoder', encode=latin1.encode, decode=None,
684685
incrementalencoder=None,
685686
streamreader=None, streamwriter=None,
686687
incrementaldecoder=cls)
@@ -840,8 +841,11 @@ def testNewlines(self):
840841
[ '\r\n', [ "unix\nwindows\r\n", "os9\rlast\nnonl" ] ],
841842
[ '\r', [ "unix\nwindows\r", "\nos9\r", "last\nnonl" ] ],
842843
]
843-
844-
encodings = ('utf-8', 'latin-1')
844+
encodings = (
845+
'utf-8', 'latin-1',
846+
'utf-16', 'utf-16-le', 'utf-16-be',
847+
'utf-32', 'utf-32-le', 'utf-32-be',
848+
)
845849

846850
# Try a range of buffer sizes to test the case where \r is the last
847851
# character in TextIOWrapper._pending_line.
@@ -1195,56 +1199,84 @@ def test_issue2282(self):
11951199

11961200
self.assertEqual(buffer.seekable(), txt.seekable())
11971201

1198-
def test_newline_decoder(self):
1199-
import codecs
1200-
decoder = codecs.getincrementaldecoder("utf-8")()
1201-
decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
1202+
def check_newline_decoder_utf8(self, decoder):
1203+
# UTF-8 specific tests for a newline decoder
1204+
def _check_decode(b, s, **kwargs):
1205+
# We exercise getstate() / setstate() as well as decode()
1206+
state = decoder.getstate()
1207+
self.assertEquals(decoder.decode(b, **kwargs), s)
1208+
decoder.setstate(state)
1209+
self.assertEquals(decoder.decode(b, **kwargs), s)
12021210

1203-
self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
1211+
_check_decode(b'\xe8\xa2\x88', "\u8888")
12041212

1205-
self.assertEquals(decoder.decode(b'\xe8'), "")
1206-
self.assertEquals(decoder.decode(b'\xa2'), "")
1207-
self.assertEquals(decoder.decode(b'\x88'), "\u8888")
1213+
_check_decode(b'\xe8', "")
1214+
_check_decode(b'\xa2', "")
1215+
_check_decode(b'\x88', "\u8888")
12081216

1209-
self.assertEquals(decoder.decode(b'\xe8'), "")
1210-
self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
1217+
_check_decode(b'\xe8', "")
1218+
_check_decode(b'\xa2', "")
1219+
_check_decode(b'\x88', "\u8888")
12111220

1212-
decoder.setstate((b'', 0))
1213-
self.assertEquals(decoder.decode(b'\n'), "\n")
1214-
self.assertEquals(decoder.decode(b'\r'), "")
1215-
self.assertEquals(decoder.decode(b'', final=True), "\n")
1216-
self.assertEquals(decoder.decode(b'\r', final=True), "\n")
1217-
1218-
self.assertEquals(decoder.decode(b'\r'), "")
1219-
self.assertEquals(decoder.decode(b'a'), "\na")
1220-
1221-
self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
1222-
self.assertEquals(decoder.decode(b'\r'), "")
1223-
self.assertEquals(decoder.decode(b'\r'), "\n")
1224-
self.assertEquals(decoder.decode(b'\na'), "\na")
1225-
1226-
self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
1227-
self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
1228-
self.assertEquals(decoder.decode(b'\n'), "\n")
1229-
self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
1230-
self.assertEquals(decoder.decode(b'\n'), "\n")
1221+
_check_decode(b'\xe8', "")
1222+
self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
12311223

1232-
decoder = codecs.getincrementaldecoder("utf-8")()
1233-
decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
1224+
decoder.reset()
1225+
_check_decode(b'\n', "\n")
1226+
_check_decode(b'\r', "")
1227+
_check_decode(b'', "\n", final=True)
1228+
_check_decode(b'\r', "\n", final=True)
1229+
1230+
_check_decode(b'\r', "")
1231+
_check_decode(b'a', "\na")
1232+
1233+
_check_decode(b'\r\r\n', "\n\n")
1234+
_check_decode(b'\r', "")
1235+
_check_decode(b'\r', "\n")
1236+
_check_decode(b'\na', "\na")
1237+
1238+
_check_decode(b'\xe8\xa2\x88\r\n', "\u8888\n")
1239+
_check_decode(b'\xe8\xa2\x88', "\u8888")
1240+
_check_decode(b'\n', "\n")
1241+
_check_decode(b'\xe8\xa2\x88\r', "\u8888")
1242+
_check_decode(b'\n', "\n")
1243+
1244+
def check_newline_decoder(self, decoder, encoding):
1245+
result = []
1246+
encoder = codecs.getincrementalencoder(encoding)()
1247+
def _decode_bytewise(s):
1248+
for b in encoder.encode(s):
1249+
result.append(decoder.decode(bytes([b])))
12341250
self.assertEquals(decoder.newlines, None)
1235-
decoder.decode(b"abc\n\r")
1251+
_decode_bytewise("abc\n\r")
12361252
self.assertEquals(decoder.newlines, '\n')
1237-
decoder.decode(b"\nabc")
1253+
_decode_bytewise("\nabc")
12381254
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
1239-
decoder.decode(b"abc\r")
1255+
_decode_bytewise("abc\r")
12401256
self.assertEquals(decoder.newlines, ('\n', '\r\n'))
1241-
decoder.decode(b"abc")
1257+
_decode_bytewise("abc")
12421258
self.assertEquals(decoder.newlines, ('\r', '\n', '\r\n'))
1243-
decoder.decode(b"abc\r")
1259+
_decode_bytewise("abc\r")
1260+
self.assertEquals("".join(result), "abc\n\nabcabc\nabcabc")
12441261
decoder.reset()
1245-
self.assertEquals(decoder.decode(b"abc"), "abc")
1262+
self.assertEquals(decoder.decode("abc".encode(encoding)), "abc")
12461263
self.assertEquals(decoder.newlines, None)
12471264

1265+
def test_newline_decoder(self):
1266+
encodings = (
1267+
'utf-8', 'latin-1',
1268+
'utf-16', 'utf-16-le', 'utf-16-be',
1269+
'utf-32', 'utf-32-le', 'utf-32-be',
1270+
)
1271+
for enc in encodings:
1272+
decoder = codecs.getincrementaldecoder(enc)()
1273+
decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
1274+
self.check_newline_decoder(decoder, enc)
1275+
decoder = codecs.getincrementaldecoder("utf-8")()
1276+
decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
1277+
self.check_newline_decoder_utf8(decoder)
1278+
1279+
12481280
# XXX Tests for open()
12491281

12501282
class MiscIOTest(unittest.TestCase):

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ Core and Builtins
4545
Library
4646
-------
4747

48+
- Issue #4574: reading a UTF-16-encoded text file crashes if \r falls on a 64-char
49+
boundary.
50+
4851
- Issue #4223: inspect.getsource() will now correctly display source code
4952
for packages loaded via zipimport (or any other conformant PEP 302
5053
loader). Original patch by Alexander Belopolsky.

0 commit comments

Comments
 (0)