Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e57d7b1

Browse files
committed
The changes to the stateful codecs in 2.4 resulted in StreamReader.readline()
trying to return a complete line even if a size parameter was given (see http://www.python.org/sf/1076985). This leads to buffer overflows with long source lines under Windows if e.g. cp1252 is used as the source encoding. This patch reverts the behaviour of readline() to something that behaves more like Python 2.3: If a size parameter is given, read() is called only once. As a side effect of this, readline() now supports all types of linebreaks supported by unicode.splitlines(). Note that the tokenizer is still broken and it's possible to provoke segfaults (see http://www.python.org/sf/1089395).
1 parent dcba662 commit e57d7b1

2 files changed

Lines changed: 121 additions & 43 deletions

File tree

Lib/codecs.py

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ def __init__(self, stream, errors='strict'):
230230
self.errors = errors
231231
self.bytebuffer = ""
232232
self.charbuffer = u""
233+
self.atcr = False
233234

234235
def decode(self, input, errors='strict'):
235236
raise NotImplementedError
@@ -256,41 +257,39 @@ def read(self, size=-1, chars=-1):
256257
definition of the encoding and the given size, e.g. if
257258
optional encoding endings or state markers are available
258259
on the stream, these should be read too.
259-
260260
"""
261261
# read until we get the required number of characters (if available)
262-
done = False
263262
while True:
264263
# can the request be satisfied from the character buffer?
265264
if chars < 0:
266265
if self.charbuffer:
267-
done = True
266+
break
268267
else:
269268
if len(self.charbuffer) >= chars:
270-
done = True
271-
if done:
272-
if chars < 0:
273-
result = self.charbuffer
274-
self.charbuffer = u""
275-
break
276-
else:
277-
result = self.charbuffer[:chars]
278-
self.charbuffer = self.charbuffer[chars:]
279269
break
280270
# we need more data
281271
if size < 0:
282272
newdata = self.stream.read()
283273
else:
284274
newdata = self.stream.read(size)
275+
# decode bytes (those remaining from the last call included)
285276
data = self.bytebuffer + newdata
286-
object, decodedbytes = self.decode(data, self.errors)
277+
newchars, decodedbytes = self.decode(data, self.errors)
287278
# keep undecoded bytes until the next call
288279
self.bytebuffer = data[decodedbytes:]
289280
# put new characters in the character buffer
290-
self.charbuffer += object
281+
self.charbuffer += newchars
291282
# there was no data available
292283
if not newdata:
293-
done = True
284+
break
285+
if chars < 0:
286+
# Return everything we've got
287+
result = self.charbuffer
288+
self.charbuffer = u""
289+
else:
290+
# Return the first chars characters
291+
result = self.charbuffer[:chars]
292+
self.charbuffer = self.charbuffer[chars:]
294293
return result
295294

296295
def readline(self, size=None, keepends=True):
@@ -302,24 +301,36 @@ def readline(self, size=None, keepends=True):
302301
read() method.
303302
304303
"""
305-
if size is None:
306-
size = 10
304+
readsize = size or 72
307305
line = u""
306+
# If size is given, we call read() only once
308307
while True:
309-
data = self.read(size)
308+
data = self.read(readsize)
309+
if self.atcr and data.startswith(u"\n"):
310+
data = data[1:]
311+
if data:
312+
self.atcr = data.endswith(u"\r")
310313
line += data
311-
pos = line.find("\n")
312-
if pos>=0:
313-
self.charbuffer = line[pos+1:] + self.charbuffer
314-
if keepends:
315-
line = line[:pos+1]
316-
else:
317-
line = line[:pos]
318-
return line
319-
elif not data:
320-
return line
321-
if size<8000:
322-
size *= 2
314+
lines = line.splitlines(True)
315+
if lines:
316+
line0withend = lines[0]
317+
line0withoutend = lines[0].splitlines(False)[0]
318+
if line0withend != line0withoutend: # We really have a line end
319+
# Put the rest back together and keep it until the next call
320+
self.charbuffer = u"".join(lines[1:]) + self.charbuffer
321+
if keepends:
322+
line = line0withend
323+
else:
324+
line = line0withoutend
325+
break
326+
# we didn't get anything or this was our only try
327+
elif not data or size is not None:
328+
if line and not keepends:
329+
line = line.splitlines(False)[0]
330+
break
331+
if readsize<8000:
332+
readsize *= 2
333+
return line
323334

324335
def readlines(self, sizehint=None, keepends=True):
325336

Lib/test/test_codecs.py

Lines changed: 80 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,16 @@ def read(self, size=-1):
2323
self._buffer = self._buffer[size:]
2424
return s
2525

26-
class PartialReadTest(unittest.TestCase):
27-
def check_partial(self, encoding, input, partialresults):
26+
class ReadTest(unittest.TestCase):
27+
def check_partial(self, input, partialresults):
2828
# get a StreamReader for the encoding and feed the bytestring version
2929
# of input to the reader byte by byte. Read everything available from
3030
# the StreamReader and check that the results equal the appropriate
3131
# entries from partialresults.
3232
q = Queue()
33-
r = codecs.getreader(encoding)(q)
33+
r = codecs.getreader(self.encoding)(q)
3434
result = u""
35-
for (c, partialresult) in zip(input.encode(encoding), partialresults):
35+
for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
3636
q.write(c)
3737
result += r.read()
3838
self.assertEqual(result, partialresult)
@@ -41,13 +41,81 @@ def check_partial(self, encoding, input, partialresults):
4141
self.assertEqual(r.bytebuffer, "")
4242
self.assertEqual(r.charbuffer, u"")
4343

44-
class UTF16Test(PartialReadTest):
44+
def test_readline(self):
45+
def getreader(input):
46+
stream = StringIO.StringIO(input.encode(self.encoding))
47+
return codecs.getreader(self.encoding)(stream)
48+
49+
def readalllines(input, keepends=True):
50+
reader = getreader(input)
51+
lines = []
52+
while True:
53+
line = reader.readline(keepends=keepends)
54+
if not line:
55+
break
56+
lines.append(line)
57+
return "".join(lines)
58+
59+
s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
60+
self.assertEqual(readalllines(s, True), s)
61+
self.assertEqual(readalllines(s, False), u"foobarbazspameggs")
62+
63+
# Test long lines (multiple calls to read() in readline())
64+
vw = []
65+
vwo = []
66+
for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
67+
vw.append((i*200)*u"\3042" + lineend)
68+
vwo.append((i*200)*u"\3042")
69+
self.assertEqual(readalllines("".join(vw), True), "".join(vw))
70+
self.assertEqual(readalllines("".join(vw), False),"".join(vwo))
71+
72+
# Test lines where the first read might end with \r, so the
73+
# reader has to look ahead whether this is a lone \r or a \r\n
74+
for size in xrange(80):
75+
for lineend in u"\n \r\n \r \u2028".split():
76+
s = size*u"a" + lineend + u"xxx\n"
77+
self.assertEqual(
78+
getreader(s).readline(keepends=True),
79+
size*u"a" + lineend,
80+
)
81+
self.assertEqual(
82+
getreader(s).readline(keepends=False),
83+
size*u"a",
84+
)
85+
86+
def test_readlinequeue(self):
87+
q = Queue()
88+
writer = codecs.getwriter(self.encoding)(q)
89+
reader = codecs.getreader(self.encoding)(q)
90+
91+
# No lineends
92+
writer.write(u"foo\r")
93+
self.assertEqual(reader.readline(keepends=False), u"foo")
94+
writer.write(u"\nbar\r")
95+
self.assertEqual(reader.readline(keepends=False), u"bar")
96+
writer.write(u"baz")
97+
self.assertEqual(reader.readline(keepends=False), u"baz")
98+
self.assertEqual(reader.readline(keepends=False), u"")
99+
100+
# Lineends
101+
writer.write(u"foo\r")
102+
self.assertEqual(reader.readline(keepends=True), u"foo\r")
103+
writer.write(u"\nbar\r")
104+
self.assertEqual(reader.readline(keepends=True), u"bar\r")
105+
writer.write(u"baz")
106+
self.assertEqual(reader.readline(keepends=True), u"baz")
107+
self.assertEqual(reader.readline(keepends=True), u"")
108+
writer.write(u"foo\r\n")
109+
self.assertEqual(reader.readline(keepends=True), u"foo\r\n")
110+
111+
class UTF16Test(ReadTest):
112+
encoding = "utf-16"
45113

46114
spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
47115
spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
48116

49117
def test_only_one_bom(self):
50-
_,_,reader,writer = codecs.lookup("utf-16")
118+
_,_,reader,writer = codecs.lookup(self.encoding)
51119
# encode some stream
52120
s = StringIO.StringIO()
53121
f = writer(s)
@@ -63,7 +131,6 @@ def test_only_one_bom(self):
63131

64132
def test_partial(self):
65133
self.check_partial(
66-
"utf-16",
67134
u"\x00\xff\u0100\uffff",
68135
[
69136
u"", # first byte of BOM read
@@ -79,11 +146,11 @@ def test_partial(self):
79146
]
80147
)
81148

82-
class UTF16LETest(PartialReadTest):
149+
class UTF16LETest(ReadTest):
150+
encoding = "utf-16-le"
83151

84152
def test_partial(self):
85153
self.check_partial(
86-
"utf-16-le",
87154
u"\x00\xff\u0100\uffff",
88155
[
89156
u"",
@@ -97,11 +164,11 @@ def test_partial(self):
97164
]
98165
)
99166

100-
class UTF16BETest(PartialReadTest):
167+
class UTF16BETest(ReadTest):
168+
encoding = "utf-16-be"
101169

102170
def test_partial(self):
103171
self.check_partial(
104-
"utf-16-be",
105172
u"\x00\xff\u0100\uffff",
106173
[
107174
u"",
@@ -115,11 +182,11 @@ def test_partial(self):
115182
]
116183
)
117184

118-
class UTF8Test(PartialReadTest):
185+
class UTF8Test(ReadTest):
186+
encoding = "utf-8"
119187

120188
def test_partial(self):
121189
self.check_partial(
122-
"utf-8",
123190
u"\x00\xff\u07ff\u0800\uffff",
124191
[
125192
u"\x00",

0 commit comments

Comments
 (0)