Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1e133ca

Browse files
Issue #21448: Fixed FeedParser feed() to avoid O(N**2) behavior when parsing long lines.
Original patch by Raymond Hettinger.
2 parents 2a140fb + 320a1c0 commit 1e133ca

3 files changed

Lines changed: 80 additions & 12 deletions

File tree

Lib/email/feedparser.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ class BufferedSubFile(object):
5050
simple abstraction -- it parses until EOF closes the current message.
5151
"""
5252
def __init__(self):
53-
# The last partial line pushed into this object.
54-
self._partial = ''
53+
# Chunks of the last partial line pushed into this object.
54+
self._partial = []
5555
# The list of full, pushed lines, in reverse order
5656
self._lines = []
5757
# The stack of false-EOF checking predicates.
@@ -67,8 +67,8 @@ def pop_eof_matcher(self):
6767

6868
def close(self):
6969
# Don't forget any trailing partial line.
70-
self._lines.append(self._partial)
71-
self._partial = ''
70+
self.pushlines(''.join(self._partial).splitlines(True))
71+
self._partial = []
7272
self._closed = True
7373

7474
def readline(self):
@@ -96,16 +96,26 @@ def unreadline(self, line):
9696

9797
def push(self, data):
9898
"""Push some new data into this object."""
99-
# Handle any previous leftovers
100-
data, self._partial = self._partial + data, ''
10199
# Crack into lines, but preserve the linesep characters on the end of each
102100
parts = data.splitlines(True)
101+
102+
if not parts or not parts[0].endswith(('\n', '\r')):
103+
# No new complete lines, so just accumulate partials
104+
self._partial += parts
105+
return
106+
107+
if self._partial:
108+
# If there are previous leftovers, complete them now
109+
self._partial.append(parts[0])
110+
parts[0:1] = ''.join(self._partial).splitlines(True)
111+
del self._partial[:]
112+
103113
# If the last element of the list does not end in a newline, then treat
104114
# it as a partial line. We only check for '\n' here because a line
105115
# ending with '\r' might be a line that was split in the middle of a
106116
# '\r\n' sequence (see bugs 1555570 and 1721862).
107-
if parts and not parts[-1].endswith('\n'):
108-
self._partial = parts.pop()
117+
if not parts[-1].endswith('\n'):
118+
self._partial = [parts.pop()]
109119
self.pushlines(parts)
110120

111121
def pushlines(self, lines):

Lib/test/test_email/test_email.py

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from io import StringIO, BytesIO
1212
from itertools import chain
13+
from random import choice
1314

1415
import email
1516
import email.policy
@@ -3353,16 +3354,70 @@ def test_pushCR_LF(self):
33533354
bsf.push(il)
33543355
nt += n
33553356
n1 = 0
3356-
while True:
3357-
ol = bsf.readline()
3358-
if ol == NeedMoreData:
3359-
break
3357+
for ol in iter(bsf.readline, NeedMoreData):
33603358
om.append(ol)
33613359
n1 += 1
33623360
self.assertEqual(n, n1)
33633361
self.assertEqual(len(om), nt)
33643362
self.assertEqual(''.join([il for il, n in imt]), ''.join(om))
33653363

3364+
def test_push_random(self):
3365+
from email.feedparser import BufferedSubFile, NeedMoreData
3366+
3367+
n = 10000
3368+
chunksize = 5
3369+
chars = 'abcd \t\r\n'
3370+
3371+
s = ''.join(choice(chars) for i in range(n)) + '\n'
3372+
target = s.splitlines(True)
3373+
3374+
bsf = BufferedSubFile()
3375+
lines = []
3376+
for i in range(0, len(s), chunksize):
3377+
chunk = s[i:i+chunksize]
3378+
bsf.push(chunk)
3379+
lines.extend(iter(bsf.readline, NeedMoreData))
3380+
self.assertEqual(lines, target)
3381+
3382+
3383+
class TestFeedParsers(TestEmailBase):
3384+
3385+
def parse(self, chunks):
3386+
from email.feedparser import FeedParser
3387+
feedparser = FeedParser()
3388+
for chunk in chunks:
3389+
feedparser.feed(chunk)
3390+
return feedparser.close()
3391+
3392+
def test_newlines(self):
3393+
m = self.parse(['a:\nb:\rc:\r\nd:\n'])
3394+
self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
3395+
m = self.parse(['a:\nb:\rc:\r\nd:'])
3396+
self.assertEqual(m.keys(), ['a', 'b', 'c', 'd'])
3397+
m = self.parse(['a:\rb', 'c:\n'])
3398+
self.assertEqual(m.keys(), ['a', 'bc'])
3399+
m = self.parse(['a:\r', 'b:\n'])
3400+
self.assertEqual(m.keys(), ['a', 'b'])
3401+
m = self.parse(['a:\r', '\nb:\n'])
3402+
self.assertEqual(m.keys(), ['a', 'b'])
3403+
m = self.parse(['a:\x85b:\u2028c:\n'])
3404+
self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')])
3405+
m = self.parse(['a:\r', 'b:\x85', 'c:\n'])
3406+
self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')])
3407+
3408+
def test_long_lines(self):
3409+
M, N = 1000, 100000
3410+
m = self.parse(['a:b\n\n'] + ['x'*M] * N)
3411+
self.assertEqual(m.items(), [('a', 'b')])
3412+
self.assertEqual(m.get_payload(), 'x'*M*N)
3413+
m = self.parse(['a:b\r\r'] + ['x'*M] * N)
3414+
self.assertEqual(m.items(), [('a', 'b')])
3415+
self.assertEqual(m.get_payload(), 'x'*M*N)
3416+
m = self.parse(['a:b\r\r'] + ['x'*M+'\x85'] * N)
3417+
self.assertEqual(m.items(), [('a', 'b')])
3418+
self.assertEqual(m.get_payload(), ('x'*M+'\x85')*N)
3419+
m = self.parse(['a:\r', 'b: '] + ['x'*M] * N)
3420+
self.assertEqual(m.items(), [('a', ''), ('b', 'x'*M*N)])
33663421

33673422

33683423
class TestParsers(TestEmailBase):

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ Core and Builtins
115115
Library
116116
-------
117117

118+
- Issue #21448: Changed FeedParser feed() to avoid O(N**2) behavior when
119+
parsing long lines.  Original patch by Raymond Hettinger.
120+
118121
- Issue #17923: glob() patterns ending with a slash no longer match non-dirs on
119122
AIX. Based on patch by Delhallt.
120123

0 commit comments

Comments
 (0)