Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit dc1650c

Browse files
committed
#22233: Only split headers on \r and/or \n, per email RFCs.
Original patch by Martin Panter, new policy fixes by me.
1 parent 6b46ec7 commit dc1650c

6 files changed

Lines changed: 104 additions & 19 deletions

File tree

Lib/email/feedparser.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from email import message
2828
from email._policybase import compat32
2929
from collections import deque
30+
from io import StringIO
3031

3132
NLCRE = re.compile('\r\n|\r|\n')
3233
NLCRE_bol = re.compile('(\r\n|\r|\n)')
@@ -51,8 +52,9 @@ class BufferedSubFile(object):
5152
simple abstraction -- it parses until EOF closes the current message.
5253
"""
5354
def __init__(self):
54-
# Chunks of the last partial line pushed into this object.
55-
self._partial = []
55+
# Text stream of the last partial line pushed into this object.
56+
# See issue 22233 for why this is a text stream and not a list.
57+
self._partial = StringIO(newline='')
5658
# A deque of full, pushed lines
5759
self._lines = deque()
5860
# The stack of false-EOF checking predicates.
@@ -68,8 +70,10 @@ def pop_eof_matcher(self):
6870

6971
def close(self):
7072
# Don't forget any trailing partial line.
71-
self.pushlines(''.join(self._partial).splitlines(True))
72-
self._partial = []
73+
self._partial.seek(0)
74+
self.pushlines(self._partial.readlines())
75+
self._partial.seek(0)
76+
self._partial.truncate()
7377
self._closed = True
7478

7579
def readline(self):
@@ -97,26 +101,23 @@ def unreadline(self, line):
97101

98102
def push(self, data):
99103
"""Push some new data into this object."""
100-
# Crack into lines, but preserve the linesep characters on the end of each
101-
parts = data.splitlines(True)
102-
103-
if not parts or not parts[0].endswith(('\n', '\r')):
104-
# No new complete lines, so just accumulate partials
105-
self._partial += parts
104+
self._partial.write(data)
105+
if '\n' not in data and '\r' not in data:
106+
# No new complete lines, wait for more.
106107
return
107108

108-
if self._partial:
109-
# If there are previous leftovers, complete them now
110-
self._partial.append(parts[0])
111-
parts[0:1] = ''.join(self._partial).splitlines(True)
112-
del self._partial[:]
109+
# Crack into lines, preserving the linesep characters.
110+
self._partial.seek(0)
111+
parts = self._partial.readlines()
112+
self._partial.seek(0)
113+
self._partial.truncate()
113114

114115
# If the last element of the list does not end in a newline, then treat
115116
# it as a partial line. We only check for '\n' here because a line
116117
# ending with '\r' might be a line that was split in the middle of a
117118
# '\r\n' sequence (see bugs 1555570 and 1721862).
118119
if not parts[-1].endswith('\n'):
119-
self._partial = [parts.pop()]
120+
self._partial.write(parts.pop())
120121
self.pushlines(parts)
121122

122123
def pushlines(self, lines):

Lib/email/policy.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
code that adds all the email6 features.
33
"""
44

5+
import re
56
from email._policybase import Policy, Compat32, compat32, _extend_docstrings
67
from email.utils import _has_surrogates
78
from email.headerregistry import HeaderRegistry as HeaderRegistry
@@ -18,6 +19,8 @@
1819
'HTTP',
1920
]
2021

22+
linesep_splitter = re.compile(r'\n|\r')
23+
2124
@_extend_docstrings
2225
class EmailPolicy(Policy):
2326

@@ -135,6 +138,8 @@ def header_store_parse(self, name, value):
135138
if hasattr(value, 'name') and value.name.lower() == name.lower():
136139
return (name, value)
137140
if isinstance(value, str) and len(value.splitlines())>1:
141+
# XXX this error message isn't quite right when we use splitlines
142+
# (see issue 22233), but I'm not sure what should happen here.
138143
raise ValueError("Header values may not contain linefeed "
139144
"or carriage return characters")
140145
return (name, self.header_factory(name, value))
@@ -150,7 +155,9 @@ def header_fetch_parse(self, name, value):
150155
"""
151156
if hasattr(value, 'name'):
152157
return value
153-
return self.header_factory(name, ''.join(value.splitlines()))
158+
# We can't use splitlines here because it splits on more than \r and \n.
159+
value = ''.join(linesep_splitter.split(value))
160+
return self.header_factory(name, value)
154161

155162
def fold(self, name, value):
156163
"""

Lib/test/test_email/test_email.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3444,10 +3444,12 @@ def test_newlines(self):
34443444
self.assertEqual(m.keys(), ['a', 'b'])
34453445
m = self.parse(['a:\r', '\nb:\n'])
34463446
self.assertEqual(m.keys(), ['a', 'b'])
3447+
3448+
# Only CR and LF should break header fields
34473449
m = self.parse(['a:\x85b:\u2028c:\n'])
3448-
self.assertEqual(m.items(), [('a', '\x85'), ('b', '\u2028'), ('c', '')])
3450+
self.assertEqual(m.items(), [('a', '\x85b:\u2028c:')])
34493451
m = self.parse(['a:\r', 'b:\x85', 'c:\n'])
3450-
self.assertEqual(m.items(), [('a', ''), ('b', '\x85'), ('c', '')])
3452+
self.assertEqual(m.items(), [('a', ''), ('b', '\x85c:')])
34513453

34523454
def test_long_lines(self):
34533455
# Expected peak memory use on 32-bit platform: 6*N*M bytes.

Lib/test/test_email/test_parser.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import email
33
import unittest
44
from email.message import Message
5+
from email.policy import default
56
from test.test_email import TestEmailBase
67

78

@@ -32,5 +33,45 @@ def test_custom_message_gets_policy_if_possible_from_file(self):
3233
# XXX add tests for other functions that take Message arg.
3334

3435

36+
class TestParserBase:
37+
38+
def test_only_split_on_cr_lf(self):
39+
# The unicode line splitter splits on unicode linebreaks, which are
40+
# more numerous than allowed by the email RFCs; make sure we are only
41+
# splitting on those two.
42+
msg = self.parser(
43+
"Next-Line: not\x85broken\r\n"
44+
"Null: not\x00broken\r\n"
45+
"Vertical-Tab: not\vbroken\r\n"
46+
"Form-Feed: not\fbroken\r\n"
47+
"File-Separator: not\x1Cbroken\r\n"
48+
"Group-Separator: not\x1Dbroken\r\n"
49+
"Record-Separator: not\x1Ebroken\r\n"
50+
"Line-Separator: not\u2028broken\r\n"
51+
"Paragraph-Separator: not\u2029broken\r\n"
52+
"\r\n",
53+
policy=default,
54+
)
55+
self.assertEqual(msg.items(), [
56+
("Next-Line", "not\x85broken"),
57+
("Null", "not\x00broken"),
58+
("Vertical-Tab", "not\vbroken"),
59+
("Form-Feed", "not\fbroken"),
60+
("File-Separator", "not\x1Cbroken"),
61+
("Group-Separator", "not\x1Dbroken"),
62+
("Record-Separator", "not\x1Ebroken"),
63+
("Line-Separator", "not\u2028broken"),
64+
("Paragraph-Separator", "not\u2029broken"),
65+
])
66+
self.assertEqual(msg.get_payload(), "")
67+
68+
class TestParser(TestParserBase, TestEmailBase):
69+
parser = staticmethod(email.message_from_string)
70+
71+
class TestBytesParser(TestParserBase, TestEmailBase):
72+
def parser(self, s, *args, **kw):
73+
return email.message_from_bytes(s.encode(), *args, **kw)
74+
75+
3576
if __name__ == '__main__':
3677
unittest.main()

Lib/test/test_httplib.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,36 @@ def test_malformed_headers_coped_with(self):
283283
self.assertEqual(resp.getheader('First'), 'val')
284284
self.assertEqual(resp.getheader('Second'), 'val')
285285

286+
def test_parse_all_octets(self):
287+
# Ensure no valid header field octet breaks the parser
288+
body = (
289+
b'HTTP/1.1 200 OK\r\n'
290+
b"!#$%&'*+-.^_`|~: value\r\n" # Special token characters
291+
b'VCHAR: ' + bytes(range(0x21, 0x7E + 1)) + b'\r\n'
292+
b'obs-text: ' + bytes(range(0x80, 0xFF + 1)) + b'\r\n'
293+
b'obs-fold: text\r\n'
294+
b' folded with space\r\n'
295+
b'\tfolded with tab\r\n'
296+
b'Content-Length: 0\r\n'
297+
b'\r\n'
298+
)
299+
sock = FakeSocket(body)
300+
resp = client.HTTPResponse(sock)
301+
resp.begin()
302+
self.assertEqual(resp.getheader('Content-Length'), '0')
303+
self.assertEqual(resp.msg['Content-Length'], '0')
304+
self.assertEqual(resp.getheader("!#$%&'*+-.^_`|~"), 'value')
305+
self.assertEqual(resp.msg["!#$%&'*+-.^_`|~"], 'value')
306+
vchar = ''.join(map(chr, range(0x21, 0x7E + 1)))
307+
self.assertEqual(resp.getheader('VCHAR'), vchar)
308+
self.assertEqual(resp.msg['VCHAR'], vchar)
309+
self.assertIsNotNone(resp.getheader('obs-text'))
310+
self.assertIn('obs-text', resp.msg)
311+
for folded in (resp.getheader('obs-fold'), resp.msg['obs-fold']):
312+
self.assertTrue(folded.startswith('text'))
313+
self.assertIn(' folded with space', folded)
314+
self.assertTrue(folded.endswith('folded with tab'))
315+
286316
def test_invalid_headers(self):
287317
conn = client.HTTPConnection('example.com')
288318
conn.sock = FakeSocket('')

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ Core and Builtins
6262
Library
6363
-------
6464

65+
- Issue #22233: Break email header lines *only* on the RFC specified CR and LF
66+
characters, not on arbitrary unicode line breaks. This also fixes a bug in
67+
HTTP header parsing.
68+
6569
- Issue #27988: Fix email iter_attachments incorrect mutation of payload list.
6670

6771
- Issue #27691: Fix ssl module's parsing of GEN_RID subject alternative name

0 commit comments

Comments
 (0)