Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 07ea53c

Browse files
committed
#1079: Fix parsing of encoded words.
This is a behavior change: before this change, leading and trailing spaces were stripped from ASCII parts; now they are preserved. Without this fix we didn't parse the examples in RFC 2047 correctly, so I think breaking backward compatibility here is justified. Patch by Ralf Schlatterbeck.
1 parent e11eb0f commit 07ea53c

5 files changed

Lines changed: 113 additions & 19 deletions

File tree

Lib/email/header.py

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
\? # literal ?
4141
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
4242
\?= # literal ?=
43-
(?=[ \t]|$) # whitespace or the end of the string
4443
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
4544

4645
# Field name regexp, including trailing colon, but not separating whitespace,
@@ -86,15 +85,29 @@ def decode_header(header):
8685
words = []
8786
for line in header.splitlines():
8887
parts = ecre.split(line)
88+
first = True
8989
while parts:
90-
unencoded = parts.pop(0).strip()
90+
unencoded = parts.pop(0)
91+
if first:
92+
unencoded = unencoded.lstrip()
93+
first = False
9194
if unencoded:
9295
words.append((unencoded, None, None))
9396
if parts:
9497
charset = parts.pop(0).lower()
9598
encoding = parts.pop(0).lower()
9699
encoded = parts.pop(0)
97100
words.append((encoded, encoding, charset))
101+
# Now loop over words and remove words that consist of whitespace
102+
# between two encoded strings.
103+
import sys
104+
droplist = []
105+
for n, w in enumerate(words):
106+
if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
107+
droplist.append(n-1)
108+
for d in reversed(droplist):
109+
del words[d]
110+
98111
# The next step is to decode each encoded word by applying the reverse
99112
# base64 or quopri transformation. decoded_words is now a list of the
100113
# form (decoded_word, charset).
@@ -217,22 +230,27 @@ def __str__(self):
217230
self._normalize()
218231
uchunks = []
219232
lastcs = None
233+
lastspace = None
220234
for string, charset in self._chunks:
221235
# We must preserve spaces between encoded and non-encoded word
222236
# boundaries, which means for us we need to add a space when we go
223237
# from a charset to None/us-ascii, or from None/us-ascii to a
224238
# charset. Only do this for the second and subsequent chunks.
239+
# Don't add a space if the None/us-ascii string already has
240+
# a space (trailing or leading depending on transition)
225241
nextcs = charset
226242
if nextcs == _charset.UNKNOWN8BIT:
227243
original_bytes = string.encode('ascii', 'surrogateescape')
228244
string = original_bytes.decode('ascii', 'replace')
229245
if uchunks:
246+
hasspace = string and self._nonctext(string[0])
230247
if lastcs not in (None, 'us-ascii'):
231-
if nextcs in (None, 'us-ascii'):
248+
if nextcs in (None, 'us-ascii') and not hasspace:
232249
uchunks.append(SPACE)
233250
nextcs = None
234-
elif nextcs not in (None, 'us-ascii'):
251+
elif nextcs not in (None, 'us-ascii') and not lastspace:
235252
uchunks.append(SPACE)
253+
lastspace = string and self._nonctext(string[-1])
236254
lastcs = nextcs
237255
uchunks.append(string)
238256
return EMPTYSTRING.join(uchunks)
@@ -291,6 +309,11 @@ def append(self, s, charset=None, errors='strict'):
291309
charset = UTF8
292310
self._chunks.append((s, charset))
293311

312+
def _nonctext(self, s):
313+
"""True if string s is not a ctext character of RFC822.
314+
"""
315+
return s.isspace() or s in ('(', ')', '\\')
316+
294317
def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
295318
r"""Encode a message header into an RFC-compliant format.
296319
@@ -334,7 +357,20 @@ def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
334357
maxlinelen = 1000000
335358
formatter = _ValueFormatter(self._headerlen, maxlinelen,
336359
self._continuation_ws, splitchars)
360+
lastcs = None
361+
hasspace = lastspace = None
337362
for string, charset in self._chunks:
363+
if hasspace is not None:
364+
hasspace = string and self._nonctext(string[0])
365+
import sys
366+
if lastcs not in (None, 'us-ascii'):
367+
if not hasspace or charset not in (None, 'us-ascii'):
368+
formatter.add_transition()
369+
elif charset not in (None, 'us-ascii') and not lastspace:
370+
formatter.add_transition()
371+
lastspace = string and self._nonctext(string[-1])
372+
lastcs = charset
373+
hasspace = False
338374
lines = string.splitlines()
339375
if lines:
340376
formatter.feed('', lines[0], charset)
@@ -351,6 +387,7 @@ def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
351387
formatter.feed(fws, sline, charset)
352388
if len(lines) > 1:
353389
formatter.newline()
390+
if self._chunks:
354391
formatter.add_transition()
355392
value = formatter._str(linesep)
356393
if _embeded_header.search(value):

Lib/nntplib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def decode_header(header_str):
166166
parts.append(v.decode(enc or 'ascii'))
167167
else:
168168
parts.append(v)
169-
return ' '.join(parts)
169+
return ''.join(parts)
170170

171171
def _parse_overview_fmt(lines):
172172
"""Parse a list of string representing the response to LIST OVERVIEW.FMT

Lib/test/test_email/test_asian_codecs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def test_japanese_codecs(self):
4141
Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
4242
=?iso-8859-1?q?Gr=FC=DF_Gott!?=""")
4343
eq(decode_header(h.encode()),
44-
[(b'Hello World!', None),
44+
[(b'Hello World! ', None),
4545
(b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
4646
(b'Gr\xfc\xdf Gott!', gcode)])
4747
subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5'

Lib/test/test_email/test_email.py

Lines changed: 66 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1994,9 +1994,9 @@ def test_rfc2047_multiline(self):
19941994
foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?="""
19951995
dh = decode_header(s)
19961996
eq(dh, [
1997-
(b'Re:', None),
1997+
(b'Re: ', None),
19981998
(b'r\x8aksm\x9arg\x8cs', 'mac-iceland'),
1999-
(b'baz foo bar', None),
1999+
(b' baz foo bar ', None),
20002000
(b'r\x8aksm\x9arg\x8cs', 'mac-iceland')])
20012001
header = make_header(dh)
20022002
eq(str(header),
@@ -2005,35 +2005,37 @@ def test_rfc2047_multiline(self):
20052005
Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
20062006
=?mac-iceland?q?=9Arg=8Cs?=""")
20072007

2008-
def test_whitespace_eater_unicode(self):
2008+
def test_whitespace_keeper_unicode(self):
20092009
eq = self.assertEqual
20102010
s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <[email protected]>'
20112011
dh = decode_header(s)
20122012
eq(dh, [(b'Andr\xe9', 'iso-8859-1'),
2013-
(b'Pirard <[email protected]>', None)])
2013+
(b' Pirard <[email protected]>', None)])
20142014
header = str(make_header(dh))
20152015
eq(header, 'Andr\xe9 Pirard <[email protected]>')
20162016

2017-
def test_whitespace_eater_unicode_2(self):
2017+
def test_whitespace_keeper_unicode_2(self):
20182018
eq = self.assertEqual
20192019
s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
20202020
dh = decode_header(s)
2021-
eq(dh, [(b'The', None), (b'quick brown fox', 'iso-8859-1'),
2022-
(b'jumped over the', None), (b'lazy dog', 'iso-8859-1')])
2021+
eq(dh, [(b'The ', None), (b'quick brown fox', 'iso-8859-1'),
2022+
(b' jumped over the ', None), (b'lazy dog', 'iso-8859-1')])
20232023
hu = str(make_header(dh))
20242024
eq(hu, 'The quick brown fox jumped over the lazy dog')
20252025

20262026
def test_rfc2047_missing_whitespace(self):
20272027
s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
20282028
dh = decode_header(s)
2029-
self.assertEqual(dh, [(s, None)])
2029+
self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
2030+
(b'rg', None), (b'\xe5', 'iso-8859-1'),
2031+
(b'sbord', None)])
20302032

20312033
def test_rfc2047_with_whitespace(self):
20322034
s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
20332035
dh = decode_header(s)
2034-
self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
2035-
(b'rg', None), (b'\xe5', 'iso-8859-1'),
2036-
(b'sbord', None)])
2036+
self.assertEqual(dh, [(b'Sm ', None), (b'\xf6', 'iso-8859-1'),
2037+
(b' rg ', None), (b'\xe5', 'iso-8859-1'),
2038+
(b' sbord', None)])
20372039

20382040
def test_rfc2047_B_bad_padding(self):
20392041
s = '=?iso-8859-1?B?%s?='
@@ -2051,6 +2053,57 @@ def test_rfc2047_Q_invalid_digits(self):
20512053
self.assertEqual(decode_header(s),
20522054
[(b'andr\xe9=zz', 'iso-8659-1')])
20532055

2056+
def test_rfc2047_rfc2047_1(self):
2057+
# 1st testcase at end of rfc2047
2058+
s = '(=?ISO-8859-1?Q?a?=)'
2059+
self.assertEqual(decode_header(s),
2060+
[(b'(', None), (b'a', 'iso-8859-1'), (b')', None)])
2061+
2062+
def test_rfc2047_rfc2047_2(self):
2063+
# 2nd testcase at end of rfc2047
2064+
s = '(=?ISO-8859-1?Q?a?= b)'
2065+
self.assertEqual(decode_header(s),
2066+
[(b'(', None), (b'a', 'iso-8859-1'), (b' b)', None)])
2067+
2068+
def test_rfc2047_rfc2047_3(self):
2069+
# 3rd testcase at end of rfc2047
2070+
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
2071+
self.assertEqual(decode_header(s),
2072+
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
2073+
2074+
def test_rfc2047_rfc2047_4(self):
2075+
# 4th testcase at end of rfc2047
2076+
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
2077+
self.assertEqual(decode_header(s),
2078+
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
2079+
2080+
def test_rfc2047_rfc2047_5a(self):
2081+
# 5th testcase at end of rfc2047 newline is \r\n
2082+
s = '(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)'
2083+
self.assertEqual(decode_header(s),
2084+
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
2085+
2086+
def test_rfc2047_rfc2047_5b(self):
2087+
# 5th testcase at end of rfc2047 newline is \n
2088+
s = '(=?ISO-8859-1?Q?a?=\n =?ISO-8859-1?Q?b?=)'
2089+
self.assertEqual(decode_header(s),
2090+
[(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
2091+
2092+
def test_rfc2047_rfc2047_6(self):
2093+
# 6th testcase at end of rfc2047
2094+
s = '(=?ISO-8859-1?Q?a_b?=)'
2095+
self.assertEqual(decode_header(s),
2096+
[(b'(', None), (b'a b', 'iso-8859-1'), (b')', None)])
2097+
2098+
def test_rfc2047_rfc2047_7(self):
2099+
# 7th testcase at end of rfc2047
2100+
s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)'
2101+
self.assertEqual(decode_header(s),
2102+
[(b'(', None), (b'a', 'iso-8859-1'), (b' b', 'iso-8859-2'),
2103+
(b')', None)])
2104+
self.assertEqual(make_header(decode_header(s)).encode(), s.lower())
2105+
self.assertEqual(str(make_header(decode_header(s))), '(a b)')
2106+
20542107

20552108
# Test the MIMEMessage class
20562109
class TestMIMEMessage(TestEmailBase):
@@ -4388,11 +4441,11 @@ def test_encoded_adjacent_nonencoded(self):
43884441
h = make_header(decode_header(s))
43894442
eq(h.encode(), s)
43904443

4391-
def test_whitespace_eater(self):
4444+
def test_whitespace_keeper(self):
43924445
eq = self.assertEqual
43934446
s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
43944447
parts = decode_header(s)
4395-
eq(parts, [(b'Subject:', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b'zz.', None)])
4448+
eq(parts, [(b'Subject: ', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b' zz.', None)])
43964449
hdr = make_header(parts)
43974450
eq(hdr.encode(),
43984451
'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.')

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ What's New in Python 3.3.0 Beta 1?
1010
Library
1111
-------
1212

13+
- Issue #1079: email.header.decode_header now correctly parses all the examples
14+
in RFC2047. There is a necessary visible behavior change: the leading and/or
15+
trailing whitespace on ASCII parts is now preserved.
16+
1317
- Issue #14969: Better handling of exception chaining in contextlib.ExitStack
1418

1519
- Issue #14962: Update text coloring in IDLE shell window after changing

0 commit comments

Comments
 (0)