#1079: Fix parsing of encoded words.

bitdancer · bitdancer · commit 07ea53cb2188 · 2012-06-02T17:56:49.000-04:00
This is a behavior change: previously, leading and trailing spaces were
stripped from ASCII parts; now they are preserved.  Without this fix we did
not parse the examples in RFC 2047 correctly, so I think breaking backward
compatibility here is justified.

Patch by Ralf Schlatterbeck.
diff --git a/Lib/email/header.py b/Lib/email/header.py
@@ -40,7 +40,6 @@
   \?                    # literal ?
   (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
   \?=                   # literal ?=
-  (?=[ \t]|$)           # whitespace or the end of the string
   ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
 
 # Field name regexp, including trailing colon, but not separating whitespace,
@@ -86,15 +85,29 @@ def decode_header(header):
     words = []
     for line in header.splitlines():
         parts = ecre.split(line)
+        first = True
         while parts:
-            unencoded = parts.pop(0).strip()
+            unencoded = parts.pop(0)
+            if first:
+                unencoded = unencoded.lstrip()
+                first = False
             if unencoded:
                 words.append((unencoded, None, None))
             if parts:
                 charset = parts.pop(0).lower()
                 encoding = parts.pop(0).lower()
                 encoded = parts.pop(0)
                 words.append((encoded, encoding, charset))
+    # Now loop over words and remove words that consist of whitespace
+    # between two encoded strings.
+    import sys
+    droplist = []
+    for n, w in enumerate(words):
+        if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
+            droplist.append(n-1)
+    for d in reversed(droplist):
+        del words[d]
+
     # The next step is to decode each encoded word by applying the reverse
     # base64 or quopri transformation.  decoded_words is now a list of the
     # form (decoded_word, charset).
@@ -217,22 +230,27 @@ def __str__(self):
         self._normalize()
         uchunks = []
         lastcs = None
+        lastspace = None
         for string, charset in self._chunks:
             # We must preserve spaces between encoded and non-encoded word
             # boundaries, which means for us we need to add a space when we go
             # from a charset to None/us-ascii, or from None/us-ascii to a
             # charset.  Only do this for the second and subsequent chunks.
+            # Don't add a space if the None/us-ascii string already has
+            # a space (trailing or leading depending on transition)
             nextcs = charset
             if nextcs == _charset.UNKNOWN8BIT:
                 original_bytes = string.encode('ascii', 'surrogateescape')
                 string = original_bytes.decode('ascii', 'replace')
             if uchunks:
+                hasspace = string and self._nonctext(string[0])
                 if lastcs not in (None, 'us-ascii'):
-                    if nextcs in (None, 'us-ascii'):
+                    if nextcs in (None, 'us-ascii') and not hasspace:
                         uchunks.append(SPACE)
                         nextcs = None
-                elif nextcs not in (None, 'us-ascii'):
+                elif nextcs not in (None, 'us-ascii') and not lastspace:
                     uchunks.append(SPACE)
+            lastspace = string and self._nonctext(string[-1])
             lastcs = nextcs
             uchunks.append(string)
         return EMPTYSTRING.join(uchunks)
@@ -291,6 +309,11 @@ def append(self, s, charset=None, errors='strict'):
                 charset = UTF8
         self._chunks.append((s, charset))
 
+    def _nonctext(self, s):
+        """True if string s is not a ctext character of RFC822.
+        """
+        return s.isspace() or s in ('(', ')', '\\')
+
     def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
         r"""Encode a message header into an RFC-compliant format.
 
@@ -334,7 +357,20 @@ def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
             maxlinelen = 1000000
         formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                     self._continuation_ws, splitchars)
+        lastcs = None
+        hasspace = lastspace = None
         for string, charset in self._chunks:
+            if hasspace is not None:
+                hasspace = string and self._nonctext(string[0])
+                import sys
+                if lastcs not in (None, 'us-ascii'):
+                    if not hasspace or charset not in (None, 'us-ascii'):
+                        formatter.add_transition()
+                elif charset not in (None, 'us-ascii') and not lastspace:
+                    formatter.add_transition()
+            lastspace = string and self._nonctext(string[-1])
+            lastcs = charset
+            hasspace = False
             lines = string.splitlines()
             if lines:
                 formatter.feed('', lines[0], charset)
@@ -351,6 +387,7 @@ def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
                     formatter.feed(fws, sline, charset)
             if len(lines) > 1:
                 formatter.newline()
+        if self._chunks:
             formatter.add_transition()
         value = formatter._str(linesep)
         if _embeded_header.search(value):
diff --git a/Lib/nntplib.py b/Lib/nntplib.py
@@ -166,7 +166,7 @@ def decode_header(header_str):
             parts.append(v.decode(enc or 'ascii'))
         else:
             parts.append(v)
-    return ' '.join(parts)
+    return ''.join(parts)
 
 def _parse_overview_fmt(lines):
     """Parse a list of string representing the response to LIST OVERVIEW.FMT
diff --git a/Lib/test/test_email/test_asian_codecs.py b/Lib/test/test_email/test_asian_codecs.py
@@ -41,7 +41,7 @@ def test_japanese_codecs(self):
 Hello World! =?iso-2022-jp?b?GyRCJU8lbSE8JW8hPCVrJUkhKhsoQg==?=
  =?iso-8859-1?q?Gr=FC=DF_Gott!?=""")
         eq(decode_header(h.encode()),
-           [(b'Hello World!', None),
+           [(b'Hello World! ', None),
             (b'\x1b$B%O%m!<%o!<%k%I!*\x1b(B', 'iso-2022-jp'),
             (b'Gr\xfc\xdf Gott!', gcode)])
         subject_bytes = (b'test-ja \xa4\xd8\xc5\xea\xb9\xc6\xa4\xb5'
diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py
@@ -1994,9 +1994,9 @@ def test_rfc2047_multiline(self):
  foo bar =?mac-iceland?q?r=8Aksm=9Arg=8Cs?="""
         dh = decode_header(s)
         eq(dh, [
-            (b'Re:', None),
+            (b'Re: ', None),
             (b'r\x8aksm\x9arg\x8cs', 'mac-iceland'),
-            (b'baz foo bar', None),
+            (b' baz foo bar ', None),
             (b'r\x8aksm\x9arg\x8cs', 'mac-iceland')])
         header = make_header(dh)
         eq(str(header),
@@ -2005,35 +2005,37 @@ def test_rfc2047_multiline(self):
 Re: =?mac-iceland?q?r=8Aksm=9Arg=8Cs?= baz foo bar =?mac-iceland?q?r=8Aksm?=
  =?mac-iceland?q?=9Arg=8Cs?=""")
 
-    def test_whitespace_eater_unicode(self):
+    def test_whitespace_keeper_unicode(self):
         eq = self.assertEqual
         s = '=?ISO-8859-1?Q?Andr=E9?= Pirard <pirard@dom.ain>'
         dh = decode_header(s)
         eq(dh, [(b'Andr\xe9', 'iso-8859-1'),
-                (b'Pirard <pirard@dom.ain>', None)])
+                (b' Pirard <pirard@dom.ain>', None)])
         header = str(make_header(dh))
         eq(header, 'Andr\xe9 Pirard <pirard@dom.ain>')
 
-    def test_whitespace_eater_unicode_2(self):
+    def test_whitespace_keeper_unicode_2(self):
         eq = self.assertEqual
         s = 'The =?iso-8859-1?b?cXVpY2sgYnJvd24gZm94?= jumped over the =?iso-8859-1?b?bGF6eSBkb2c=?='
         dh = decode_header(s)
-        eq(dh, [(b'The', None), (b'quick brown fox', 'iso-8859-1'),
-                (b'jumped over the', None), (b'lazy dog', 'iso-8859-1')])
+        eq(dh, [(b'The ', None), (b'quick brown fox', 'iso-8859-1'),
+                (b' jumped over the ', None), (b'lazy dog', 'iso-8859-1')])
         hu = str(make_header(dh))
         eq(hu, 'The quick brown fox jumped over the lazy dog')
 
     def test_rfc2047_missing_whitespace(self):
         s = 'Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord'
         dh = decode_header(s)
-        self.assertEqual(dh, [(s, None)])
+        self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
+                              (b'rg', None), (b'\xe5', 'iso-8859-1'),
+                              (b'sbord', None)])
 
     def test_rfc2047_with_whitespace(self):
         s = 'Sm =?ISO-8859-1?B?9g==?= rg =?ISO-8859-1?B?5Q==?= sbord'
         dh = decode_header(s)
-        self.assertEqual(dh, [(b'Sm', None), (b'\xf6', 'iso-8859-1'),
-                              (b'rg', None), (b'\xe5', 'iso-8859-1'),
-                              (b'sbord', None)])
+        self.assertEqual(dh, [(b'Sm ', None), (b'\xf6', 'iso-8859-1'),
+                              (b' rg ', None), (b'\xe5', 'iso-8859-1'),
+                              (b' sbord', None)])
 
     def test_rfc2047_B_bad_padding(self):
         s = '=?iso-8859-1?B?%s?='
@@ -2051,6 +2053,57 @@ def test_rfc2047_Q_invalid_digits(self):
         self.assertEqual(decode_header(s),
                         [(b'andr\xe9=zz', 'iso-8659-1')])
 
+    def test_rfc2047_rfc2047_1(self):
+        # 1st testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'a', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_2(self):
+        # 2nd testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a?= b)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'a', 'iso-8859-1'), (b' b)', None)])
+
+    def test_rfc2047_rfc2047_3(self):
+        # 3rd testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_4(self):
+        # 4th testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_5a(self):
+        # 5th testcase at end of rfc2047 newline is \r\n
+        s = '(=?ISO-8859-1?Q?a?=\r\n    =?ISO-8859-1?Q?b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_5b(self):
+        # 5th testcase at end of rfc2047 newline is \n
+        s = '(=?ISO-8859-1?Q?a?=\n    =?ISO-8859-1?Q?b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'ab', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_6(self):
+        # 6th testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a_b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'a b', 'iso-8859-1'), (b')', None)])
+
+    def test_rfc2047_rfc2047_7(self):
+        # 7th testcase at end of rfc2047
+        s = '(=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?=)'
+        self.assertEqual(decode_header(s),
+            [(b'(', None), (b'a', 'iso-8859-1'), (b' b', 'iso-8859-2'),
+             (b')', None)])
+        self.assertEqual(make_header(decode_header(s)).encode(), s.lower())
+        self.assertEqual(str(make_header(decode_header(s))), '(a b)')
+
 
 # Test the MIMEMessage class
 class TestMIMEMessage(TestEmailBase):
@@ -4388,11 +4441,11 @@ def test_encoded_adjacent_nonencoded(self):
         h = make_header(decode_header(s))
         eq(h.encode(), s)
 
-    def test_whitespace_eater(self):
+    def test_whitespace_keeper(self):
         eq = self.assertEqual
         s = 'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztk=?= =?koi8-r?q?=CA?= zz.'
         parts = decode_header(s)
-        eq(parts, [(b'Subject:', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b'zz.', None)])
+        eq(parts, [(b'Subject: ', None), (b'\xf0\xd2\xcf\xd7\xc5\xd2\xcb\xc1 \xce\xc1 \xc6\xc9\xce\xc1\xcc\xd8\xce\xd9\xca', 'koi8-r'), (b' zz.', None)])
         hdr = make_header(parts)
         eq(hdr.encode(),
            'Subject: =?koi8-r?b?8NLP18XSy8EgzsEgxsnOwczYztnK?= zz.')
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.3.0 Beta 1?
 Library
 -------
 
+- Issue #1079: email.header.decode_header now correctly parses all the examples
+  in RFC2047.  There is a necessary visible behavior change: the leading and/or
+  trailing whitespace on ASCII parts is now preserved.
+
 - Issue #14969: Better handling of exception chaining in contextlib.ExitStack
 
 - Issue #14962: Update text coloring in IDLE shell window after changing