From 5e86f3136669624cc3a03a647acb7ba59142be35 Mon Sep 17 00:00:00 2001 From: Daniel Lenski Date: Mon, 10 Jan 2022 19:32:00 -0800 Subject: [PATCH 1/3] gh-67022: Document bytes/str inconsistency in email.header.decode_header() This function's possible return types have been surprising and error-prone for the entirety of its Python 3.x history. It can return either: 1. `typing.List[typing.Tuple[bytes, typing.Optional[str]]]` of length >=1 2. or `typing.List[typing.Tuple[str, None]]`, of length exactly 1 This means that any user of this function must be prepared to accept either `bytes` or `str` for the first member of the 2-tuples it returns, which is a very surprising behavior in Python 3.x, particularly given that the second member of the tuple is supposed to represent the charset/encoding of the first member. This patch documents the behavior of this function, and adds test cases to demonstrate it. As discussed in bpo-22833, this cannot be changed in a backwards-compatible way, and some users of this function depend precisely on the existing behavior. --- Doc/library/email.header.rst | 26 ++++++++++++++----- Lib/email/header.py | 11 +++++--- Lib/test/test_email/test_email.py | 12 +++++++++ .../2022-01-11-21-40-14.bpo-22833.WB-JWw.rst | 1 + 4 files changed, 40 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst index 219fad0d2f6745..604eedd9200606 100644 --- a/Doc/library/email.header.rst +++ b/Doc/library/email.header.rst @@ -178,16 +178,31 @@ The :mod:`email.header` module also provides the following convenient functions. Decode a message header value without converting the character set. The header value is in *header*. - This function returns a list of ``(decoded_string, charset)`` pairs containing - each of the decoded parts of the header. 
*charset* is ``None`` for non-encoded - parts of the header, otherwise a lower case string containing the name of the - character set specified in the encoded string. + For historical reasons, this function may return either: - Here's an example:: + 1. A list of pairs containing each of the decoded parts of the header, + ``(decoded_bytes, charset)``, where *decoded_bytes* is always an instance of + :class:`bytes`, and *charset* is either: + + - A lower case string containing the name of the character set specified. + + - ``None`` for non-encoded parts of the header. + + 2. A list of length 1 containing a pair ``(string, None)``, where + *string* is always an instance of :class:`str`. + + An :exc:`email.errors.HeaderParseError` may be raised when certain decoding + errors occur (e.g. a base64 decoding exception). + + Here are examples: >>> from email.header import decode_header >>> decode_header('=?iso-8859-1?q?p=F6stal?=') [(b'p\xf6stal', 'iso-8859-1')] + >>> decode_header('unencoded_string') + [('unencoded_string', None)] + >>> decode_header('bar =?utf-8?B?ZsOzbw==?=') + [(b'bar ', None), (b'f\xc3\xb3o', 'utf-8')] .. function:: make_header(decoded_seq, maxlinelen=None, header_name=None, continuation_ws=' ') @@ -202,4 +217,3 @@ The :mod:`email.header` module also provides the following convenient functions. This function takes one of those sequence of pairs and returns a :class:`Header` instance. Optional *maxlinelen*, *header_name*, and *continuation_ws* are as in the :class:`Header` constructor. - diff --git a/Lib/email/header.py b/Lib/email/header.py index 113a81f41314ec..783d0c15464fb8 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -59,10 +59,13 @@ def decode_header(header): """Decode a message header value without converting charset. - Returns a list of (string, charset) pairs containing each of the decoded - parts of the header. 
Charset is None for non-encoded parts of the header, - otherwise a lower-case string containing the name of the character set - specified in the encoded string. + For historical reasons, this function may return either: + + 1. A list of length 1 containing a pair (str, None). + 2. A list of (bytes, charset) pairs containing each of the decoded + parts of the header. Charset is None for non-encoded parts of the header, + otherwise a lower-case string containing the name of the character set + specified in the encoded string. header may be a string that may or may not contain RFC2047 encoded words, or it may be a Header object. diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 8765d121fd0813..b8116d073a2670 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -2568,6 +2568,18 @@ def test_multiline_header(self): self.assertEqual(str(make_header(decode_header(s))), '"Müller T" ') + def test_unencoded_ascii(self): + # bpo-22833/gh-67022: returns [(str, None)] rather than [(bytes, None)] + s = 'header without encoded words' + self.assertEqual(decode_header(s), + [('header without encoded words', None)]) + + def test_unencoded_utf8(self): + # bpo-22833/gh-67022: returns [(str, None)] rather than [(bytes, None)] + s = 'header with unexpected non ASCII caract\xe8res' + self.assertEqual(decode_header(s), + [('header with unexpected non ASCII caract\xe8res', None)]) + # Test the MIMEMessage class class TestMIMEMessage(TestEmailBase): diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst new file mode 100644 index 00000000000000..edebe73755cd19 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst @@ -0,0 +1 @@ +The inconsistent return types of :func:`email.header.decode_header` are now documented. 
From 9c0c56f672a21b382bf97ca0478fc8ba34badefe Mon Sep 17 00:00:00 2001 From: Daniel Lenski Date: Wed, 11 Jun 2025 13:15:20 -0700 Subject: [PATCH 2/3] Add warnings about obsolescence of 'email.header.decode_header' and 'email.header.make_header' functions. Recommend use of `email.headerregistry.HeaderRegistry` instead, as suggested in https://github.com/python/cpython/pull/92900#discussion_r1112472177 --- Doc/library/email.header.rst | 10 ++++++++++ Lib/email/header.py | 6 ++++++ .../Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst | 5 ++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Doc/library/email.header.rst b/Doc/library/email.header.rst index 604eedd9200606..c3392a62b8ee79 100644 --- a/Doc/library/email.header.rst +++ b/Doc/library/email.header.rst @@ -204,6 +204,11 @@ The :mod:`email.header` module also provides the following convenient functions. >>> decode_header('bar =?utf-8?B?ZsOzbw==?=') [(b'bar ', None), (b'f\xc3\xb3o', 'utf-8')] + .. note:: + + This function exists for backwards compatibility only. For + new code, we recommend using :class:`email.headerregistry.HeaderRegistry`. + .. function:: make_header(decoded_seq, maxlinelen=None, header_name=None, continuation_ws=' ') @@ -217,3 +222,8 @@ The :mod:`email.header` module also provides the following convenient functions. This function takes one of those sequence of pairs and returns a :class:`Header` instance. Optional *maxlinelen*, *header_name*, and *continuation_ws* are as in the :class:`Header` constructor. + + .. note:: + + This function exists for backwards compatibility only, and is + not recommended for use in new code. diff --git a/Lib/email/header.py b/Lib/email/header.py index 783d0c15464fb8..220a84a7454b21 100644 --- a/Lib/email/header.py +++ b/Lib/email/header.py @@ -72,6 +72,9 @@ def decode_header(header): An email.errors.HeaderParseError may be raised when certain decoding error occurs (e.g. a base64 decoding exception). 
+ + This function exists for backwards compatibility only. For new code, we + recommend using email.headerregistry.HeaderRegistry instead. """ # If it is a Header object, we can just return the encoded chunks. if hasattr(header, '_chunks'): @@ -164,6 +167,9 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None, This function takes one of those sequence of pairs and returns a Header instance. Optional maxlinelen, header_name, and continuation_ws are as in the Header constructor. + + This function exists for backwards compatibility only, and is not + recommended for use in new code. """ h = Header(maxlinelen=maxlinelen, header_name=header_name, continuation_ws=continuation_ws) diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst index edebe73755cd19..a06190c5e5cfdf 100644 --- a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst +++ b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst @@ -1 +1,4 @@ -The inconsistent return types of :func:`email.header.decode_header` are now documented. +The inconsistent return types of :func:`email.header.decode_header` are now documented, +and the use of :func:`email.header.decode_header` and :func:`email.header.make_header` +are discouraged, with :class:`email.headerregistry.HeaderRegistry` recommended as a +replacement. From 1216bde7391b9669e4846cdc74c4594e93662785 Mon Sep 17 00:00:00 2001 From: Daniel Lenski Date: Fri, 13 Jun 2025 21:47:06 -0700 Subject: [PATCH 3/3] Removed NEWS item Per https://github.com/python/cpython/pull/92900#discussion_r2141163470, not wanted for doc-only PRs. 
--- .../next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst diff --git a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst b/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst deleted file mode 100644 index a06190c5e5cfdf..00000000000000 --- a/Misc/NEWS.d/next/Library/2022-01-11-21-40-14.bpo-22833.WB-JWw.rst +++ /dev/null @@ -1,4 +0,0 @@ -The inconsistent return types of :func:`email.header.decode_header` are now documented, -and the use of :func:`email.header.decode_header` and :func:`email.header.make_header` -are discouraged, with :class:`email.headerregistry.HeaderRegistry` recommended as a -replacement.