Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 63194a7

Browse files
committed
Merge: #18044: Fix parsing of encoded words of the form =?utf8?q?=XX...?=
2 parents f9e6672 + 65171b2 commit 63194a7

5 files changed

Lines changed: 62 additions & 40 deletions

File tree

Lib/email/_header_value_parser.py

Lines changed: 7 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969

7070
import re
7171
import urllib # For urllib.parse.unquote
72+
from string import hexdigits
7273
from collections import namedtuple, OrderedDict
7374
from email import _encoded_words as _ew
7475
from email import errors
@@ -391,10 +392,6 @@ class UnstructuredTokenList(TokenList):
391392
token_type = 'unstructured'
392393

393394
def _fold(self, folded):
394-
if any(x.token_type=='encoded-word' for x in self):
395-
return self._fold_encoded(folded)
396-
# Here we can have either a pure ASCII string that may or may not
397-
# have surrogateescape encoded bytes, or a unicode string.
398395
last_ew = None
399396
for part in self.parts:
400397
tstr = str(part)
@@ -1386,35 +1383,6 @@ def _get_ptext_to_endchars(value, endchars):
13861383
pos = pos + 1
13871384
return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
13881385

1389-
def _decode_ew_run(value):
1390-
""" Decode a run of RFC2047 encoded words.
1391-
1392-
_decode_ew_run(value) -> (text, value, defects)
1393-
1394-
Scans the supplied value for a run of tokens that look like they are RFC
1395-
2047 encoded words, decodes those words into text according to RFC 2047
1396-
rules (whitespace between encoded words is discarded), and returns the text
1397-
and the remaining value (including any leading whitespace on the remaining
1398-
value), as well as a list of any defects encountered while decoding. The
1399-
input value may not have any leading whitespace.
1400-
1401-
"""
1402-
res = []
1403-
defects = []
1404-
last_ws = ''
1405-
while value:
1406-
try:
1407-
tok, ws, value = _wsp_splitter(value, 1)
1408-
except ValueError:
1409-
tok, ws, value = value, '', ''
1410-
if not (tok.startswith('=?') and tok.endswith('?=')):
1411-
return ''.join(res), last_ws + tok + ws + value, defects
1412-
text, charset, lang, new_defects = _ew.decode(tok)
1413-
res.append(text)
1414-
defects.extend(new_defects)
1415-
last_ws = ws
1416-
return ''.join(res), last_ws, defects
1417-
14181386
def get_fws(value):
14191387
"""FWS = 1*WSP
14201388
@@ -1440,7 +1408,8 @@ def get_encoded_word(value):
14401408
raise errors.HeaderParseError(
14411409
"expected encoded word but found {}".format(value))
14421410
remstr = ''.join(remainder)
1443-
if remstr[:2].isdigit():
1411+
if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
1412+
# The ? after the CTE was followed by an encoded word escape (=XX).
14441413
rest, *remainder = remstr.split('?=', 1)
14451414
tok = tok + '?=' + rest
14461415
if len(tok.split()) > 1:
@@ -1488,8 +1457,8 @@ def get_unstructured(value):
14881457
14891458
"""
14901459
# XXX: but what about bare CR and LF? They might signal the start or
1491-
# end of an encoded word. YAGNI for now, since out current parsers
1492-
# will never send us strings with bard CR or LF.
1460+
# end of an encoded word. YAGNI for now, since our current parsers
1461+
# will never send us strings with bare CR or LF.
14931462

14941463
unstructured = UnstructuredTokenList()
14951464
while value:
@@ -1501,6 +1470,8 @@ def get_unstructured(value):
15011470
try:
15021471
token, value = get_encoded_word(value)
15031472
except errors.HeaderParseError:
1473+
# XXX: Need to figure out how to register defects when
1474+
# appropriate here.
15041475
pass
15051476
else:
15061477
have_ws = True

Lib/test/test_email/test__encoded_words.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@ def test_unknown_charset(self):
122122
# XXX Should this be a new Defect instead?
123123
defects = [errors.CharsetError])
124124

125+
def test_q_nonascii(self):
126+
self._test('=?utf-8?q?=C3=89ric?=',
127+
'Éric',
128+
charset='utf-8')
129+
125130

126131
class TestEncodeQ(TestEmailBase):
127132

Lib/test/test_email/test__header_value_parser.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,15 @@ def test_get_encoded_word_leading_internal_space(self):
170170
[],
171171
'')
172172

173+
def test_get_encoded_word_quopri_utf_escape_follows_cte(self):
174+
# Issue 18044
175+
self._test_get_x(parser.get_encoded_word,
176+
'=?utf-8?q?=C3=89ric?=',
177+
'Éric',
178+
'Éric',
179+
[],
180+
'')
181+
173182
# get_unstructured
174183

175184
def _get_unst(self, value):

Lib/test/test_email/test_headerregistry.py

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,45 @@ def test_defects_is_tuple(self):
123123
# self.assertEqual(h, value)
124124
# self.assertDefectsEqual(h.defects, [errors.ObsoleteHeaderDefect])
125125

126-
def test_RFC2047_value_decoded(self):
127-
value = '=?utf-8?q?this_is_a_test?='
128-
h = self.make_header('subject', value)
129-
self.assertEqual(h, 'this is a test')
130126

127+
@parameterize
128+
class TestUnstructuredHeader(TestHeaderBase):
131129

130+
def string_as_value(self,
131+
source,
132+
decoded,
133+
*args):
134+
l = len(args)
135+
defects = args[0] if l>0 else []
136+
header = 'Subject:' + (' ' if source else '')
137+
folded = header + (args[1] if l>1 else source) + '\n'
138+
h = self.make_header('Subject', source)
139+
self.assertEqual(h, decoded)
140+
self.assertDefectsEqual(h.defects, defects)
141+
self.assertEqual(h.fold(policy=policy.default), folded)
142+
143+
string_params = {
144+
145+
'rfc2047_simple_quopri': (
146+
'=?utf-8?q?this_is_a_test?=',
147+
'this is a test',
148+
[],
149+
'this is a test'),
150+
151+
'rfc2047_gb2312_base64': (
152+
'=?gb2312?b?1eLKx9bQzsSy4srUo6E=?=',
153+
'\u8fd9\u662f\u4e2d\u6587\u6d4b\u8bd5\uff01',
154+
[],
155+
'=?utf-8?b?6L+Z5piv5Lit5paH5rWL6K+V77yB?='),
156+
157+
'rfc2047_simple_nonascii_quopri': (
158+
'=?utf-8?q?=C3=89ric?=',
159+
'Éric'),
160+
161+
}
162+
163+
164+
@parameterize
132165
class TestDateHeader(TestHeaderBase):
133166

134167
datestring = 'Sun, 23 Sep 2001 20:10:55 -0700'

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,10 @@ Core and Builtins
151151
Library
152152
-------
153153

154+
- Issue #18044: The new email header parser was mis-parsing encoded words where
155+
an encoded character immediately followed the '?' that follows the CTE
156+
character, resulting in a decoding failure. They are now decoded correctly.
157+
154158
- Issue #18101: Tcl.split() now processes strings nested in a tuple as it
155159
does with byte strings.
156160

0 commit comments

Comments
 (0)