Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1f9d24a

Browse files
committed
Merge: #18431: Decode encoded words in atoms in new email parser.
2 parents ae95b4f + 923512f commit 1f9d24a

4 files changed

Lines changed: 73 additions & 3 deletions

File tree

Lib/email/_header_value_parser.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1624,6 +1624,7 @@ def get_quoted_string(value):
16241624
def get_atom(value):
16251625
"""atom = [CFWS] 1*atext [CFWS]
16261626
1627+
An atom could be an rfc2047 encoded word.
16271628
"""
16281629
atom = Atom()
16291630
if value and value[0] in CFWS_LEADER:
@@ -1632,7 +1633,15 @@ def get_atom(value):
16321633
if value and value[0] in ATOM_ENDS:
16331634
raise errors.HeaderParseError(
16341635
"expected atom but found '{}'".format(value))
1635-
token, value = get_atext(value)
1636+
if value.startswith('=?'):
1637+
try:
1638+
token, value = get_encoded_word(value)
1639+
except errors.HeaderParseError:
1640+
# XXX: need to figure out how to register defects when
1641+
# appropriate here.
1642+
token, value = get_atext(value)
1643+
else:
1644+
token, value = get_atext(value)
16361645
atom.append(token)
16371646
if value and value[0] in CFWS_LEADER:
16381647
token, value = get_cfws(value)
@@ -1661,12 +1670,22 @@ def get_dot_atom_text(value):
16611670
def get_dot_atom(value):
16621671
""" dot-atom = [CFWS] dot-atom-text [CFWS]
16631672
1673+
Any place we can have a dot atom, we could instead have an rfc2047 encoded
1674+
word.
16641675
"""
16651676
dot_atom = DotAtom()
16661677
if value[0] in CFWS_LEADER:
16671678
token, value = get_cfws(value)
16681679
dot_atom.append(token)
1669-
token, value = get_dot_atom_text(value)
1680+
if value.startswith('=?'):
1681+
try:
1682+
token, value = get_encoded_word(value)
1683+
except errors.HeaderParseError:
1684+
# XXX: need to figure out how to register defects when
1685+
# appropriate here.
1686+
token, value = get_dot_atom_text(value)
1687+
else:
1688+
token, value = get_dot_atom_text(value)
16701689
dot_atom.append(token)
16711690
if value and value[0] in CFWS_LEADER:
16721691
token, value = get_cfws(value)

Lib/test/test_email/test__header_value_parser.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,9 +808,13 @@ def test_get_atom_atom_ends_at_special(self):
808808
self.assertEqual(atom[2].comments, ['bar'])
809809

810810
def test_get_atom_atom_ends_at_noncfws(self):
811-
atom = self._test_get_x(parser.get_atom,
811+
self._test_get_x(parser.get_atom,
812812
'bob fred', 'bob ', 'bob ', [], 'fred')
813813

814+
def test_get_atom_rfc2047_atom(self):
815+
self._test_get_x(parser.get_atom,
816+
'=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
817+
814818
# get_dot_atom_text
815819

816820
def test_get_dot_atom_text(self):
@@ -885,6 +889,10 @@ def test_get_dot_atom_trailing_dot_raises(self):
885889
with self.assertRaises(errors.HeaderParseError):
886890
parser.get_dot_atom(' (foo) bar.bang. foo')
887891

892+
def test_get_dot_atom_rfc2047_atom(self):
893+
self._test_get_x(parser.get_dot_atom,
894+
'=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
895+
888896
# get_word (if this were black box we'd repeat all the qs/atom tests)
889897

890898
def test_get_word_atom_yields_atom(self):
@@ -2156,6 +2164,22 @@ def test_get_address_complex(self):
21562164
self.assertEqual(address[0].token_type,
21572165
'mailbox')
21582166

2167+
def test_get_address_rfc2047_display_name(self):
2168+
address = self._test_get_x(parser.get_address,
2169+
'=?utf-8?q?=C3=89ric?= <foo@example.com>',
2170+
'Éric <foo@example.com>',
2171+
'Éric <foo@example.com>',
2172+
[],
2173+
'')
2174+
self.assertEqual(address.token_type, 'address')
2175+
self.assertEqual(len(address.mailboxes), 1)
2176+
self.assertEqual(address.mailboxes,
2177+
address.all_mailboxes)
2178+
self.assertEqual(address.mailboxes[0].display_name,
2179+
'Éric')
2180+
self.assertEqual(address[0].token_type,
2181+
'mailbox')
2182+
21592183
def test_get_address_empty_group(self):
21602184
address = self._test_get_x(parser.get_address,
21612185
'Monty Python:;',

Lib/test/test_email/test_headerregistry.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,10 @@ def string_as_value(self,
158158
'=?utf-8?q?=C3=89ric?=',
159159
'Éric'),
160160

161+
'rfc2047_quopri_with_regular_text': (
162+
'The =?utf-8?q?=C3=89ric=2C?= Himself',
163+
'The Éric, Himself'),
164+
161165
}
162166

163167

@@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
11191123
'example.com',
11201124
None),
11211125

1126+
'rfc2047_atom_is_decoded':
1127+
('=?utf-8?q?=C3=89ric?= <foo@example.com>',
1128+
[],
1129+
'Éric <foo@example.com>',
1130+
'Éric',
1131+
'foo@example.com',
1132+
'foo',
1133+
'example.com',
1134+
None),
1135+
1136+
'rfc2047_atom_in_phrase_is_decoded':
1137+
('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
1138+
[],
1139+
'"The Éric, Himself" <foo@example.com>',
1140+
'The Éric, Himself',
1141+
'foo@example.com',
1142+
'foo',
1143+
'example.com',
1144+
None),
1145+
11221146
}
11231147

11241148
# XXX: Need many more examples, and in particular some with names in

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ Core and Builtins
154154
Library
155155
-------
156156

157+
- Issue #18431: The new email header parser now decodes RFC2047 encoded words
158+
in structured headers.
159+
157160
- Issue #18044: The new email header parser was mis-parsing encoded words where
158161
an encoded character immediately followed the '?' that follows the CTE
159162
character, resulting in a decoding failure. They are now decoded correctly.

0 commit comments

Comments
 (0)