6969
7070import re
7171import urllib # For urllib.parse.unquote
72+ from string import hexdigits
7273from collections import namedtuple , OrderedDict
7374from email import _encoded_words as _ew
7475from email import errors
@@ -391,10 +392,6 @@ class UnstructuredTokenList(TokenList):
391392 token_type = 'unstructured'
392393
393394 def _fold (self , folded ):
394- if any (x .token_type == 'encoded-word' for x in self ):
395- return self ._fold_encoded (folded )
396- # Here we can have either a pure ASCII string that may or may not
397- # have surrogateescape encoded bytes, or a unicode string.
398395 last_ew = None
399396 for part in self .parts :
400397 tstr = str (part )
@@ -1386,35 +1383,6 @@ def _get_ptext_to_endchars(value, endchars):
13861383 pos = pos + 1
13871384 return '' .join (vchars ), '' .join ([fragment [pos :]] + remainder ), had_qp
13881385
1389- def _decode_ew_run (value ):
1390- """ Decode a run of RFC2047 encoded words.
1391-
1392- _decode_ew_run(value) -> (text, value, defects)
1393-
1394- Scans the supplied value for a run of tokens that look like they are RFC
1395- 2047 encoded words, decodes those words into text according to RFC 2047
1396- rules (whitespace between encoded words is discarded), and returns the text
1397- and the remaining value (including any leading whitespace on the remaining
1398- value), as well as a list of any defects encountered while decoding. The
1399- input value may not have any leading whitespace.
1400-
1401- """
1402- res = []
1403- defects = []
1404- last_ws = ''
1405- while value :
1406- try :
1407- tok , ws , value = _wsp_splitter (value , 1 )
1408- except ValueError :
1409- tok , ws , value = value , '' , ''
1410- if not (tok .startswith ('=?' ) and tok .endswith ('?=' )):
1411- return '' .join (res ), last_ws + tok + ws + value , defects
1412- text , charset , lang , new_defects = _ew .decode (tok )
1413- res .append (text )
1414- defects .extend (new_defects )
1415- last_ws = ws
1416- return '' .join (res ), last_ws , defects
1417-
14181386def get_fws (value ):
14191387 """FWS = 1*WSP
14201388
@@ -1440,7 +1408,8 @@ def get_encoded_word(value):
14401408 raise errors .HeaderParseError (
14411409 "expected encoded word but found {}" .format (value ))
14421410 remstr = '' .join (remainder )
1443- if remstr [:2 ].isdigit ():
1411+ if len (remstr ) > 1 and remstr [0 ] in hexdigits and remstr [1 ] in hexdigits :
1412+ # The ? after the CTE was followed by an encoded word escape (=XX).
14441413 rest , * remainder = remstr .split ('?=' , 1 )
14451414 tok = tok + '?=' + rest
14461415 if len (tok .split ()) > 1 :
@@ -1488,8 +1457,8 @@ def get_unstructured(value):
14881457
14891458 """
14901459 # XXX: but what about bare CR and LF? They might signal the start or
1491- # end of an encoded word. YAGNI for now, since out current parsers
1492- # will never send us strings with bard CR or LF.
1460+ # end of an encoded word. YAGNI for now, since our current parsers
1461+ # will never send us strings with bare CR or LF.
14931462
14941463 unstructured = UnstructuredTokenList ()
14951464 while value :
@@ -1501,6 +1470,8 @@ def get_unstructured(value):
15011470 try :
15021471 token , value = get_encoded_word (value )
15031472 except errors .HeaderParseError :
1473+ # XXX: Need to figure out how to register defects when
1474+ # appropriate here.
15041475 pass
15051476 else :
15061477 have_ws = True
0 commit comments