sanitize charset names in email

python · malemburg · Nov 9, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025
commit 95f2e65dbdee909c88cd8b6276ad9c803c4115cb
@@ -796,6 +796,7 @@ def params(self):
                         value = urllib.parse.unquote(value, encoding='latin-1')
                     else:
                         try:
+                            charset = utils._sanitize_charset_name(charset, 'us-ascii')
                             value = value.decode(charset, 'surrogateescape')
                         except (LookupError, UnicodeEncodeError):
                             # XXX: there should really be a custom defect for

@@ -446,6 +446,15 @@ def decode_params(params):
                 new_params.append((name, '"%s"' % value))
     return new_params
 
+def _sanitize_charset_name(charset, fallback_charset):
+    """Return *charset* with every non-ASCII character removed.
+
+    This also strips the lone surrogates U+DC80..U+DCFF, which is what the
+    'surrogateescape' error handler leaves behind for undecodable bytes.
+
+    A falsy *charset* (None or '') is returned unchanged.  If no character
+    survives the filtering, *fallback_charset* is returned instead.
+    """
+    if not charset:
+        return charset
+    # str.isascii() is true only for code points below U+0080, so it
+    # already rejects the surrogate escapes; a separate range check on
+    # U+DC80..U+DCFF would be dead code.
+    sanitized = ''.join(c for c in charset if c.isascii())
+    return sanitized or fallback_charset
+
 def collapse_rfc2231_value(value, errors='replace',
                            fallback_charset='us-ascii'):
     if not isinstance(value, tuple) or len(value) != 3:
@@ -458,6 +467,7 @@ def collapse_rfc2231_value(value, errors='replace',
         # Issue 17369: if charset/lang is None, decode_rfc2231 couldn't parse
         # the value, so use the fallback_charset.
         charset = fallback_charset
+    charset = _sanitize_charset_name(charset, fallback_charset)
     rawbytes = bytes(text, 'raw-unicode-escape')
     try:
         return str(rawbytes, charset, errors)