bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)

vstinner · web-flow · commit 9976834f807e · 2021-03-17T21:46:53.000+01:00
Python no longer fails at startup with a fatal error if a command
line argument contains an invalid Unicode character.

The Py_DecodeLocale() function now escapes byte sequences which would
be decoded as Unicode characters outside the [U+0000; U+10ffff]
range.

Use MAX_UNICODE constant in unicodeobject.c.
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
@@ -200,38 +200,72 @@ def test_undecodable_code(self):
         if not stdout.startswith(pattern):
             raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
 
+    @unittest.skipIf(sys.platform == 'win32',
+                     'Windows has a native unicode API')
+    def test_invalid_utf8_arg(self):
+        # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
+        # byte sequence with surrogateescape rather than decoding it as the
+        # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
+        # Python Unicode characters.
+        #
+        # Test with default config, in the C locale, in the Python UTF-8 Mode.
+        code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
+        base_cmd = [sys.executable, '-c', code]
+
+        def run_default(arg):
+            cmd = [sys.executable, '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+        def run_c_locale(arg):
+            cmd = [sys.executable, '-c', code, arg]
+            env = dict(os.environ)
+            env['LC_ALL'] = 'C'
+            return subprocess.run(cmd, stdout=subprocess.PIPE,
+                                  text=True, env=env)
+
+        def run_utf8_mode(arg):
+            cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
+        valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+        # invalid UTF-8 byte sequences with a valid UTF-8 sequence
+        # in the middle.
+        invalid_utf8 = (
+            b'\xff'                      # invalid byte
+            b'\xc3\xff'                  # invalid byte sequence
+            b'\xc3\xa9'                  # valid utf-8: U+00E9 character
+            b'\xed\xa0\x80'              # lone surrogate character (invalid)
+            b'\xfd\xbf\xbf\xbb\xba\xba'  # character outside [U+0000; U+10ffff]
+        )
+        test_args = [valid_utf8, invalid_utf8]
+
+        for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+            with self.subTest(run_cmd=run_cmd):
+                for arg in test_args:
+                    proc = run_cmd(arg)
+                    self.assertEqual(proc.stdout.rstrip(), ascii(arg))
+
     @unittest.skipUnless((sys.platform == 'darwin' or
                 support.is_android), 'test specific to Mac OS X and Android')
     def test_osx_android_utf8(self):
-        def check_output(text):
-            decoded = text.decode('utf-8', 'surrogateescape')
-            expected = ascii(decoded).encode('ascii') + b'\n'
+        text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+        code = "import sys; print(ascii(sys.argv[1]))"
 
-            env = os.environ.copy()
-            # C locale gives ASCII locale encoding, but Python uses UTF-8
-            # to parse the command line arguments on Mac OS X and Android.
-            env['LC_ALL'] = 'C'
+        decoded = text.decode('utf-8', 'surrogateescape')
+        expected = ascii(decoded).encode('ascii') + b'\n'
 
-            p = subprocess.Popen(
-                (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
-                stdout=subprocess.PIPE,
-                env=env)
-            stdout, stderr = p.communicate()
-            self.assertEqual(stdout, expected)
-            self.assertEqual(p.returncode, 0)
+        env = os.environ.copy()
+        # C locale gives ASCII locale encoding, but Python uses UTF-8
+        # to parse the command line arguments on Mac OS X and Android.
+        env['LC_ALL'] = 'C'
 
-        # test valid utf-8
-        text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
-        check_output(text)
-
-        # test invalid utf-8
-        text = (
-            b'\xff'         # invalid byte
-            b'\xc3\xa9'     # valid utf-8 character
-            b'\xc3\xff'     # invalid byte sequence
-            b'\xed\xa0\x80' # lone surrogate character (invalid)
-        )
-        check_output(text)
+        p = subprocess.Popen(
+            (sys.executable, "-c", code, text),
+            stdout=subprocess.PIPE,
+            env=env)
+        stdout, stderr = p.communicate()
+        self.assertEqual(stdout, expected)
+        self.assertEqual(p.returncode, 0)
 
     def test_non_interactive_output_buffering(self):
         code = textwrap.dedent("""
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-03-13-13-57-21.bpo-35883.UyGpdG.rst
@@ -0,0 +1,4 @@
+Python no longer fails at startup with a fatal error if a command line
+argument contains an invalid Unicode character. The
+:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
+decoded as Unicode characters outside the [U+0000; U+10ffff] range.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
 extern "C" {
 #endif
 
-/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
+// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
+// The value must be the same in fileutils.c.
 #define MAX_UNICODE 0x10ffff
 
 #ifdef Py_DEBUG
@@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
             *maxchar = ch;
             if (*maxchar > MAX_UNICODE) {
                 PyErr_Format(PyExc_ValueError,
-                             "character U+%x is not in range [U+0000; U+10ffff]",
-                             ch);
+                             "character U+%x is not in range [U+0000; U+%x]",
+                             ch, MAX_UNICODE);
                 return -1;
             }
         }
@@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
     {
     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
-    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
+    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
     default:
         Py_UNREACHABLE();
     }
diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
 int _Py_open_cloexec_works = -1;
 #endif
 
+// The value must be the same in unicodeobject.c.
+#define MAX_UNICODE 0x10ffff
+
+// mbstowcs() and mbrtowc() errors
+static const size_t DECODE_ERROR = ((size_t)-1);
+static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
+
 
 static int
 get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
 #endif
 }
 
+
+static size_t
+is_valid_wide_char(wchar_t ch)
+{
+    if (Py_UNICODE_IS_SURROGATE(ch)) {
+        // Reject lone surrogate characters
+        return 0;
+    }
+    if (ch > MAX_UNICODE) {
+        // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
+        // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
+        // it creates characters outside the [U+0000; U+10ffff] range:
+        // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
+        return 0;
+    }
+    return 1;
+}
+
+
+static size_t
+_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
+{
+    size_t count = mbstowcs(dest, src, n);
+    if (dest != NULL && count != DECODE_ERROR) {
+        for (size_t i=0; i < count; i++) {
+            wchar_t ch = dest[i];
+            if (!is_valid_wide_char(ch)) {
+                return DECODE_ERROR;
+            }
+        }
+    }
+    return count;
+}
+
+
+#ifdef HAVE_MBRTOWC
+static size_t
+_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
+{
+    assert(pwc != NULL);
+    size_t count = mbrtowc(pwc, str, len, pmbs);
+    if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER) {
+        if (!is_valid_wide_char(*pwc)) {
+            return DECODE_ERROR;
+        }
+    }
+    return count;
+}
+#endif
+
+
 #if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)
 
 #define USE_FORCE_ASCII
@@ -148,8 +206,8 @@ check_force_ascii(void)
         size_t res;
 
         ch = (unsigned char)0xA7;
-        res = mbstowcs(&wch, (char*)&ch, 1);
-        if (res != (size_t)-1 && wch == L'\xA7') {
+        res = _Py_mbstowcs(&wch, (char*)&ch, 1);
+        if (res != DECODE_ERROR && wch == L'\xA7') {
             /* On HP-UX withe C locale or the POSIX locale,
                nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
                Latin1 encoding in practice. Force ASCII in this case.
@@ -196,8 +254,8 @@ check_force_ascii(void)
 
         unsigned uch = (unsigned char)i;
         ch[0] = (char)uch;
-        res = mbstowcs(wch, ch, 1);
-        if (res != (size_t)-1) {
+        res = _Py_mbstowcs(wch, ch, 1);
+        if (res != DECODE_ERROR) {
             /* decoding a non-ASCII character from the locale encoding succeed:
                the locale encoding is not ASCII, force ASCII */
             return 1;
@@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
      */
     argsize = strlen(arg);
 #else
-    argsize = mbstowcs(NULL, arg, 0);
+    argsize = _Py_mbstowcs(NULL, arg, 0);
 #endif
-    if (argsize != (size_t)-1) {
+    if (argsize != DECODE_ERROR) {
         if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
             return -1;
         }
@@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
             return -1;
         }
 
-        count = mbstowcs(res, arg, argsize + 1);
-        if (count != (size_t)-1) {
-            wchar_t *tmp;
-            /* Only use the result if it contains no
-               surrogate characters. */
-            for (tmp = res; *tmp != 0 &&
-                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
-                ;
-            if (*tmp == 0) {
-                if (wlen != NULL) {
-                    *wlen = count;
-                }
-                *wstr = res;
-                return 0;
+        count = _Py_mbstowcs(res, arg, argsize + 1);
+        if (count != DECODE_ERROR) {
+            *wstr = res;
+            if (wlen != NULL) {
+                *wlen = count;
             }
+            return 0;
         }
         PyMem_RawFree(res);
     }
@@ -436,46 +486,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
     out = res;
     memset(&mbs, 0, sizeof mbs);
     while (argsize) {
-        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
+        size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
         if (converted == 0) {
             /* Reached end of string; null char stored. */
             break;
         }
 
-        if (converted == (size_t)-2) {
+        if (converted == INCOMPLETE_CHARACTER) {
             /* Incomplete character. This should never happen,
                since we provide everything that we have -
                unless there is a bug in the C library, or I
                misunderstood how mbrtowc works. */
             goto decode_error;
         }
 
-        if (converted == (size_t)-1) {
+        if (converted == DECODE_ERROR) {
             if (!surrogateescape) {
                 goto decode_error;
             }
 
-            /* Conversion error. Escape as UTF-8b, and start over
-               in the initial shift state. */
+            /* Decoding error. Escape as UTF-8b, and start over in the initial
+               shift state. */
             *out++ = 0xdc00 + *in++;
             argsize--;
             memset(&mbs, 0, sizeof mbs);
             continue;
         }
 
-        if (Py_UNICODE_IS_SURROGATE(*out)) {
-            if (!surrogateescape) {
-                goto decode_error;
-            }
+        // _Py_mbrtowc() reject lone surrogate characters
+        assert(!Py_UNICODE_IS_SURROGATE(*out));
 
-            /* Surrogate character.  Escape the original
-               byte sequence with surrogateescape. */
-            argsize -= converted;
-            while (converted--) {
-                *out++ = 0xdc00 + *in++;
-            }
-            continue;
-        }
         /* successfully converted some bytes */
         in += converted;
         argsize -= converted;
@@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
                 else {
                     converted = wcstombs(NULL, buf, 0);
                 }
-                if (converted == (size_t)-1) {
+                if (converted == DECODE_ERROR) {
                     goto encode_error;
                 }
                 if (bytes != NULL) {
@@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
     char cmode[10];
     size_t r;
     r = wcstombs(cmode, mode, 10);
-    if (r == (size_t)-1 || r >= 10) {
+    if (r == DECODE_ERROR || r >= 10) {
         errno = EINVAL;
         return NULL;
     }