Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9976834

Browse files
authored
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843)
Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c.
1 parent 6086ae7 commit 9976834

File tree

4 files changed

+148
-69
lines changed

4 files changed

+148
-69
lines changed

Lib/test/test_cmd_line.py

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -200,38 +200,72 @@ def test_undecodable_code(self):
200200
if not stdout.startswith(pattern):
201201
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
202202

203+
@unittest.skipIf(sys.platform == 'win32',
                 'Windows has a native unicode API')
def test_invalid_utf8_arg(self):
    """Check that undecodable command line arguments round-trip as bytes.

    Each argument is passed as bytes to a child interpreter, which prints
    ascii(os.fsencode(sys.argv[1])). The output must equal ascii() of the
    original bytes.
    """
    # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
    # byte sequence with surrogateescape rather than decoding it as the
    # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
    # Python Unicode characters.
    #
    # Test with default config, in the C locale, in the Python UTF-8 Mode.
    code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'

    def run_default(arg):
        # Inherit the environment of the test process unchanged.
        cmd = [sys.executable, '-c', code, arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    def run_c_locale(arg):
        # Same command, with LC_ALL=C set in the child's environment.
        cmd = [sys.executable, '-c', code, arg]
        env = dict(os.environ)
        env['LC_ALL'] = 'C'
        return subprocess.run(cmd, stdout=subprocess.PIPE,
                              text=True, env=env)

    def run_utf8_mode(arg):
        # Enable the Python UTF-8 Mode with the -X utf8 option.
        cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
    # invalid UTF-8 byte sequences with a valid UTF-8 sequence
    # in the middle.
    invalid_utf8 = (
        b'\xff'                      # invalid byte
        b'\xc3\xff'                  # invalid byte sequence
        b'\xc3\xa9'                  # valid utf-8: U+00E9 character
        b'\xed\xa0\x80'              # lone surrogate character (invalid)
        b'\xfd\xbf\xbf\xbb\xba\xba'  # character outside [U+0000; U+10ffff]
    )
    test_args = [valid_utf8, invalid_utf8]

    for run_cmd in (run_default, run_c_locale, run_utf8_mode):
        for arg in test_args:
            # One sub-test per (runner, argument) pair: a failure on one
            # argument does not hide the other, and the failing argument
            # is part of the report.
            with self.subTest(run_cmd=run_cmd.__name__, arg=arg):
                proc = run_cmd(arg)
                self.assertEqual(proc.stdout.rstrip(), ascii(arg))
247+
203248
@unittest.skipUnless((sys.platform == 'darwin' or
204249
support.is_android), 'test specific to Mac OS X and Android')
205250
def test_osx_android_utf8(self):
206-
def check_output(text):
207-
decoded = text.decode('utf-8', 'surrogateescape')
208-
expected = ascii(decoded).encode('ascii') + b'\n'
251+
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
252+
code = "import sys; print(ascii(sys.argv[1]))"
209253

210-
env = os.environ.copy()
211-
# C locale gives ASCII locale encoding, but Python uses UTF-8
212-
# to parse the command line arguments on Mac OS X and Android.
213-
env['LC_ALL'] = 'C'
254+
decoded = text.decode('utf-8', 'surrogateescape')
255+
expected = ascii(decoded).encode('ascii') + b'\n'
214256

215-
p = subprocess.Popen(
216-
(sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
217-
stdout=subprocess.PIPE,
218-
env=env)
219-
stdout, stderr = p.communicate()
220-
self.assertEqual(stdout, expected)
221-
self.assertEqual(p.returncode, 0)
257+
env = os.environ.copy()
258+
# C locale gives ASCII locale encoding, but Python uses UTF-8
259+
# to parse the command line arguments on Mac OS X and Android.
260+
env['LC_ALL'] = 'C'
222261

223-
# test valid utf-8
224-
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
225-
check_output(text)
226-
227-
# test invalid utf-8
228-
text = (
229-
b'\xff' # invalid byte
230-
b'\xc3\xa9' # valid utf-8 character
231-
b'\xc3\xff' # invalid byte sequence
232-
b'\xed\xa0\x80' # lone surrogate character (invalid)
233-
)
234-
check_output(text)
262+
p = subprocess.Popen(
263+
(sys.executable, "-c", code, text),
264+
stdout=subprocess.PIPE,
265+
env=env)
266+
stdout, stderr = p.communicate()
267+
self.assertEqual(stdout, expected)
268+
self.assertEqual(p.returncode, 0)
235269

236270
def test_non_interactive_output_buffering(self):
237271
code = textwrap.dedent("""
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Python no longer fails at startup with a fatal error if a command line
2+
argument contains an invalid Unicode character. The
3+
:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
4+
decoded as Unicode characters outside the [U+0000; U+10ffff] range.

Objects/unicodeobject.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
9494
extern "C" {
9595
#endif
9696

97-
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
97+
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
98+
// The value must be the same in fileutils.c.
9899
#define MAX_UNICODE 0x10ffff
99100

100101
#ifdef Py_DEBUG
@@ -1784,8 +1785,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
17841785
*maxchar = ch;
17851786
if (*maxchar > MAX_UNICODE) {
17861787
PyErr_Format(PyExc_ValueError,
1787-
"character U+%x is not in range [U+0000; U+10ffff]",
1788-
ch);
1788+
"character U+%x is not in range [U+0000; U+%x]",
1789+
ch, MAX_UNICODE);
17891790
return -1;
17901791
}
17911792
}
@@ -14089,7 +14090,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
1408914090
{
1409014091
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
1409114092
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14092-
case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14093+
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
1409314094
default:
1409414095
Py_UNREACHABLE();
1409514096
}

Python/fileutils.c

Lines changed: 79 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
3434
int _Py_open_cloexec_works = -1;
3535
#endif
3636

37+
// The value must be the same in unicodeobject.c.
38+
#define MAX_UNICODE 0x10ffff
39+
40+
// mbstowcs() and mbrtowc() errors
41+
static const size_t DECODE_ERROR = ((size_t)-1);
42+
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
43+
3744

3845
static int
3946
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
8289
#endif
8390
}
8491

92+
93+
static size_t
94+
is_valid_wide_char(wchar_t ch)
95+
{
96+
if (Py_UNICODE_IS_SURROGATE(ch)) {
97+
// Reject lone surrogate characters
98+
return 0;
99+
}
100+
if (ch > MAX_UNICODE) {
101+
// bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
102+
// The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
103+
// it creates characters outside the [U+0000; U+10ffff] range:
104+
// https://sourceware.org/bugzilla/show_bug.cgi?id=2373
105+
return 0;
106+
}
107+
return 1;
108+
}
109+
110+
111+
static size_t
112+
_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
113+
{
114+
size_t count = mbstowcs(dest, src, n);
115+
if (dest != NULL && count != DECODE_ERROR) {
116+
for (size_t i=0; i < count; i++) {
117+
wchar_t ch = dest[i];
118+
if (!is_valid_wide_char(ch)) {
119+
return DECODE_ERROR;
120+
}
121+
}
122+
}
123+
return count;
124+
}
125+
126+
127+
#ifdef HAVE_MBRTOWC
/* Wrapper around mbrtowc() which also validates the decoded character.

   pwc must not be NULL. Return the value returned by mbrtowc(), except that
   DECODE_ERROR is returned when a character was decoded and
   is_valid_wide_char() rejects it.

   Results of 0, DECODE_ERROR and INCOMPLETE_CHARACTER are returned unchanged
   without inspecting *pwc. */
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);

    size_t nbytes = mbrtowc(pwc, str, len, pmbs);
    if (nbytes == 0 || nbytes == DECODE_ERROR
        || nbytes == INCOMPLETE_CHARACTER)
    {
        return nbytes;
    }

    if (is_valid_wide_char(*pwc)) {
        return nbytes;
    }
    return DECODE_ERROR;
}
#endif
141+
142+
85143
#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)
86144

87145
#define USE_FORCE_ASCII
@@ -148,8 +206,8 @@ check_force_ascii(void)
148206
size_t res;
149207

150208
ch = (unsigned char)0xA7;
151-
res = mbstowcs(&wch, (char*)&ch, 1);
152-
if (res != (size_t)-1 && wch == L'\xA7') {
209+
res = _Py_mbstowcs(&wch, (char*)&ch, 1);
210+
if (res != DECODE_ERROR && wch == L'\xA7') {
153211
/* On HP-UX with the C locale or the POSIX locale,
154212
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
155213
Latin1 encoding in practice. Force ASCII in this case.
@@ -196,8 +254,8 @@ check_force_ascii(void)
196254

197255
unsigned uch = (unsigned char)i;
198256
ch[0] = (char)uch;
199-
res = mbstowcs(wch, ch, 1);
200-
if (res != (size_t)-1) {
257+
res = _Py_mbstowcs(wch, ch, 1);
258+
if (res != DECODE_ERROR) {
201259
/* decoding a non-ASCII character from the locale encoding succeeded:
202260
the locale encoding is not ASCII, force ASCII */
203261
return 1;
@@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
387445
*/
388446
argsize = strlen(arg);
389447
#else
390-
argsize = mbstowcs(NULL, arg, 0);
448+
argsize = _Py_mbstowcs(NULL, arg, 0);
391449
#endif
392-
if (argsize != (size_t)-1) {
450+
if (argsize != DECODE_ERROR) {
393451
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
394452
return -1;
395453
}
@@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
398456
return -1;
399457
}
400458

401-
count = mbstowcs(res, arg, argsize + 1);
402-
if (count != (size_t)-1) {
403-
wchar_t *tmp;
404-
/* Only use the result if it contains no
405-
surrogate characters. */
406-
for (tmp = res; *tmp != 0 &&
407-
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
408-
;
409-
if (*tmp == 0) {
410-
if (wlen != NULL) {
411-
*wlen = count;
412-
}
413-
*wstr = res;
414-
return 0;
459+
count = _Py_mbstowcs(res, arg, argsize + 1);
460+
if (count != DECODE_ERROR) {
461+
*wstr = res;
462+
if (wlen != NULL) {
463+
*wlen = count;
415464
}
465+
return 0;
416466
}
417467
PyMem_RawFree(res);
418468
}
@@ -436,46 +486,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
436486
out = res;
437487
memset(&mbs, 0, sizeof mbs);
438488
while (argsize) {
439-
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
489+
size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
440490
if (converted == 0) {
441491
/* Reached end of string; null char stored. */
442492
break;
443493
}
444494

445-
if (converted == (size_t)-2) {
495+
if (converted == INCOMPLETE_CHARACTER) {
446496
/* Incomplete character. This should never happen,
447497
since we provide everything that we have -
448498
unless there is a bug in the C library, or I
449499
misunderstood how mbrtowc works. */
450500
goto decode_error;
451501
}
452502

453-
if (converted == (size_t)-1) {
503+
if (converted == DECODE_ERROR) {
454504
if (!surrogateescape) {
455505
goto decode_error;
456506
}
457507

458-
/* Conversion error. Escape as UTF-8b, and start over
459-
in the initial shift state. */
508+
/* Decoding error. Escape as UTF-8b, and start over in the initial
509+
shift state. */
460510
*out++ = 0xdc00 + *in++;
461511
argsize--;
462512
memset(&mbs, 0, sizeof mbs);
463513
continue;
464514
}
465515

466-
if (Py_UNICODE_IS_SURROGATE(*out)) {
467-
if (!surrogateescape) {
468-
goto decode_error;
469-
}
516+
// _Py_mbrtowc() rejects lone surrogate characters
517+
assert(!Py_UNICODE_IS_SURROGATE(*out));
470518

471-
/* Surrogate character. Escape the original
472-
byte sequence with surrogateescape. */
473-
argsize -= converted;
474-
while (converted--) {
475-
*out++ = 0xdc00 + *in++;
476-
}
477-
continue;
478-
}
479519
/* successfully converted some bytes */
480520
in += converted;
481521
argsize -= converted;
@@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
652692
else {
653693
converted = wcstombs(NULL, buf, 0);
654694
}
655-
if (converted == (size_t)-1) {
695+
if (converted == DECODE_ERROR) {
656696
goto encode_error;
657697
}
658698
if (bytes != NULL) {
@@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
14401480
char cmode[10];
14411481
size_t r;
14421482
r = wcstombs(cmode, mode, 10);
1443-
if (r == (size_t)-1 || r >= 10) {
1483+
if (r == DECODE_ERROR || r >= 10) {
14441484
errno = EINVAL;
14451485
return NULL;
14461486
}

0 commit comments

Comments
 (0)