Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3b6e61e

Browse files
bpo-35883: Py_DecodeLocale() escapes invalid Unicode characters (GH-24843) (GH-24906)
Python no longer fails at startup with a fatal error if a command line argument contains an invalid Unicode character. The Py_DecodeLocale() function now escapes byte sequences which would be decoded as Unicode characters outside the [U+0000; U+10ffff] range. Use MAX_UNICODE constant in unicodeobject.c. (cherry picked from commit 9976834) Co-authored-by: Victor Stinner <[email protected]> Co-authored-by: Victor Stinner <[email protected]>
1 parent e9092b2 commit 3b6e61e

File tree

4 files changed

+148
-69
lines changed

4 files changed

+148
-69
lines changed

Lib/test/test_cmd_line.py

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -190,38 +190,72 @@ def test_undecodable_code(self):
190190
if not stdout.startswith(pattern):
191191
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
192192

193+
@unittest.skipIf(sys.platform == 'win32',
                 'Windows has a native unicode API')
def test_invalid_utf8_arg(self):
    # bpo-35883: Py_DecodeLocale() must escape b'\xfd\xbf\xbf\xbb\xba\xba'
    # byte sequence with surrogateescape rather than decoding it as the
    # U+7fffbeba character which is outside the [U+0000; U+10ffff] range of
    # Python Unicode characters.
    #
    # Test with default config, in the C locale, in the Python UTF-8 Mode.
    #
    # The child process re-encodes sys.argv[1] with os.fsencode() and prints
    # its ascii() form, so each argument must round-trip byte for byte.
    code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))'
    # Command prefix shared by the runners that need no extra interpreter
    # options.
    base_cmd = [sys.executable, '-c', code]

    def run_default(arg):
        # Inherit the environment of the test process unchanged.
        cmd = base_cmd + [arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    def run_c_locale(arg):
        # Same command, but with LC_ALL=C set in a copy of the environment.
        cmd = base_cmd + [arg]
        env = dict(os.environ)
        env['LC_ALL'] = 'C'
        return subprocess.run(cmd, stdout=subprocess.PIPE,
                              text=True, env=env)

    def run_utf8_mode(arg):
        # "-X utf8" must come before "-c", so base_cmd cannot be reused.
        cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
        return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)

    valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
    # invalid UTF-8 byte sequences with a valid UTF-8 sequence
    # in the middle.
    invalid_utf8 = (
        b'\xff'                      # invalid byte
        b'\xc3\xff'                  # invalid byte sequence
        b'\xc3\xa9'                  # valid utf-8: U+00E9 character
        b'\xed\xa0\x80'              # lone surrogate character (invalid)
        b'\xfd\xbf\xbf\xbb\xba\xba'  # character outside [U+0000; U+10ffff]
    )
    test_args = [valid_utf8, invalid_utf8]

    for run_cmd in (run_default, run_c_locale, run_utf8_mode):
        with self.subTest(run_cmd=run_cmd):
            for arg in test_args:
                proc = run_cmd(arg)
                self.assertEqual(proc.stdout.rstrip(), ascii(arg))
237+
193238
@unittest.skipUnless((sys.platform == 'darwin' or
194239
support.is_android), 'test specific to Mac OS X and Android')
195240
def test_osx_android_utf8(self):
196-
def check_output(text):
197-
decoded = text.decode('utf-8', 'surrogateescape')
198-
expected = ascii(decoded).encode('ascii') + b'\n'
241+
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
242+
code = "import sys; print(ascii(sys.argv[1]))"
199243

200-
env = os.environ.copy()
201-
# C locale gives ASCII locale encoding, but Python uses UTF-8
202-
# to parse the command line arguments on Mac OS X and Android.
203-
env['LC_ALL'] = 'C'
244+
decoded = text.decode('utf-8', 'surrogateescape')
245+
expected = ascii(decoded).encode('ascii') + b'\n'
204246

205-
p = subprocess.Popen(
206-
(sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
207-
stdout=subprocess.PIPE,
208-
env=env)
209-
stdout, stderr = p.communicate()
210-
self.assertEqual(stdout, expected)
211-
self.assertEqual(p.returncode, 0)
247+
env = os.environ.copy()
248+
# C locale gives ASCII locale encoding, but Python uses UTF-8
249+
# to parse the command line arguments on Mac OS X and Android.
250+
env['LC_ALL'] = 'C'
212251

213-
# test valid utf-8
214-
text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
215-
check_output(text)
216-
217-
# test invalid utf-8
218-
text = (
219-
b'\xff' # invalid byte
220-
b'\xc3\xa9' # valid utf-8 character
221-
b'\xc3\xff' # invalid byte sequence
222-
b'\xed\xa0\x80' # lone surrogate character (invalid)
223-
)
224-
check_output(text)
252+
p = subprocess.Popen(
253+
(sys.executable, "-c", code, text),
254+
stdout=subprocess.PIPE,
255+
env=env)
256+
stdout, stderr = p.communicate()
257+
self.assertEqual(stdout, expected)
258+
self.assertEqual(p.returncode, 0)
225259

226260
def test_unbuffered_output(self):
227261
# Test expected operation of the '-u' switch
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Python no longer fails at startup with a fatal error if a command line
2+
argument contains an invalid Unicode character. The
3+
:c:func:`Py_DecodeLocale` function now escapes byte sequences which would be
4+
decoded as Unicode characters outside the [U+0000; U+10ffff] range.

Objects/unicodeobject.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ NOTE: In the interpreter's initialization phase, some globals are currently
9090
extern "C" {
9191
#endif
9292

93-
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
93+
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
94+
// The value must be the same in fileutils.c.
9495
#define MAX_UNICODE 0x10ffff
9596

9697
#ifdef Py_DEBUG
@@ -1707,8 +1708,8 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
17071708
*maxchar = ch;
17081709
if (*maxchar > MAX_UNICODE) {
17091710
PyErr_Format(PyExc_ValueError,
1710-
"character U+%x is not in range [U+0000; U+10ffff]",
1711-
ch);
1711+
"character U+%x is not in range [U+0000; U+%x]",
1712+
ch, MAX_UNICODE);
17121713
return -1;
17131714
}
17141715
}
@@ -13610,7 +13611,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
1361013611
{
1361113612
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
1361213613
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13613-
case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13614+
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
1361413615
default:
1361513616
Py_UNREACHABLE();
1361613617
}

Python/fileutils.c

Lines changed: 79 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ extern int winerror_to_errno(int);
3333
int _Py_open_cloexec_works = -1;
3434
#endif
3535

36+
// The value must be the same in unicodeobject.c.
37+
#define MAX_UNICODE 0x10ffff
38+
39+
// mbstowcs() and mbrtowc() errors
40+
static const size_t DECODE_ERROR = ((size_t)-1);
41+
static const size_t INCOMPLETE_CHARACTER = (size_t)-2;
42+
3643

3744
static int
3845
get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
@@ -85,6 +92,57 @@ _Py_device_encoding(int fd)
8592
Py_RETURN_NONE;
8693
}
8794

95+
96+
static size_t
97+
is_valid_wide_char(wchar_t ch)
98+
{
99+
if (Py_UNICODE_IS_SURROGATE(ch)) {
100+
// Reject lone surrogate characters
101+
return 0;
102+
}
103+
if (ch > MAX_UNICODE) {
104+
// bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
105+
// The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
106+
// it creates characters outside the [U+0000; U+10ffff] range:
107+
// https://sourceware.org/bugzilla/show_bug.cgi?id=2373
108+
return 0;
109+
}
110+
return 1;
111+
}
112+
113+
114+
static size_t
115+
_Py_mbstowcs(wchar_t *dest, const char *src, size_t n)
116+
{
117+
size_t count = mbstowcs(dest, src, n);
118+
if (dest != NULL && count != DECODE_ERROR) {
119+
for (size_t i=0; i < count; i++) {
120+
wchar_t ch = dest[i];
121+
if (!is_valid_wide_char(ch)) {
122+
return DECODE_ERROR;
123+
}
124+
}
125+
}
126+
return count;
127+
}
128+
129+
130+
#ifdef HAVE_MBRTOWC
// mbrtowc() wrapper which also validates the decoded wide character.
//
// pwc must not be NULL. Return the value returned by mbrtowc(), except that
// DECODE_ERROR is returned when a character was decoded (the result is not 0,
// DECODE_ERROR or INCOMPLETE_CHARACTER) and is_valid_wide_char() rejects it.
static size_t
_Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
{
    assert(pwc != NULL);
    size_t nbytes = mbrtowc(pwc, str, len, pmbs);
    if (nbytes == 0
        || nbytes == DECODE_ERROR
        || nbytes == INCOMPLETE_CHARACTER)
    {
        // End of string or C library error: no decoded character to check.
        return nbytes;
    }
    return is_valid_wide_char(*pwc) ? nbytes : DECODE_ERROR;
}
#endif
144+
145+
88146
#if !defined(_Py_FORCE_UTF8_FS_ENCODING) && !defined(MS_WINDOWS)
89147

90148
#define USE_FORCE_ASCII
@@ -151,8 +209,8 @@ check_force_ascii(void)
151209
size_t res;
152210

153211
ch = (unsigned char)0xA7;
154-
res = mbstowcs(&wch, (char*)&ch, 1);
155-
if (res != (size_t)-1 && wch == L'\xA7') {
212+
res = _Py_mbstowcs(&wch, (char*)&ch, 1);
213+
if (res != DECODE_ERROR && wch == L'\xA7') {
156214
/* On HP-UX with the C locale or the POSIX locale,
157215
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
158216
Latin1 encoding in practice. Force ASCII in this case.
@@ -199,8 +257,8 @@ check_force_ascii(void)
199257

200258
unsigned uch = (unsigned char)i;
201259
ch[0] = (char)uch;
202-
res = mbstowcs(wch, ch, 1);
203-
if (res != (size_t)-1) {
260+
res = _Py_mbstowcs(wch, ch, 1);
261+
if (res != DECODE_ERROR) {
204262
/* decoding a non-ASCII character from the locale encoding succeeded:
205263
the locale encoding is not ASCII, force ASCII */
206264
return 1;
@@ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
390448
*/
391449
argsize = strlen(arg);
392450
#else
393-
argsize = mbstowcs(NULL, arg, 0);
451+
argsize = _Py_mbstowcs(NULL, arg, 0);
394452
#endif
395-
if (argsize != (size_t)-1) {
453+
if (argsize != DECODE_ERROR) {
396454
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
397455
return -1;
398456
}
@@ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
401459
return -1;
402460
}
403461

404-
count = mbstowcs(res, arg, argsize + 1);
405-
if (count != (size_t)-1) {
406-
wchar_t *tmp;
407-
/* Only use the result if it contains no
408-
surrogate characters. */
409-
for (tmp = res; *tmp != 0 &&
410-
!Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
411-
;
412-
if (*tmp == 0) {
413-
if (wlen != NULL) {
414-
*wlen = count;
415-
}
416-
*wstr = res;
417-
return 0;
462+
count = _Py_mbstowcs(res, arg, argsize + 1);
463+
if (count != DECODE_ERROR) {
464+
*wstr = res;
465+
if (wlen != NULL) {
466+
*wlen = count;
418467
}
468+
return 0;
419469
}
420470
PyMem_RawFree(res);
421471
}
@@ -439,46 +489,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
439489
out = res;
440490
memset(&mbs, 0, sizeof mbs);
441491
while (argsize) {
442-
size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
492+
size_t converted = _Py_mbrtowc(out, (char*)in, argsize, &mbs);
443493
if (converted == 0) {
444494
/* Reached end of string; null char stored. */
445495
break;
446496
}
447497

448-
if (converted == (size_t)-2) {
498+
if (converted == INCOMPLETE_CHARACTER) {
449499
/* Incomplete character. This should never happen,
450500
since we provide everything that we have -
451501
unless there is a bug in the C library, or I
452502
misunderstood how mbrtowc works. */
453503
goto decode_error;
454504
}
455505

456-
if (converted == (size_t)-1) {
506+
if (converted == DECODE_ERROR) {
457507
if (!surrogateescape) {
458508
goto decode_error;
459509
}
460510

461-
/* Conversion error. Escape as UTF-8b, and start over
462-
in the initial shift state. */
511+
/* Decoding error. Escape as UTF-8b, and start over in the initial
512+
shift state. */
463513
*out++ = 0xdc00 + *in++;
464514
argsize--;
465515
memset(&mbs, 0, sizeof mbs);
466516
continue;
467517
}
468518

469-
if (Py_UNICODE_IS_SURROGATE(*out)) {
470-
if (!surrogateescape) {
471-
goto decode_error;
472-
}
519+
// _Py_mbrtowc() rejects lone surrogate characters
520+
assert(!Py_UNICODE_IS_SURROGATE(*out));
473521

474-
/* Surrogate character. Escape the original
475-
byte sequence with surrogateescape. */
476-
argsize -= converted;
477-
while (converted--) {
478-
*out++ = 0xdc00 + *in++;
479-
}
480-
continue;
481-
}
482522
/* successfully converted some bytes */
483523
in += converted;
484524
argsize -= converted;
@@ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str,
655695
else {
656696
converted = wcstombs(NULL, buf, 0);
657697
}
658-
if (converted == (size_t)-1) {
698+
if (converted == DECODE_ERROR) {
659699
goto encode_error;
660700
}
661701
if (bytes != NULL) {
@@ -1371,7 +1411,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
13711411
char cmode[10];
13721412
size_t r;
13731413
r = wcstombs(cmode, mode, 10);
1374-
if (r == (size_t)-1 || r >= 10) {
1414+
if (r == DECODE_ERROR || r >= 10) {
13751415
errno = EINVAL;
13761416
return NULL;
13771417
}

0 commit comments

Comments
 (0)