Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit f933e1a

Browse files
author
Victor Stinner
committed
Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead of
the locale encoding. If the LANG, LC_ALL and LC_CTYPE environment variables are not set, the locale encoding is ISO-8859-1, whereas most programs (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem encoding and to encode command line arguments on this OS.
1 parent 073f759 commit f933e1a

4 files changed

Lines changed: 160 additions & 0 deletions

File tree

Lib/test/test_cmd_line.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,38 @@ def test_undecodable_code(self):
148148
if not stdout.startswith(pattern):
149149
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
150150

151+
@unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
def test_osx_utf8(self):
    # Each sample is handed, as raw bytes, to a child interpreter that
    # prints ascii(sys.argv[1]).  The child's output must match what a
    # UTF-8/surrogateescape decoding of the same bytes produces here.
    def run_and_compare(raw_arg):
        as_text = raw_arg.decode('utf8', 'surrogateescape')
        wanted = ascii(as_text).encode('ascii') + b'\n'

        # C locale gives ASCII locale encoding, but Python uses UTF-8
        # to parse the command line arguments on Mac OS X
        child_env = dict(os.environ, LC_ALL='C')

        command = (sys.executable, "-c",
                   "import sys; print(ascii(sys.argv[1]))", raw_arg)
        child = subprocess.Popen(command,
                                 stdout=subprocess.PIPE,
                                 env=child_env)
        # stderr is not piped, so only the first item is meaningful.
        output = child.communicate()[0]
        self.assertEqual(output, wanted)
        self.assertEqual(child.returncode, 0)

    # test valid utf-8
    valid_sample = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')

    # test invalid utf-8
    invalid_sample = (
        b'\xff'           # invalid byte
        b'\xc3\xa9'       # valid utf-8 character
        b'\xc3\xff'       # invalid byte sequence
        b'\xed\xa0\x80'   # lone surrogate character (invalid)
    )

    # Same order as before: a failure on the valid sample stops the test
    # before the invalid one is tried.
    for sample in (valid_sample, invalid_sample):
        run_and_compare(sample)
182+
151183
def test_unbuffered_output(self):
152184
# Test expected operation of the '-u' switch
153185
for stream in ('stdout', 'stderr'):

Misc/NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
14+
of the locale encoding. If the LANG (and LC_ALL and LC_CTYPE) environment
15+
variables are not set, the locale encoding is ISO-8859-1, whereas most programs
16+
(including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
17+
encoding and to encode command line arguments on this OS.
18+
1319
- Issue #9713, #10114: Parser functions (e.g. PyParser_ASTFromFile) expect
1420
filenames encoded to the filesystem encoding with surrogateescape error
1521
handler (to support undecodable bytes), instead of UTF-8 in strict mode.

Modules/python.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
1515
}
1616
#else
1717

18+
#ifdef __APPLE__
19+
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
20+
#endif
21+
1822
int
1923
main(int argc, char **argv)
2024
{
@@ -41,7 +45,11 @@ main(int argc, char **argv)
4145
oldloc = strdup(setlocale(LC_ALL, NULL));
4246
setlocale(LC_ALL, "");
4347
for (i = 0; i < argc; i++) {
48+
#ifdef __APPLE__
49+
argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
50+
#else
4451
argv_copy[i] = _Py_char2wchar(argv[i], NULL);
52+
#endif
4553
if (!argv_copy[i])
4654
return 1;
4755
argv_copy2[i] = argv_copy[i];

Objects/unicodeobject.c

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
27162716

27172717
#undef ASCII_CHAR_MASK
27182718

2719+
#ifdef __APPLE__
2720+
2721+
/* Simplified UTF-8 decoder using surrogateescape error handler,
2722+
used to decode the command line arguments on Mac OS X. */
2723+
2724+
wchar_t*
2725+
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2726+
{
2727+
int n;
2728+
const char *e;
2729+
wchar_t *unicode, *p;
2730+
2731+
/* Note: size will always be longer than the resulting Unicode
2732+
character count */
2733+
if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2734+
PyErr_NoMemory();
2735+
return NULL;
2736+
}
2737+
unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2738+
if (!unicode)
2739+
return NULL;
2740+
2741+
/* Unpack UTF-8 encoded data */
2742+
p = unicode;
2743+
e = s + size;
2744+
while (s < e) {
2745+
Py_UCS4 ch = (unsigned char)*s;
2746+
2747+
if (ch < 0x80) {
2748+
*p++ = (wchar_t)ch;
2749+
s++;
2750+
continue;
2751+
}
2752+
2753+
n = utf8_code_length[ch];
2754+
if (s + n > e) {
2755+
goto surrogateescape;
2756+
}
2757+
2758+
switch (n) {
2759+
case 0:
2760+
case 1:
2761+
goto surrogateescape;
2762+
2763+
case 2:
2764+
if ((s[1] & 0xc0) != 0x80)
2765+
goto surrogateescape;
2766+
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2767+
assert ((ch > 0x007F) && (ch <= 0x07FF));
2768+
*p++ = (wchar_t)ch;
2769+
break;
2770+
2771+
case 3:
2772+
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2773+
will result in surrogates in range d800-dfff. Surrogates are
2774+
not valid UTF-8 so they are rejected.
2775+
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2776+
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2777+
if ((s[1] & 0xc0) != 0x80 ||
2778+
(s[2] & 0xc0) != 0x80 ||
2779+
((unsigned char)s[0] == 0xE0 &&
2780+
(unsigned char)s[1] < 0xA0) ||
2781+
((unsigned char)s[0] == 0xED &&
2782+
(unsigned char)s[1] > 0x9F)) {
2783+
2784+
goto surrogateescape;
2785+
}
2786+
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2787+
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2788+
*p++ = (Py_UNICODE)ch;
2789+
break;
2790+
2791+
case 4:
2792+
if ((s[1] & 0xc0) != 0x80 ||
2793+
(s[2] & 0xc0) != 0x80 ||
2794+
(s[3] & 0xc0) != 0x80 ||
2795+
((unsigned char)s[0] == 0xF0 &&
2796+
(unsigned char)s[1] < 0x90) ||
2797+
((unsigned char)s[0] == 0xF4 &&
2798+
(unsigned char)s[1] > 0x8F)) {
2799+
goto surrogateescape;
2800+
}
2801+
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2802+
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2803+
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2804+
2805+
#if SIZEOF_WCHAR_T == 4
2806+
*p++ = (wchar_t)ch;
2807+
#else
2808+
/* compute and append the two surrogates: */
2809+
2810+
/* translate from 10000..10FFFF to 0..FFFF */
2811+
ch -= 0x10000;
2812+
2813+
/* high surrogate = top 10 bits added to D800 */
2814+
*p++ = (wchar_t)(0xD800 + (ch >> 10));
2815+
2816+
/* low surrogate = bottom 10 bits added to DC00 */
2817+
*p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2818+
#endif
2819+
break;
2820+
}
2821+
s += n;
2822+
continue;
2823+
2824+
surrogateescape:
2825+
*p++ = 0xDC00 + ch;
2826+
s++;
2827+
}
2828+
*p = L'\0';
2829+
return unicode;
2830+
}
2831+
2832+
#endif /* __APPLE__ */
27192833

27202834
/* Allocation strategy: if the string is short, convert into a stack buffer
27212835
and allocate exactly as much space needed at the end. Else allocate the

0 commit comments

Comments
 (0)