Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7ae320d

Browse files
committed
(Merge 3.2) Issue #16455: On FreeBSD and Solaris, if the locale is C, the
ASCII/surrogateescape codec is now used, instead of the locale encoding, to decode the command line arguments. This change fixes inconsistencies with os.fsencode() and os.fsdecode() because these operating systems announce an ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
2 parents 791e464 + 20b654a commit 7ae320d

3 files changed

Lines changed: 227 additions & 27 deletions

File tree

Misc/NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ What's New in Python 3.3.1?
1212
Core and Builtins
1313
-----------------
1414

15+
- Issue #16455: On FreeBSD and Solaris, if the locale is C, the
16+
ASCII/surrogateescape codec is now used, instead of the locale encoding, to
17+
decode the command line arguments. This change fixes inconsistencies with
18+
os.fsencode() and os.fsdecode() because these operating systems announce an
19+
ASCII locale encoding, whereas the ISO-8859-1 encoding is used in practice.
20+
1521
- Issue #16761: Calling int() with base argument only now raises TypeError.
1622

1723
- Issue #16759: Support the full DWORD (unsigned long) range in Reg2Py

Objects/unicodeobject.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3045,8 +3045,8 @@ PyUnicode_FromEncodedObject(register PyObject *obj,
30453045
/* Convert encoding to lower case and replace '_' with '-' in order to
30463046
catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1),
30473047
1 on success. */
3048-
static int
3049-
normalize_encoding(const char *encoding,
3048+
int
3049+
_Py_normalize_encoding(const char *encoding,
30503050
char *lower,
30513051
size_t lower_len)
30523052
{
@@ -3090,7 +3090,7 @@ PyUnicode_Decode(const char *s,
30903090
char lower[11]; /* Enough for any encoding shortcut */
30913091

30923092
/* Shortcuts for common default encodings */
3093-
if (normalize_encoding(encoding, lower, sizeof(lower))) {
3093+
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
30943094
if ((strcmp(lower, "utf-8") == 0) ||
30953095
(strcmp(lower, "utf8") == 0))
30963096
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
@@ -3455,7 +3455,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
34553455
}
34563456

34573457
/* Shortcuts for common default encodings */
3458-
if (normalize_encoding(encoding, lower, sizeof(lower))) {
3458+
if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
34593459
if ((strcmp(lower, "utf-8") == 0) ||
34603460
(strcmp(lower, "utf8") == 0))
34613461
{

Python/fileutils.c

Lines changed: 217 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#endif
66

77
#ifdef HAVE_LANGINFO_H
8+
#include <locale.h>
89
#include <langinfo.h>
910
#endif
1011

@@ -42,7 +43,182 @@ _Py_device_encoding(int fd)
4243
Py_RETURN_NONE;
4344
}
4445

45-
#ifdef HAVE_STAT
46+
#if !defined(__APPLE__) && !defined(MS_WINDOWS)
47+
extern int _Py_normalize_encoding(const char *, char *, size_t);
48+
49+
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
50+
On these operating systems, nl_langinfo(CODESET) announces an alias of the
51+
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
52+
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
53+
locale.getpreferredencoding() codec. For example, if command line arguments
54+
are decoded by mbstowcs() and encoded back by os.fsencode(), we get a
55+
UnicodeEncodeError instead of retrieving the original byte string.
56+
57+
The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C",
58+
nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least
59+
one byte in range 0x80-0xff can be decoded from the locale encoding. The
60+
workaround is also enabled on error, for example if getting the locale
61+
failed.
62+
63+
Values of locale_is_ascii:
64+
65+
1: the workaround is used: _Py_wchar2char() uses
66+
encode_ascii_surrogateescape() and _Py_char2wchar() uses
67+
decode_ascii_surrogateescape()
68+
0: the workaround is not used: _Py_wchar2char() uses wcstombs() and
69+
_Py_char2wchar() uses mbstowcs()
70+
-1: unknown, need to call check_force_ascii() to get the value
71+
*/
72+
static int force_ascii = -1;
73+
74+
/* Decide whether the ASCII/surrogateescape codec must be used instead of the
   locale codec (mbstowcs() / wcstombs()).

   Return 0 when the locale codec can be trusted:
     - the LC_CTYPE locale is not "C", or
     - nl_langinfo(CODESET) announces something other than ASCII, or
     - mbstowcs() rejects every byte in the range 0x80-0xff (the locale
       encoding really is ASCII).

   Return 1 (force ASCII) when:
     - the LC_CTYPE locale is "C", the announced codeset is ASCII or one of
       its aliases, and mbstowcs() nevertheless decodes at least one byte in
       the range 0x80-0xff, or
     - the LC_CTYPE locale is "C" and nl_langinfo(CODESET) is not available
       at build time, or
     - an error occurred (the locale could not be queried, the codeset is
       missing or empty, or its name could not be normalized). */
static int
check_force_ascii(void)
{
    const char *loc;
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
    /* Normalized (lower case, '_' replaced with '-') names of ASCII. */
    static const char * const ascii_aliases[] = {
        "ascii",
        "646",
        "ansi-x3.4-1968",
        "ansi-x3-4-1968",
        "ansi-x3.4-1986",
        "cp367",
        "csascii",
        "ibm367",
        "iso646-us",
        "iso-646.irv-1991",
        "iso-ir-6",
        "us",
        "us-ascii",
        NULL
    };
    const char *codeset;
    const char * const *alias;
    char encoding[100];
    int is_ascii;
    unsigned int i;
#endif

    loc = setlocale(LC_CTYPE, NULL);
    if (loc == NULL)
        goto error;
    if (strcmp(loc, "C") != 0) {
        /* the LC_CTYPE locale is different than C */
        return 0;
    }

#if defined(HAVE_LANGINFO_H) && defined(CODESET)
    codeset = nl_langinfo(CODESET);
    if (!codeset || codeset[0] == '\0') {
        /* CODESET is not set or empty */
        goto error;
    }
    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
        goto error;

    is_ascii = 0;
    for (alias = ascii_aliases; *alias != NULL; alias++) {
        if (strcmp(encoding, *alias) == 0) {
            is_ascii = 1;
            break;
        }
    }
    if (!is_ascii) {
        /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
        return 0;
    }

    /* Probe every non-ASCII byte, 0xff included.  The loop counter is an
       unsigned int, so "i <= 0xff" terminates when i reaches 0x100. */
    for (i = 0x80; i <= 0xff; i++) {
        char ch[2];
        wchar_t wch;
        size_t res;

        /* mbstowcs() expects a NUL-terminated string: terminate the probe so
           a multibyte codec cannot read past the end of the buffer. */
        ch[0] = (char)(unsigned char)i;
        ch[1] = '\0';
        res = mbstowcs(&wch, ch, 1);
        if (res != (size_t)-1) {
            /* decoding a non-ASCII character from the locale encoding
               succeeded: the locale encoding is not ASCII, force ASCII */
            return 1;
        }
    }
    /* None of the bytes in the range 0x80-0xff can be decoded from the locale
       encoding: the locale encoding is really ASCII */
    return 0;
#else
    /* nl_langinfo(CODESET) is not available: always force ASCII */
    return 1;
#endif

error:
    /* if an error occurred, force the ASCII encoding */
    return 1;
}
155+
156+
/* Encode a wide character string to ASCII with the surrogateescape error
   handler.

   Characters in U+0000-U+007F are copied as a single byte; characters in
   U+DC80-U+DCFF are written back as the bytes 0x80-0xff.  Every input
   character therefore yields exactly one output byte.

   On success, return a NUL-terminated buffer allocated with PyMem_Malloc();
   it must be released with PyMem_Free().  If error_pos is not NULL, it is set
   to (size_t)-1.

   On failure, return NULL.  If the failure is caused by a character that
   cannot be encoded, *error_pos (when error_pos is not NULL) is set to the
   index of that character; on memory allocation failure it is left at
   (size_t)-1. */
static char*
encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
{
    size_t length, pos;
    char *buffer;

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    length = wcslen(text);

    /* one byte per character, plus the trailing NUL byte */
    buffer = PyMem_Malloc(length + 1);
    if (buffer == NULL)
        return NULL;

    for (pos = 0; pos < length; pos++) {
        const wchar_t wc = text[pos];

        if (wc <= 0x7f) {
            /* ASCII character */
            buffer[pos] = (char)wc;
            continue;
        }
        if (wc >= 0xdc80 && wc <= 0xdcff) {
            /* UTF-8b surrogate: restore the original byte */
            buffer[pos] = (char)(wc - 0xdc00);
            continue;
        }

        /* neither ASCII nor an escaped byte: report where encoding failed */
        if (error_pos != NULL)
            *error_pos = pos;
        PyMem_Free(buffer);
        return NULL;
    }
    buffer[length] = '\0';
    return buffer;
}
194+
#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */
195+
196+
#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC))
197+
/* Decode a NUL-terminated byte string from ASCII with the surrogateescape
   error handler.

   Bytes below 128 are copied unchanged; every other byte b is decoded as the
   character 0xdc00 + b (U+DC80-U+DCFF).

   On success, return a NUL-terminated wide character buffer allocated with
   PyMem_Malloc(); if size is not NULL, *size receives the number of wide
   characters written, excluding the terminator.

   On memory allocation failure, return NULL; *size is left untouched. */
static wchar_t*
decode_ascii_surrogateescape(const char *arg, size_t *size)
{
    const unsigned char *src = (const unsigned char *)arg;
    wchar_t *buffer, *dst;

    /* one wide character per input byte, plus the terminator */
    buffer = PyMem_Malloc((strlen(arg) + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    for (dst = buffer; *src != 0; src++, dst++) {
        if (*src < 128) {
            *dst = *src;
        }
        else {
            /* undecodable byte: escape it as a lone surrogate */
            *dst = 0xdc00 + *src;
        }
    }
    *dst = 0;

    if (size != NULL)
        *size = dst - buffer;
    return buffer;
}
220+
#endif
221+
46222

47223
/* Decode a byte string from the locale encoding with the
48224
surrogateescape error handler (undecodable bytes are decoded as characters
@@ -76,20 +252,35 @@ _Py_char2wchar(const char* arg, size_t *size)
76252
return wstr;
77253
#else
78254
wchar_t *res;
255+
size_t argsize;
256+
size_t count;
257+
unsigned char *in;
258+
wchar_t *out;
259+
#ifdef HAVE_MBRTOWC
260+
mbstate_t mbs;
261+
#endif
262+
263+
#ifndef MS_WINDOWS
264+
if (force_ascii == -1)
265+
force_ascii = check_force_ascii();
266+
267+
if (force_ascii) {
268+
/* force ASCII encoding to workaround mbstowcs() issue */
269+
res = decode_ascii_surrogateescape(arg, size);
270+
if (res == NULL)
271+
goto oom;
272+
return res;
273+
}
274+
#endif
275+
79276
#ifdef HAVE_BROKEN_MBSTOWCS
80277
/* Some platforms have a broken implementation of
81278
* mbstowcs which does not count the characters that
82279
* would result from conversion. Use an upper bound.
83280
*/
84-
size_t argsize = strlen(arg);
281+
argsize = strlen(arg);
85282
#else
86-
size_t argsize = mbstowcs(NULL, arg, 0);
87-
#endif
88-
size_t count;
89-
unsigned char *in;
90-
wchar_t *out;
91-
#ifdef HAVE_MBRTOWC
92-
mbstate_t mbs;
283+
argsize = mbstowcs(NULL, arg, 0);
93284
#endif
94285
if (argsize != (size_t)-1) {
95286
res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t));
@@ -160,24 +351,16 @@ _Py_char2wchar(const char* arg, size_t *size)
160351
argsize -= converted;
161352
out++;
162353
}
354+
if (size != NULL)
355+
*size = out - res;
163356
#else /* HAVE_MBRTOWC */
164357
/* Cannot use C locale for escaping; manually escape as if charset
165358
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
166359
correctly in the locale's charset, which must be an ASCII superset. */
167-
res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
168-
if (!res)
360+
res = decode_ascii_surrogateescape(arg, size);
361+
if (res == NULL)
169362
goto oom;
170-
in = (unsigned char*)arg;
171-
out = res;
172-
while(*in)
173-
if(*in < 128)
174-
*out++ = *in++;
175-
else
176-
*out++ = 0xdc00 + *in++;
177-
*out = 0;
178363
#endif /* HAVE_MBRTOWC */
179-
if (size != NULL)
180-
*size = out - res;
181364
return res;
182365
oom:
183366
if (size != NULL)
@@ -236,6 +419,14 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos)
236419
size_t i, size, converted;
237420
wchar_t c, buf[2];
238421

422+
#ifndef MS_WINDOWS
423+
if (force_ascii == -1)
424+
force_ascii = check_force_ascii();
425+
426+
if (force_ascii)
427+
return encode_ascii_surrogateescape(text, error_pos);
428+
#endif
429+
239430
/* The function works in two steps:
240431
1. compute the length of the output buffer in bytes (size)
241432
2. outputs the bytes */
@@ -276,7 +467,7 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos)
276467
}
277468
}
278469
if (result != NULL) {
279-
*bytes = 0;
470+
*bytes = '\0';
280471
break;
281472
}
282473

@@ -320,6 +511,8 @@ _Py_wstat(const wchar_t* path, struct stat *buf)
320511
}
321512
#endif
322513

514+
#ifdef HAVE_STAT
515+
323516
/* Call _wstat() on Windows, or encode the path to the filesystem encoding and
324517
call stat() otherwise. Only fill st_mode attribute on Windows.
325518
@@ -352,6 +545,8 @@ _Py_stat(PyObject *path, struct stat *statbuf)
352545
#endif
353546
}
354547

548+
#endif
549+
355550
/* Open a file. Use _wfopen() on Windows, encode the path to the locale
356551
encoding and use fopen() otherwise. */
357552

@@ -533,4 +728,3 @@ _Py_wgetcwd(wchar_t *buf, size_t size)
533728
#endif
534729
}
535730

536-
#endif

0 commit comments

Comments
 (0)