|
5 | 5 | #endif |
6 | 6 |
|
7 | 7 | #ifdef HAVE_LANGINFO_H |
| 8 | +#include <locale.h> |
8 | 9 | #include <langinfo.h> |
9 | 10 | #endif |
10 | 11 |
|
@@ -42,7 +43,182 @@ _Py_device_encoding(int fd) |
42 | 43 | Py_RETURN_NONE; |
43 | 44 | } |
44 | 45 |
|
45 | | -#ifdef HAVE_STAT |
| 46 | +#if !defined(__APPLE__) && !defined(MS_WINDOWS) |
| 47 | +extern int _Py_normalize_encoding(const char *, char *, size_t); |
| 48 | + |
| 49 | +/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale. |
| 50 | + On these operating systems, nl_langinfo(CODESET) announces an alias of the |
| 51 | + ASCII encoding, whereas mbstowcs() and wcstombs() functions use the |
| 52 | + ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use |
| 53 | + locale.getpreferredencoding() codec. For example, if command line arguments |
| 54 | + are decoded by mbstowcs() and encoded back by os.fsencode(), we get a |
| 55 | + UnicodeEncodeError instead of retrieving the original byte string. |
| 56 | +
|
| 57 | + The workaround is enabled if setlocale(LC_CTYPE, NULL) returns "C", |
| 58 | + nl_langinfo(CODESET) announces "ascii" (or an alias to ASCII), and at least |
| 59 | + one byte in range 0x80-0xff can be decoded from the locale encoding. The |
| 60 | + workaround is also enabled on error, for example if getting the locale |
| 61 | + failed. |
| 62 | +
|
| 63 | + Values of force_ascii: |
| 64 | +
|
| 65 | + 1: the workaround is used: _Py_wchar2char() uses |
| 66 | + encode_ascii_surrogateescape() and _Py_char2wchar() uses |
| 67 | + decode_ascii_surrogateescape() |
| 68 | + 0: the workaround is not used: _Py_wchar2char() uses wcstombs() and |
| 69 | + _Py_char2wchar() uses mbstowcs() |
| 70 | + -1: unknown, need to call check_force_ascii() to get the value |
| 71 | +*/ |
| 72 | +static int force_ascii = -1; |
| 73 | + |
/* Decide whether the ASCII + surrogateescape workaround must replace
   mbstowcs()/wcstombs() for the current LC_CTYPE locale.

   Return 1 if ASCII must be forced: the locale is "C", nl_langinfo(CODESET)
   announces ASCII (or one of its aliases) and yet at least one byte in the
   range 0x80-0xff can be decoded by mbstowcs().  Also return 1 on any error
   and when nl_langinfo(CODESET) is not available.

   Return 0 if the LC_CTYPE locale is not "C", if the announced codeset is
   not ASCII, or if the locale encoding really rejects every non-ASCII
   byte. */
static int
check_force_ascii(void)
{
    const char *loc;
#if defined(HAVE_LANGINFO_H) && defined(CODESET)
    const char *codeset;
    const char * const *alias;
    char encoding[100];
    int is_ascii;
    unsigned int i;
    /* Compared against the output of _Py_normalize_encoding() */
    static const char * const ascii_aliases[] = {
        "ascii",
        "646",
        "ansi-x3.4-1968",
        "ansi-x3-4-1968",
        "ansi-x3.4-1986",
        "cp367",
        "csascii",
        "ibm367",
        "iso646-us",
        "iso-646.irv-1991",
        "iso-ir-6",
        "us",
        "us-ascii",
        NULL
    };
#endif

    /* Query (do not modify) the current LC_CTYPE locale */
    loc = setlocale(LC_CTYPE, NULL);
    if (loc == NULL)
        goto error;
    if (strcmp(loc, "C") != 0) {
        /* the LC_CTYPE locale is different than C */
        return 0;
    }

#if defined(HAVE_LANGINFO_H) && defined(CODESET)
    codeset = nl_langinfo(CODESET);
    if (!codeset || codeset[0] == '\0') {
        /* CODESET is not set or empty */
        goto error;
    }
    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
        goto error;

    is_ascii = 0;
    for (alias = ascii_aliases; *alias != NULL; alias++) {
        if (strcmp(encoding, *alias) == 0) {
            is_ascii = 1;
            break;
        }
    }
    if (!is_ascii) {
        /* nl_langinfo(CODESET) is not "ascii" or an alias of ASCII */
        return 0;
    }

    /* Probe every non-ASCII byte, 0xff included */
    for (i = 0x80; i <= 0xff; i++) {
        /* mbstowcs() expects a NUL-terminated string: terminate the
           one-byte input so the decoder never reads past the buffer */
        char ch[2];
        wchar_t wch;
        size_t res;

        ch[0] = (char)(unsigned char)i;
        ch[1] = '\0';
        res = mbstowcs(&wch, ch, 1);
        if (res != (size_t)-1) {
            /* decoding a non-ASCII character from the locale encoding
               succeeded: the locale encoding is not ASCII, force ASCII */
            return 1;
        }
    }
    /* None of the bytes in the range 0x80-0xff can be decoded from the locale
       encoding: the locale encoding is really ASCII */
    return 0;
#else
    /* nl_langinfo(CODESET) is not available: always force ASCII */
    return 1;
#endif

error:
    /* if an error occurred, force the ASCII encoding */
    return 1;
}
| 155 | + |
/* Encode a wide character string to ASCII with the surrogateescape error
   handler: characters U+0000-U+007F are copied as single bytes and
   characters U+DC80-U+DCFF are converted back to the bytes 0x80-0xFF.

   Return a NUL-terminated byte string allocated by PyMem_Malloc(); the
   caller must release it with PyMem_Free().

   Return NULL on memory allocation failure or if a character cannot be
   encoded.  If error_pos is not NULL, *error_pos is set to (size_t)-1 on
   success and on allocation failure, and to the index of the offending
   character on encoding error. */
static char*
encode_ascii_surrogateescape(const wchar_t *text, size_t *error_pos)
{
    char *result, *out;
    size_t len, i;
    wchar_t ch;

    if (error_pos != NULL)
        *error_pos = (size_t)-1;

    len = wcslen(text);

    result = PyMem_Malloc(len + 1);  /* +1 for NUL byte */
    if (result == NULL)
        return NULL;

    out = result;
    for (i = 0; i < len; i++) {
        ch = text[i];

        /* wchar_t may be a signed type: test the bits above the low seven
           instead of "ch <= 0x7f", so that negative values are rejected
           rather than truncated to a byte */
        if ((ch & ~(wchar_t)0x7f) == 0) {
            /* ASCII character */
            *out++ = (char)ch;
        }
        else if (0xdc80 <= ch && ch <= 0xdcff) {
            /* UTF-8b surrogate: restore the original undecodable byte */
            *out++ = (char)(ch - 0xdc00);
        }
        else {
            /* character not encodable to ASCII */
            if (error_pos != NULL)
                *error_pos = i;
            PyMem_Free(result);
            return NULL;
        }
    }
    *out = '\0';
    return result;
}
| 194 | +#endif /* !defined(__APPLE__) && !defined(MS_WINDOWS) */ |
| 195 | + |
| 196 | +#if !defined(__APPLE__) && (!defined(MS_WINDOWS) || !defined(HAVE_MBRTOWC)) |
/* Decode a NUL-terminated byte string from ASCII with the surrogateescape
   error handler: bytes 0x00-0x7F are copied as is, bytes 0x80-0xFF are
   decoded as the characters U+DC80-U+DCFF.

   Return a NUL-terminated wide character string allocated by
   PyMem_Malloc(); the caller must release it with PyMem_Free().  If size
   is not NULL, *size is set to the number of wide characters written,
   excluding the terminating null character.

   Return NULL if the buffer cannot be allocated (including when its size
   would overflow size_t); *size is left untouched in that case. */
static wchar_t*
decode_ascii_surrogateescape(const char *arg, size_t *size)
{
    const unsigned char *in = (const unsigned char *)arg;
    size_t len = strlen(arg);
    wchar_t *res;
    wchar_t *out;

    /* (len + 1) * sizeof(wchar_t) must not wrap around */
    if (len >= (size_t)-1 / sizeof(wchar_t))
        return NULL;

    res = PyMem_Malloc((len + 1) * sizeof(wchar_t));
    if (res == NULL)
        return NULL;

    out = res;
    while (*in) {
        if (*in < 128) {
            /* ASCII byte */
            *out++ = *in++;
        }
        else {
            /* undecodable byte: escape it as a lone surrogate */
            *out++ = 0xdc00 + *in++;
        }
    }
    *out = 0;
    if (size != NULL)
        *size = out - res;
    return res;
}
| 220 | +#endif |
| 221 | + |
46 | 222 |
|
47 | 223 | /* Decode a byte string from the locale encoding with the |
48 | 224 | surrogateescape error handler (undecodable bytes are decoded as characters |
@@ -76,20 +252,35 @@ _Py_char2wchar(const char* arg, size_t *size) |
76 | 252 | return wstr; |
77 | 253 | #else |
78 | 254 | wchar_t *res; |
| 255 | + size_t argsize; |
| 256 | + size_t count; |
| 257 | + unsigned char *in; |
| 258 | + wchar_t *out; |
| 259 | +#ifdef HAVE_MBRTOWC |
| 260 | + mbstate_t mbs; |
| 261 | +#endif |
| 262 | + |
| 263 | +#ifndef MS_WINDOWS |
| 264 | + if (force_ascii == -1) |
| 265 | + force_ascii = check_force_ascii(); |
| 266 | + |
| 267 | + if (force_ascii) { |
| 268 | + /* force ASCII encoding to workaround mbstowcs() issue */ |
| 269 | + res = decode_ascii_surrogateescape(arg, size); |
| 270 | + if (res == NULL) |
| 271 | + goto oom; |
| 272 | + return res; |
| 273 | + } |
| 274 | +#endif |
| 275 | + |
79 | 276 | #ifdef HAVE_BROKEN_MBSTOWCS |
80 | 277 | /* Some platforms have a broken implementation of |
81 | 278 | * mbstowcs which does not count the characters that |
82 | 279 | * would result from conversion. Use an upper bound. |
83 | 280 | */ |
84 | | - size_t argsize = strlen(arg); |
| 281 | + argsize = strlen(arg); |
85 | 282 | #else |
86 | | - size_t argsize = mbstowcs(NULL, arg, 0); |
87 | | -#endif |
88 | | - size_t count; |
89 | | - unsigned char *in; |
90 | | - wchar_t *out; |
91 | | -#ifdef HAVE_MBRTOWC |
92 | | - mbstate_t mbs; |
| 283 | + argsize = mbstowcs(NULL, arg, 0); |
93 | 284 | #endif |
94 | 285 | if (argsize != (size_t)-1) { |
95 | 286 | res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); |
@@ -160,24 +351,16 @@ _Py_char2wchar(const char* arg, size_t *size) |
160 | 351 | argsize -= converted; |
161 | 352 | out++; |
162 | 353 | } |
| 354 | + if (size != NULL) |
| 355 | + *size = out - res; |
163 | 356 | #else /* HAVE_MBRTOWC */ |
164 | 357 | /* Cannot use C locale for escaping; manually escape as if charset |
165 | 358 | is ASCII (i.e. escape all bytes > 128. This will still roundtrip |
166 | 359 | correctly in the locale's charset, which must be an ASCII superset. */ |
167 | | - res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); |
168 | | - if (!res) |
| 360 | + res = decode_ascii_surrogateescape(arg, size); |
| 361 | + if (res == NULL) |
169 | 362 | goto oom; |
170 | | - in = (unsigned char*)arg; |
171 | | - out = res; |
172 | | - while(*in) |
173 | | - if(*in < 128) |
174 | | - *out++ = *in++; |
175 | | - else |
176 | | - *out++ = 0xdc00 + *in++; |
177 | | - *out = 0; |
178 | 363 | #endif /* HAVE_MBRTOWC */ |
179 | | - if (size != NULL) |
180 | | - *size = out - res; |
181 | 364 | return res; |
182 | 365 | oom: |
183 | 366 | if (size != NULL) |
@@ -236,6 +419,14 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos) |
236 | 419 | size_t i, size, converted; |
237 | 420 | wchar_t c, buf[2]; |
238 | 421 |
|
| 422 | +#ifndef MS_WINDOWS |
| 423 | + if (force_ascii == -1) |
| 424 | + force_ascii = check_force_ascii(); |
| 425 | + |
| 426 | + if (force_ascii) |
| 427 | + return encode_ascii_surrogateescape(text, error_pos); |
| 428 | +#endif |
| 429 | + |
239 | 430 | /* The function works in two steps: |
240 | 431 | 1. compute the length of the output buffer in bytes (size) |
241 | 432 | 2. outputs the bytes */ |
@@ -276,7 +467,7 @@ _Py_wchar2char(const wchar_t *text, size_t *error_pos) |
276 | 467 | } |
277 | 468 | } |
278 | 469 | if (result != NULL) { |
279 | | - *bytes = 0; |
| 470 | + *bytes = '\0'; |
280 | 471 | break; |
281 | 472 | } |
282 | 473 |
|
@@ -320,6 +511,8 @@ _Py_wstat(const wchar_t* path, struct stat *buf) |
320 | 511 | } |
321 | 512 | #endif |
322 | 513 |
|
| 514 | +#ifdef HAVE_STAT |
| 515 | + |
323 | 516 | /* Call _wstat() on Windows, or encode the path to the filesystem encoding and |
324 | 517 | call stat() otherwise. Only fill st_mode attribute on Windows. |
325 | 518 |
|
@@ -352,6 +545,8 @@ _Py_stat(PyObject *path, struct stat *statbuf) |
352 | 545 | #endif |
353 | 546 | } |
354 | 547 |
|
| 548 | +#endif |
| 549 | + |
355 | 550 | /* Open a file. Use _wfopen() on Windows, encode the path to the locale |
356 | 551 | encoding and use fopen() otherwise. */ |
357 | 552 |
|
@@ -533,4 +728,3 @@ _Py_wgetcwd(wchar_t *buf, size_t size) |
533 | 728 | #endif |
534 | 729 | } |
535 | 730 |
|
536 | | -#endif |
|
0 commit comments