@@ -33,6 +33,13 @@ extern int winerror_to_errno(int);
33
33
int _Py_open_cloexec_works = -1 ;
34
34
#endif
35
35
36
+ // The value must be the same in unicodeobject.c.
37
+ #define MAX_UNICODE 0x10ffff
38
+
39
+ // mbstowcs() and mbrtowc() errors
40
+ static const size_t DECODE_ERROR = ((size_t )-1 );
41
+ static const size_t INCOMPLETE_CHARACTER = (size_t )-2 ;
42
+
36
43
37
44
static int
38
45
get_surrogateescape (_Py_error_handler errors , int * surrogateescape )
@@ -85,6 +92,57 @@ _Py_device_encoding(int fd)
85
92
Py_RETURN_NONE ;
86
93
}
87
94
95
+
96
+ static size_t
97
+ is_valid_wide_char (wchar_t ch )
98
+ {
99
+ if (Py_UNICODE_IS_SURROGATE (ch )) {
100
+ // Reject lone surrogate characters
101
+ return 0 ;
102
+ }
103
+ if (ch > MAX_UNICODE ) {
104
+ // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
105
+ // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
106
+ // it creates characters outside the [U+0000; U+10ffff] range:
107
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
108
+ return 0 ;
109
+ }
110
+ return 1 ;
111
+ }
112
+
113
+
114
+ static size_t
115
+ _Py_mbstowcs (wchar_t * dest , const char * src , size_t n )
116
+ {
117
+ size_t count = mbstowcs (dest , src , n );
118
+ if (dest != NULL && count != DECODE_ERROR ) {
119
+ for (size_t i = 0 ; i < count ; i ++ ) {
120
+ wchar_t ch = dest [i ];
121
+ if (!is_valid_wide_char (ch )) {
122
+ return DECODE_ERROR ;
123
+ }
124
+ }
125
+ }
126
+ return count ;
127
+ }
128
+
129
+
130
+ #ifdef HAVE_MBRTOWC
131
+ static size_t
132
+ _Py_mbrtowc (wchar_t * pwc , const char * str , size_t len , mbstate_t * pmbs )
133
+ {
134
+ assert (pwc != NULL );
135
+ size_t count = mbrtowc (pwc , str , len , pmbs );
136
+ if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER ) {
137
+ if (!is_valid_wide_char (* pwc )) {
138
+ return DECODE_ERROR ;
139
+ }
140
+ }
141
+ return count ;
142
+ }
143
+ #endif
144
+
145
+
88
146
#if !defined(_Py_FORCE_UTF8_FS_ENCODING ) && !defined(MS_WINDOWS )
89
147
90
148
#define USE_FORCE_ASCII
@@ -151,8 +209,8 @@ check_force_ascii(void)
151
209
size_t res ;
152
210
153
211
ch = (unsigned char )0xA7 ;
154
- res = mbstowcs (& wch , (char * )& ch , 1 );
155
- if (res != ( size_t ) -1 && wch == L'\xA7' ) {
212
+ res = _Py_mbstowcs (& wch , (char * )& ch , 1 );
213
+ if (res != DECODE_ERROR && wch == L'\xA7' ) {
156
214
/* On HP-UX with the C locale or the POSIX locale,
157
215
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
158
216
Latin1 encoding in practice. Force ASCII in this case.
@@ -199,8 +257,8 @@ check_force_ascii(void)
199
257
200
258
unsigned uch = (unsigned char )i ;
201
259
ch [0 ] = (char )uch ;
202
- res = mbstowcs (wch , ch , 1 );
203
- if (res != ( size_t ) -1 ) {
260
+ res = _Py_mbstowcs (wch , ch , 1 );
261
+ if (res != DECODE_ERROR ) {
204
262
/* decoding a non-ASCII character from the locale encoding succeeded:
205
263
the locale encoding is not ASCII, force ASCII */
206
264
return 1 ;
@@ -390,9 +448,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
390
448
*/
391
449
argsize = strlen (arg );
392
450
#else
393
- argsize = mbstowcs (NULL , arg , 0 );
451
+ argsize = _Py_mbstowcs (NULL , arg , 0 );
394
452
#endif
395
- if (argsize != ( size_t ) -1 ) {
453
+ if (argsize != DECODE_ERROR ) {
396
454
if (argsize > PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
397
455
return -1 ;
398
456
}
@@ -401,21 +459,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
401
459
return -1 ;
402
460
}
403
461
404
- count = mbstowcs (res , arg , argsize + 1 );
405
- if (count != (size_t )-1 ) {
406
- wchar_t * tmp ;
407
- /* Only use the result if it contains no
408
- surrogate characters. */
409
- for (tmp = res ; * tmp != 0 &&
410
- !Py_UNICODE_IS_SURROGATE (* tmp ); tmp ++ )
411
- ;
412
- if (* tmp == 0 ) {
413
- if (wlen != NULL ) {
414
- * wlen = count ;
415
- }
416
- * wstr = res ;
417
- return 0 ;
462
+ count = _Py_mbstowcs (res , arg , argsize + 1 );
463
+ if (count != DECODE_ERROR ) {
464
+ * wstr = res ;
465
+ if (wlen != NULL ) {
466
+ * wlen = count ;
418
467
}
468
+ return 0 ;
419
469
}
420
470
PyMem_RawFree (res );
421
471
}
@@ -439,46 +489,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
439
489
out = res ;
440
490
memset (& mbs , 0 , sizeof mbs );
441
491
while (argsize ) {
442
- size_t converted = mbrtowc (out , (char * )in , argsize , & mbs );
492
+ size_t converted = _Py_mbrtowc (out , (char * )in , argsize , & mbs );
443
493
if (converted == 0 ) {
444
494
/* Reached end of string; null char stored. */
445
495
break ;
446
496
}
447
497
448
- if (converted == ( size_t ) -2 ) {
498
+ if (converted == INCOMPLETE_CHARACTER ) {
449
499
/* Incomplete character. This should never happen,
450
500
since we provide everything that we have -
451
501
unless there is a bug in the C library, or I
452
502
misunderstood how mbrtowc works. */
453
503
goto decode_error ;
454
504
}
455
505
456
- if (converted == ( size_t ) -1 ) {
506
+ if (converted == DECODE_ERROR ) {
457
507
if (!surrogateescape ) {
458
508
goto decode_error ;
459
509
}
460
510
461
- /* Conversion error. Escape as UTF-8b, and start over
462
- in the initial shift state. */
511
+ /* Decoding error. Escape as UTF-8b, and start over in the initial
512
+ shift state. */
463
513
* out ++ = 0xdc00 + * in ++ ;
464
514
argsize -- ;
465
515
memset (& mbs , 0 , sizeof mbs );
466
516
continue ;
467
517
}
468
518
469
- if (Py_UNICODE_IS_SURROGATE (* out )) {
470
- if (!surrogateescape ) {
471
- goto decode_error ;
472
- }
519
+ // _Py_mbrtowc() rejects lone surrogate characters
520
+ assert (!Py_UNICODE_IS_SURROGATE (* out ));
473
521
474
- /* Surrogate character. Escape the original
475
- byte sequence with surrogateescape. */
476
- argsize -= converted ;
477
- while (converted -- ) {
478
- * out ++ = 0xdc00 + * in ++ ;
479
- }
480
- continue ;
481
- }
482
522
/* successfully converted some bytes */
483
523
in += converted ;
484
524
argsize -= converted ;
@@ -655,7 +695,7 @@ encode_current_locale(const wchar_t *text, char **str,
655
695
else {
656
696
converted = wcstombs (NULL , buf , 0 );
657
697
}
658
- if (converted == ( size_t ) -1 ) {
698
+ if (converted == DECODE_ERROR ) {
659
699
goto encode_error ;
660
700
}
661
701
if (bytes != NULL ) {
@@ -1371,7 +1411,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
1371
1411
char cmode [10 ];
1372
1412
size_t r ;
1373
1413
r = wcstombs (cmode , mode , 10 );
1374
- if (r == ( size_t ) -1 || r >= 10 ) {
1414
+ if (r == DECODE_ERROR || r >= 10 ) {
1375
1415
errno = EINVAL ;
1376
1416
return NULL ;
1377
1417
}
0 commit comments