@@ -34,6 +34,13 @@ extern int winerror_to_errno(int);
34
34
int _Py_open_cloexec_works = -1 ;
35
35
#endif
36
36
37
+ // The value must be the same in unicodeobject.c.
38
+ #define MAX_UNICODE 0x10ffff
39
+
40
+ // mbstowcs() and mbrtowc() errors
41
+ static const size_t DECODE_ERROR = ((size_t )-1 );
42
+ static const size_t INCOMPLETE_CHARACTER = (size_t )-2 ;
43
+
37
44
38
45
static int
39
46
get_surrogateescape (_Py_error_handler errors , int * surrogateescape )
@@ -82,6 +89,57 @@ _Py_device_encoding(int fd)
82
89
#endif
83
90
}
84
91
92
+
93
+ static size_t
94
+ is_valid_wide_char (wchar_t ch )
95
+ {
96
+ if (Py_UNICODE_IS_SURROGATE (ch )) {
97
+ // Reject lone surrogate characters
98
+ return 0 ;
99
+ }
100
+ if (ch > MAX_UNICODE ) {
101
+ // bpo-35883: Reject characters outside [U+0000; U+10ffff] range.
102
+ // The glibc mbstowcs() UTF-8 decoder does not respect the RFC 3629,
103
+ // it creates characters outside the [U+0000; U+10ffff] range:
104
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=2373
105
+ return 0 ;
106
+ }
107
+ return 1 ;
108
+ }
109
+
110
+
111
+ static size_t
112
+ _Py_mbstowcs (wchar_t * dest , const char * src , size_t n )
113
+ {
114
+ size_t count = mbstowcs (dest , src , n );
115
+ if (dest != NULL && count != DECODE_ERROR ) {
116
+ for (size_t i = 0 ; i < count ; i ++ ) {
117
+ wchar_t ch = dest [i ];
118
+ if (!is_valid_wide_char (ch )) {
119
+ return DECODE_ERROR ;
120
+ }
121
+ }
122
+ }
123
+ return count ;
124
+ }
125
+
126
+
127
+ #ifdef HAVE_MBRTOWC
128
+ static size_t
129
+ _Py_mbrtowc (wchar_t * pwc , const char * str , size_t len , mbstate_t * pmbs )
130
+ {
131
+ assert (pwc != NULL );
132
+ size_t count = mbrtowc (pwc , str , len , pmbs );
133
+ if (count != 0 && count != DECODE_ERROR && count != INCOMPLETE_CHARACTER ) {
134
+ if (!is_valid_wide_char (* pwc )) {
135
+ return DECODE_ERROR ;
136
+ }
137
+ }
138
+ return count ;
139
+ }
140
+ #endif
141
+
142
+
85
143
#if !defined(_Py_FORCE_UTF8_FS_ENCODING ) && !defined(MS_WINDOWS )
86
144
87
145
#define USE_FORCE_ASCII
@@ -148,8 +206,8 @@ check_force_ascii(void)
148
206
size_t res ;
149
207
150
208
ch = (unsigned char )0xA7 ;
151
- res = mbstowcs (& wch , (char * )& ch , 1 );
152
- if (res != ( size_t ) -1 && wch == L'\xA7' ) {
209
+ res = _Py_mbstowcs (& wch , (char * )& ch , 1 );
210
+ if (res != DECODE_ERROR && wch == L'\xA7' ) {
153
211
/* On HP-UX with the C locale or the POSIX locale,
154
212
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
155
213
Latin1 encoding in practice. Force ASCII in this case.
@@ -196,8 +254,8 @@ check_force_ascii(void)
196
254
197
255
unsigned uch = (unsigned char )i ;
198
256
ch [0 ] = (char )uch ;
199
- res = mbstowcs (wch , ch , 1 );
200
- if (res != ( size_t ) -1 ) {
257
+ res = _Py_mbstowcs (wch , ch , 1 );
258
+ if (res != DECODE_ERROR ) {
201
259
/* decoding a non-ASCII character from the locale encoding succeeded:
202
260
the locale encoding is not ASCII, force ASCII */
203
261
return 1 ;
@@ -387,9 +445,9 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
387
445
*/
388
446
argsize = strlen (arg );
389
447
#else
390
- argsize = mbstowcs (NULL , arg , 0 );
448
+ argsize = _Py_mbstowcs (NULL , arg , 0 );
391
449
#endif
392
- if (argsize != ( size_t ) -1 ) {
450
+ if (argsize != DECODE_ERROR ) {
393
451
if (argsize > PY_SSIZE_T_MAX / sizeof (wchar_t ) - 1 ) {
394
452
return -1 ;
395
453
}
@@ -398,21 +456,13 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
398
456
return -1 ;
399
457
}
400
458
401
- count = mbstowcs (res , arg , argsize + 1 );
402
- if (count != (size_t )-1 ) {
403
- wchar_t * tmp ;
404
- /* Only use the result if it contains no
405
- surrogate characters. */
406
- for (tmp = res ; * tmp != 0 &&
407
- !Py_UNICODE_IS_SURROGATE (* tmp ); tmp ++ )
408
- ;
409
- if (* tmp == 0 ) {
410
- if (wlen != NULL ) {
411
- * wlen = count ;
412
- }
413
- * wstr = res ;
414
- return 0 ;
459
+ count = _Py_mbstowcs (res , arg , argsize + 1 );
460
+ if (count != DECODE_ERROR ) {
461
+ * wstr = res ;
462
+ if (wlen != NULL ) {
463
+ * wlen = count ;
415
464
}
465
+ return 0 ;
416
466
}
417
467
PyMem_RawFree (res );
418
468
}
@@ -436,46 +486,36 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
436
486
out = res ;
437
487
memset (& mbs , 0 , sizeof mbs );
438
488
while (argsize ) {
439
- size_t converted = mbrtowc (out , (char * )in , argsize , & mbs );
489
+ size_t converted = _Py_mbrtowc (out , (char * )in , argsize , & mbs );
440
490
if (converted == 0 ) {
441
491
/* Reached end of string; null char stored. */
442
492
break ;
443
493
}
444
494
445
- if (converted == ( size_t ) -2 ) {
495
+ if (converted == INCOMPLETE_CHARACTER ) {
446
496
/* Incomplete character. This should never happen,
447
497
since we provide everything that we have -
448
498
unless there is a bug in the C library, or I
449
499
misunderstood how mbrtowc works. */
450
500
goto decode_error ;
451
501
}
452
502
453
- if (converted == ( size_t ) -1 ) {
503
+ if (converted == DECODE_ERROR ) {
454
504
if (!surrogateescape ) {
455
505
goto decode_error ;
456
506
}
457
507
458
- /* Conversion error. Escape as UTF-8b, and start over
459
- in the initial shift state. */
508
+ /* Decoding error. Escape as UTF-8b, and start over in the initial
509
+ shift state. */
460
510
* out ++ = 0xdc00 + * in ++ ;
461
511
argsize -- ;
462
512
memset (& mbs , 0 , sizeof mbs );
463
513
continue ;
464
514
}
465
515
466
- if (Py_UNICODE_IS_SURROGATE (* out )) {
467
- if (!surrogateescape ) {
468
- goto decode_error ;
469
- }
516
+ // _Py_mbrtowc() rejects lone surrogate characters
517
+ assert (!Py_UNICODE_IS_SURROGATE (* out ));
470
518
471
- /* Surrogate character. Escape the original
472
- byte sequence with surrogateescape. */
473
- argsize -= converted ;
474
- while (converted -- ) {
475
- * out ++ = 0xdc00 + * in ++ ;
476
- }
477
- continue ;
478
- }
479
519
/* successfully converted some bytes */
480
520
in += converted ;
481
521
argsize -= converted ;
@@ -652,7 +692,7 @@ encode_current_locale(const wchar_t *text, char **str,
652
692
else {
653
693
converted = wcstombs (NULL , buf , 0 );
654
694
}
655
- if (converted == ( size_t ) -1 ) {
695
+ if (converted == DECODE_ERROR ) {
656
696
goto encode_error ;
657
697
}
658
698
if (bytes != NULL ) {
@@ -1440,7 +1480,7 @@ _Py_wfopen(const wchar_t *path, const wchar_t *mode)
1440
1480
char cmode [10 ];
1441
1481
size_t r ;
1442
1482
r = wcstombs (cmode , mode , 10 );
1443
- if (r == ( size_t ) -1 || r >= 10 ) {
1483
+ if (r == DECODE_ERROR || r >= 10 ) {
1444
1484
errno = EINVAL ;
1445
1485
return NULL ;
1446
1486
}
0 commit comments