@@ -134,6 +134,34 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
134
134
return m ;
135
135
}
136
136
137
/* Return the length of the longest prefix of buf[0..len) that does not
   include the last (possibly incomplete) multi-byte UTF-8 sequence.

   buf  points to at least `len` readable bytes.
   len  is the candidate chunk length in bytes.

   Result:
     - `len` unchanged when len is 0 or the last byte is a 1-byte
       (ASCII) character, so no sequence can be cut;
     - otherwise the offset of the first non-continuation byte found while
       scanning backwards from the end, so that byte and everything after
       it are left for the next chunk;
     - `len` unchanged on an encoding error (no lead byte found within the
       last 4 bytes) or when backing up would leave an empty prefix.
   The function therefore never returns 0 for a non-zero len. */
DWORD _find_last_utf8_sequence_boundary(char *buf, DWORD len) {
    DWORD backup_len = 1;
    if (len == 0) {
        return len;
    }
    if ((buf[len - backup_len] & 0x80) == 0) {
        /* Ok, last byte is 1-byte character, no need to back up */
        return len;
    }
    while ((buf[len - backup_len] & 0xc0) == 0x80) {
        /* Back up over the UTF-8 continuation bytes (10xxxxxx).  A
           sequence has at most 3 of them, and we must not walk off the
           front of the buffer. */
        if (backup_len >= len || backup_len >= 4) {
            /* Error, no UTF-8 sequence head found */
            return len;
        }
        backup_len++;
    }
    /* buf[len - backup_len] is now the first non-continuation byte, i.e.
       the head of the last sequence.  backup_len already counts it, so
       `len - backup_len` is the boundary immediately before the head;
       backing up any further would cut into the previous character. */
    if (backup_len >= len) {
        /* The last sequence starts at buf[0]; a zero-length chunk would
           be useless to the caller, so keep the original length. */
        return len;
    }
    return len - backup_len;
}
137
165
138
166
/*[clinic input]
139
167
module _io
@@ -975,7 +1003,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
975
1003
{
976
1004
BOOL res = TRUE;
977
1005
wchar_t * wbuf ;
978
- DWORD len , wlen , orig_len , n = 0 ;
1006
+ DWORD len , wlen , n = 0 ;
979
1007
HANDLE handle ;
980
1008
981
1009
if (self -> fd == -1 )
@@ -1007,21 +1035,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
1007
1035
have to reduce and recalculate. */
1008
1036
while (wlen > 32766 / sizeof (wchar_t )) {
1009
1037
len /= 2 ;
1010
- orig_len = len ;
1011
- /* Reduce the length until we hit the final byte of a UTF-8 sequence
1012
- * (top bit is unset). Fix for github issue 82052.
1013
- */
1014
- while (len > 0 && (((char * )b -> buf )[len - 1 ] & 0x80 ) != 0 )
1015
- -- len ;
1016
- /* If we hit a length of 0, something has gone wrong. This shouldn't
1017
- * be possible, as valid UTF-8 can have at most 3 non-final bytes
1018
- * before a final one, and our buffer is way longer than that.
1019
- * But to be on the safe side, if we hit this issue we just restore
1020
- * the original length and let the console API sort it out.
1021
- */
1022
- if (len == 0 ) {
1023
- len = orig_len ;
1024
- }
1038
+ /* Fix for github issues gh-110913 and gh-82052. */
1039
+ len = _find_last_utf8_sequence_boundary (b -> buf , len );
1025
1040
wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1026
1041
}
1027
1042
Py_END_ALLOW_THREADS
0 commit comments