@@ -975,6 +975,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
975
975
{
976
976
BOOL res = TRUE;
977
977
wchar_t * wbuf ;
978
+ char * buf ;
978
979
DWORD len , wlen , orig_len , n = 0 ;
979
980
HANDLE handle ;
980
981
@@ -1008,11 +1009,23 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
1008
1009
while (wlen > 32766 / sizeof (wchar_t )) {
1009
1010
len /= 2 ;
1010
1011
orig_len = len ;
1011
- /* Reduce the length until we hit the final byte of a UTF-8 sequence
1012
- * (top bit is unset). Fix for github issue 82052.
1013
- */
1014
- while (len > 0 && (((char * )b -> buf )[len - 1 ] & 0x80 ) != 0 )
1015
- -- len ;
1012
+ /* Reduce the length until we find a UTF-8 sequence boundary.
1013
+ Fix for github issue gh-110913 and gh-82052.
1014
+ If the last byte was not a 1-byte character, enter the workaround
1015
+ */
1016
+ buf = (char * )b -> buf ;
1017
+ if (len > 0 && (buf [len - 1 ] & 0x80 ) != 0 ) {
1018
+ while (len > 0 && (buf [len - 1 ] & 0xc0 ) == 0x80 ) {
1019
+ /* Trace back all the UTF-8 continuation bytes */
1020
+ -- len ;
1021
+ }
1022
+ if (len > 0 ) {
1023
+ /* Consume one more byte. If the encoding is correct, this
1024
+ byte is the head of the last (potentially incomplete) UTF-8
1025
+ sequence, which must also be deferred to the next chunk. */
1026
+ -- len ;
1027
+ }
1028
+ }
1016
1029
/* If we hit a length of 0, something has gone wrong. This shouldn't
1017
1030
* be possible, as valid UTF-8 can have at most 3 non-final bytes
1018
1031
* before a final one, and our buffer is way longer than that.
0 commit comments