Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b8a5110

Browse files
committed
Fix UTF-8 sequence boundary search
1 parent 0f9d0fb commit b8a5110

File tree

1 file changed

+18
-5
lines changed

1 file changed

+18
-5
lines changed

Modules/_io/winconsoleio.c

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
975975
{
976976
BOOL res = TRUE;
977977
wchar_t *wbuf;
978+
char *buf;
978979
DWORD len, wlen, orig_len, n = 0;
979980
HANDLE handle;
980981

@@ -1008,11 +1009,23 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10081009
while (wlen > 32766 / sizeof(wchar_t)) {
10091010
len /= 2;
10101011
orig_len = len;
1011-
/* Reduce the length until we hit the final byte of a UTF-8 sequence
1012-
* (top bit is unset). Fix for github issue 82052.
1013-
*/
1014-
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
1015-
--len;
1012+
/* Reduce the length until we find a UTF-8 sequence boundary.
1013+
Fix for GitHub issues gh-110913 and gh-82052.
1014+
If the last byte is not a 1-byte character, apply the workaround below.
1015+
*/
1016+
buf = (char *)b->buf;
1017+
if (len > 0 && (buf[len-1] & 0x80) != 0) {
1018+
while (len > 0 && (buf[len-1] & 0xc0) == 0x80) {
1019+
/* Trace back all the UTF-8 continuation bytes */
1020+
--len;
1021+
}
1022+
if (len > 0) {
1023+
/* Consume one more byte. If the encoding is correct, this
1024+
byte is the head of the last (potentially incomplete) UTF-8
1025+
sequence, which also needs to go into the next chunk. */
1026+
--len;
1027+
}
1028+
}
10161029
/* If we hit a length of 0, something has gone wrong. This shouldn't
10171030
* be possible, as valid UTF-8 can have at most 3 non-final bytes
10181031
* before a final one, and our buffer is way longer than that.

0 commit comments

Comments
 (0)