Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 709fb94

Browse files
committed
Fix UTF-8 sequence boundary search
1 parent 0f9d0fb commit 709fb94

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

Modules/_io/winconsoleio.c

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,34 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
134134
return m;
135135
}
136136

137+
/* Find the longest prefix of buf[0..len) that does not end in the middle of
   a UTF-8 sequence.

   buf  - bytes that are expected to be UTF-8 encoded; only read, never
          written.
   len  - number of bytes of buf to consider.

   Returns a length <= len.  When the buffer ends with a truncated multi-byte
   sequence (a lead byte followed by fewer continuation bytes than the lead
   byte announces), the returned length stops just before that lead byte so
   the whole sequence can go into the next chunk.

   The original len is returned unchanged when:
     - len is 0,
     - the last byte is a 1-byte (ASCII) character,
     - the last multi-byte sequence is already complete,
     - an encoding error is found (no lead byte within reach, or a byte that
       cannot start a sequence), or
     - trimming would leave nothing, so that a non-empty buffer never yields
       a result of 0. */
DWORD _find_last_utf8_sequence_boundary(const char *buf, DWORD len) {
    /* Number of bytes, counted from the end, that belong to the last
       sequence examined so far. */
    DWORD backup_len = 1;
    DWORD seq_len;
    unsigned char head;

    if (len == 0) {
        return len;
    }
    if ((buf[len - 1] & 0x80) == 0) {
        /* Ok, last byte is 1-byte character, no need to back up */
        return len;
    }
    while ((buf[len - backup_len] & 0xc0) == 0x80) {
        /* Back up all the UTF-8 continuation bytes (bit pattern 10xxxxxx) */
        if (backup_len >= len || backup_len >= 4) {
            /* Error, no UTF-8 sequence head found: either the buffer is
               exhausted or there are more continuation bytes in a row than
               any sequence can hold. */
            return len;
        }
        backup_len++;
    }

    /* buf[len - backup_len] is the first non-continuation byte from the end;
       backup_len already counts it, so no further increment is needed. */
    head = (unsigned char)buf[len - backup_len];
    if (head < 0xc0 || head >= 0xf8) {
        /* Error: continuation bytes preceded by an ASCII byte, or a byte
           that is not a lead byte of a 2-, 3- or 4-byte sequence. */
        return len;
    }

    /* Sequence length announced by the lead byte, judged by its bit pattern
       only (110xxxxx, 1110xxxx, 11110xxx). */
    if (head >= 0xf0) {
        seq_len = 4;
    }
    else if (head >= 0xe0) {
        seq_len = 3;
    }
    else {
        seq_len = 2;
    }

    if (backup_len >= seq_len) {
        /* The last sequence is complete (or has surplus continuation bytes,
           which is an encoding error): nothing to trim either way. */
        return len;
    }
    if (backup_len >= len) {
        /* Error, the truncated sequence is the whole buffer; trimming it
           would leave no bytes at all. */
        return len;
    }
    return len - backup_len;
}
137165

138166
/*[clinic input]
139167
module _io
@@ -975,7 +1003,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
9751003
{
9761004
BOOL res = TRUE;
9771005
wchar_t *wbuf;
978-
DWORD len, wlen, orig_len, n = 0;
1006+
DWORD len, wlen, n = 0;
9791007
HANDLE handle;
9801008

9811009
if (self->fd == -1)
@@ -1007,21 +1035,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10071035
have to reduce and recalculate. */
10081036
while (wlen > 32766 / sizeof(wchar_t)) {
10091037
len /= 2;
1010-
orig_len = len;
1011-
/* Reduce the length until we hit the final byte of a UTF-8 sequence
1012-
* (top bit is unset). Fix for github issue 82052.
1013-
*/
1014-
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
1015-
--len;
1016-
/* If we hit a length of 0, something has gone wrong. This shouldn't
1017-
* be possible, as valid UTF-8 can have at most 3 non-final bytes
1018-
* before a final one, and our buffer is way longer than that.
1019-
* But to be on the safe side, if we hit this issue we just restore
1020-
* the original length and let the console API sort it out.
1021-
*/
1022-
if (len == 0) {
1023-
len = orig_len;
1024-
}
1038+
/* Fix for github issues gh-110913 and gh-82052. */
1039+
len = _find_last_utf8_sequence_boundary(b->buf, len);
10251040
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
10261041
}
10271042
Py_END_ALLOW_THREADS

0 commit comments

Comments
 (0)