Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 69bcaf7

Browse files
gh-110913: Fix WindowsConsoleIO chunking of UTF-8 text (GH-111007)
(cherry picked from commit 11312ea) Co-authored-by: Tamás Hegedűs <[email protected]>
1 parent 6df935c commit 69bcaf7

File tree

2 files changed

+21
-16
lines changed

2 files changed

+21
-16
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences.

Modules/_io/winconsoleio.c

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,23 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
132132
return m;
133133
}
134134

135+
static DWORD
_find_last_utf8_boundary(const char *buf, DWORD len)
{
    /* Return a length <= len at which buf can be cut without leaving the
     * start of a multi-byte UTF-8 sequence dangling at the end.
     *
     * buf: bytes to inspect (only the last few bytes before len are read).
     * len: candidate length in bytes.
     *
     * The result is never 0 for a non-zero len: whenever no usable cut
     * point is found, the original len is handed back unchanged.
     */

    /* Nothing to do for an empty buffer, or when the final byte has its
     * top bit clear (a single-byte character). */
    if (len == 0 || !(buf[len - 1] & 0x80)) {
        return len;
    }

    /* Walk backwards over at most three trailing bytes.  The walk stops
     * before index 0 (back < len), so len - back is always at least 1. */
    for (DWORD back = 1; back <= 3 && back < len; back++) {
        if ((buf[len - back] & 0xc0) != 0x80) {
            /* Not a continuation byte (10xxxxxx): cut right before it,
             * even if the sequence it starts happens to be complete. */
            return len - back;
        }
    }

    /* Only continuation bytes were seen within the window (e.g. the tail
     * of a complete 4-byte sequence), or the start of the buffer was
     * reached: keep the length as it is. */
    return len;
}
135152

136153
/*[clinic input]
137154
module _io
@@ -954,7 +971,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
954971
{
955972
BOOL res = TRUE;
956973
wchar_t *wbuf;
957-
DWORD len, wlen, orig_len, n = 0;
974+
DWORD len, wlen, n = 0;
958975
HANDLE handle;
959976

960977
if (self->fd == -1)
@@ -984,21 +1001,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
9841001
have to reduce and recalculate. */
9851002
while (wlen > 32766 / sizeof(wchar_t)) {
9861003
len /= 2;
987-
orig_len = len;
988-
/* Reduce the length until we hit the final byte of a UTF-8 sequence
989-
* (top bit is unset). Fix for github issue 82052.
990-
*/
991-
while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
992-
--len;
993-
/* If we hit a length of 0, something has gone wrong. This shouldn't
994-
* be possible, as valid UTF-8 can have at most 3 non-final bytes
995-
* before a final one, and our buffer is way longer than that.
996-
* But to be on the safe side, if we hit this issue we just restore
997-
* the original length and let the console API sort it out.
998-
*/
999-
if (len == 0) {
1000-
len = orig_len;
1001-
}
1004+
/* Fix for github issues gh-110913 and gh-82052. */
1005+
len = _find_last_utf8_boundary(b->buf, len);
10021006
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
10031007
}
10041008
Py_END_ALLOW_THREADS

0 commit comments

Comments
 (0)