Fix UTF-8 sequence boundary search

sorgloomer · sorgloomer · commit b8a5110bfada · 2023-10-17T23:02:02.000+02:00
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c
@@ -975,6 +975,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
 {
     BOOL res = TRUE;
     wchar_t *wbuf;
+    char *buf;
     DWORD len, wlen, orig_len, n = 0;
     HANDLE handle;
 
@@ -1008,11 +1009,23 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
     while (wlen > 32766 / sizeof(wchar_t)) {
         len /= 2;
         orig_len = len;
-        /* Reduce the length until we hit the final byte of a UTF-8 sequence
-         * (top bit is unset). Fix for github issue 82052.
-         */
-        while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
-            --len;
+        /* Reduce the length until we find a UTF-8 sequence boundary.
+           Fix for GitHub issues gh-110913 and gh-82052.
+           If the last byte is not a 1-byte (ASCII) character, back up past
+           the whole trailing multi-byte sequence. */
+        buf = (char *)b->buf;
+        if (len > 0 && (buf[len-1] & 0x80) != 0) {
+            while (len > 0 && (buf[len-1] & 0xc0) == 0x80) {
+                /* Step back over all trailing UTF-8 continuation bytes */
+                --len;
+            }
+            if (len > 0) {
+                /* Drop one more byte. If the encoding is correct, this
+                   byte is the lead byte of the last (potentially incomplete)
+                   UTF-8 sequence, which must also go into the next chunk. */
+                --len;
+            }
+        }
         /* If we hit a length of 0, something has gone wrong. This shouldn't
          * be possible, as valid UTF-8 can have at most 3 non-final bytes
          * before a final one, and our buffer is way longer than that.