Fix UTF-8 sequence boundary search

sorgloomer · sorgloomer · commit 709fb9420050 · 2023-10-18T01:44:53.000+02:00
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c
@@ -134,6 +134,34 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
     return m;
 }
 
+/* Return a prefix length of buf[0..len) that does not end inside a
+   multi-byte UTF-8 sequence.  If the last byte is ASCII this is len;
+   otherwise the trailing sequence (head byte plus continuation bytes) is
+   excluded without checking whether it is complete.  Returns the original
+   len on malformed input or when the cut would leave nothing, so the
+   result is never 0 for a non-empty buffer. */
+DWORD _find_last_utf8_sequence_boundary(const char *buf, DWORD len) {
+    DWORD backup_len = 1;
+    if (len == 0 || (buf[len - 1] & 0x80) == 0) {
+        /* Empty buffer, or last byte is a 1-byte character */
+        return len;
+    }
+    while ((buf[len - backup_len] & 0xc0) == 0x80) {
+        /* Back up over UTF-8 continuation bytes (at most 3 are valid) */
+        if (backup_len >= len || backup_len >= 4) {
+            /* Error, no UTF-8 sequence head found */
+            return len;
+        }
+        backup_len++;
+    }
+    /* buf[len - backup_len] is now the head byte, so len - backup_len
+       already excludes the whole trailing sequence. */
+    if (backup_len >= len) {
+        /* Cutting here would leave an empty chunk */
+        return len;
+    }
+    return len - backup_len;
+}
 
 /*[clinic input]
 module _io
@@ -975,7 +1003,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
 {
     BOOL res = TRUE;
     wchar_t *wbuf;
-    DWORD len, wlen, orig_len, n = 0;
+    DWORD len, wlen, n = 0;
     HANDLE handle;
 
     if (self->fd == -1)
@@ -1007,21 +1035,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
        have to reduce and recalculate. */
     while (wlen > 32766 / sizeof(wchar_t)) {
         len /= 2;
-        orig_len = len;
-        /* Reduce the length until we hit the final byte of a UTF-8 sequence
-         * (top bit is unset). Fix for github issue 82052.
-         */
-        while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
-            --len;
-        /* If we hit a length of 0, something has gone wrong. This shouldn't
-         * be possible, as valid UTF-8 can have at most 3 non-final bytes
-         * before a final one, and our buffer is way longer than that.
-         * But to be on the safe side, if we hit this issue we just restore
-         * the original length and let the console API sort it out.
-         */
-        if (len == 0) {
-            len = orig_len;
-        }
+        /* Fix for github issues gh-110913 and gh-82052. */
+        len = _find_last_utf8_sequence_boundary(b->buf, len);
         wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
     }
     Py_END_ALLOW_THREADS