From 3ba68b06e8e043725e726304d9700b0daa647d25 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 12:10:44 +0900 Subject: [PATCH 1/8] bpo-43260: io: Prevent large data from remaining in textio. When very large data remains in TextIOWrapper, flush() may keep failing forever. So prevent large data (i.e. more than 1 MiB) from remaining in the TextIOWrapper internal buffer. --- Modules/_io/textio.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index f08d14e18b4021..fcde011a9ee568 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1585,6 +1585,8 @@ _textiowrapper_writeflush(textio *self) ret = PyObject_CallMethodOneArg(self->buffer, _PyIO_str_write, b); } while (ret == NULL && _PyIO_trap_eintr()); Py_DECREF(b); + // NOTE: We cleared buffer but we don't know how many bytes are actually written + // when an error occurred. if (ret == NULL) return -1; Py_DECREF(ret); @@ -1673,6 +1675,25 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) bytes_len = PyBytes_GET_SIZE(b); } + // bpo-43260: If `b` is large, leaving it in pending_bytes may cause + // MemoryError and we can not recover from it forever. + // So do not leave the large data in pending_bytes buffer. + // 1MiB is just a heuristics.
+ if (bytes_len > 1024*1024) { + if (_textiowrapper_writeflush(self)) { + Py_DECREF(b); + return NULL; + } + self->pending_bytes = b; + self->pending_bytes_count = bytes_len; + if (_textiowrapper_writeflush(self)) { + Py_CLEAR(self->pending_bytes); + self->pending_bytes_count = 0; + return NULL; + } + return PyLong_FromSsize_t(textlen); + } + if (self->pending_bytes == NULL) { self->pending_bytes_count = 0; self->pending_bytes = b; From 5ffbe8f04d4e19de22b48be946dbff5fc1bf0829 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 12:15:44 +0900 Subject: [PATCH 2/8] Add news entry --- .../next/Library/2021-02-20-12-15-29.bpo-43260.6znAas.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2021-02-20-12-15-29.bpo-43260.6znAas.rst diff --git a/Misc/NEWS.d/next/Library/2021-02-20-12-15-29.bpo-43260.6znAas.rst b/Misc/NEWS.d/next/Library/2021-02-20-12-15-29.bpo-43260.6znAas.rst new file mode 100644 index 00000000000000..f3c21d1c63f72b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-02-20-12-15-29.bpo-43260.6znAas.rst @@ -0,0 +1,2 @@ +Fix an issue where TextIOWrapper could never flush its internal buffer after +very large text was written. From ccda9bccf9db6ae2ecb2e2c8150e01cdf6aa1b3e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 13:57:04 +0900 Subject: [PATCH 3/8] Use chunk_size instead of heuristic 1MiB --- Modules/_io/textio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index fcde011a9ee568..243b71948b94b7 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1678,8 +1678,7 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) // bpo-43260: If `b` is large, leaving it in pending_bytes may cause // MemoryError and we can not recover from it forever. // So do not leave the large data in pending_bytes buffer. - // 1MiB is just a heuristics.
- if (bytes_len > 1024*1024) { + if (bytes_len > self->chunk_size) { if (_textiowrapper_writeflush(self)) { Py_DECREF(b); return NULL; From a17a9f41345c782b64315e46db741c39e9d94c31 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 15:37:20 +0900 Subject: [PATCH 4/8] Use Eryk's idea --- Modules/_io/textio.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 243b71948b94b7..914489f68f4c64 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1644,7 +1644,10 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) /* XXX What if we were just reading? */ if (self->encodefunc != NULL) { - if (PyUnicode_IS_ASCII(text) && is_asciicompat_encoding(self->encodefunc)) { + if (PyUnicode_IS_ASCII(text) && + // See bpo-43260 + PyUnicode_GET_LENGTH(text) < self->chunk_size && + is_asciicompat_encoding(self->encodefunc)) { b = text; Py_INCREF(b); } @@ -1653,8 +1656,9 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) } self->encoding_start_of_stream = 0; } - else + else { b = PyObject_CallMethodOneArg(self->encoder, _PyIO_str_encode, text); + } Py_DECREF(text); if (b == NULL) @@ -1675,26 +1679,18 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) bytes_len = PyBytes_GET_SIZE(b); } - // bpo-43260: If `b` is large, leaving it in pending_bytes may cause - // MemoryError and we can not recover from it forever. - // So do not leave the large data in pending_bytes buffer. 
- if (bytes_len > self->chunk_size) { - if (_textiowrapper_writeflush(self)) { - Py_DECREF(b); - return NULL; - } + if (self->pending_bytes == NULL) { + self->pending_bytes_count = 0; self->pending_bytes = b; - self->pending_bytes_count = bytes_len; - if (_textiowrapper_writeflush(self)) { - Py_CLEAR(self->pending_bytes); - self->pending_bytes_count = 0; + } + else if (self->pending_bytes_count + bytes_len > self->chunk_size) { + // bpo-43260: If `b` is very large, leaving it in pending_bytes may + // cause MemoryError and we can not recover from it forever. + // So do not leave the large data in pending_bytes buffer. + if (_textiowrapper_writeflush(self) < 0) { + Py_DECREF(b); return NULL; } - return PyLong_FromSsize_t(textlen); - } - - if (self->pending_bytes == NULL) { - self->pending_bytes_count = 0; self->pending_bytes = b; } else if (!PyList_CheckExact(self->pending_bytes)) { @@ -1716,7 +1712,7 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) } self->pending_bytes_count += bytes_len; - if (self->pending_bytes_count > self->chunk_size || needflush || + if (self->pending_bytes_count >= self->chunk_size || needflush || text_needflush) { if (_textiowrapper_writeflush(self) < 0) return NULL; From 54ff77e2ac7df31e5bb2edbc3d37b39e007953a2 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 15:39:56 +0900 Subject: [PATCH 5/8] fix comment --- Modules/_io/textio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 914489f68f4c64..b334bc5c92079d 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1684,9 +1684,7 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) self->pending_bytes = b; } else if (self->pending_bytes_count + bytes_len > self->chunk_size) { - // bpo-43260: If `b` is very large, leaving it in pending_bytes may - // cause MemoryError and we can not recover from it forever. - // So do not leave the large data in pending_bytes buffer. 
+ // Prevent to concatinate more than chunk_size data. if (_textiowrapper_writeflush(self) < 0) { Py_DECREF(b); return NULL; From 960b928440e586ac3e1db8c1a68dcd5bead2bc1e Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 16:59:45 +0900 Subject: [PATCH 6/8] <= --- Modules/_io/textio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index b334bc5c92079d..0f1287583176f5 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1646,7 +1646,7 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) if (self->encodefunc != NULL) { if (PyUnicode_IS_ASCII(text) && // See bpo-43260 - PyUnicode_GET_LENGTH(text) < self->chunk_size && + PyUnicode_GET_LENGTH(text) <= self->chunk_size && is_asciicompat_encoding(self->encodefunc)) { b = text; Py_INCREF(b); From 9572c1e002fd23ca9a5d03cfb2307043d5771800 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 20 Feb 2021 17:29:23 +0900 Subject: [PATCH 7/8] concatenate --- Modules/_io/textio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 0f1287583176f5..03001ecb0a5b3b 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -1684,7 +1684,7 @@ _io_TextIOWrapper_write_impl(textio *self, PyObject *text) self->pending_bytes = b; } else if (self->pending_bytes_count + bytes_len > self->chunk_size) { - // Prevent to concatinate more than chunk_size data. + // Prevent to concatenate more than chunk_size data. 
if (_textiowrapper_writeflush(self) < 0) { Py_DECREF(b); return NULL; From 05ab6e0d0f461bfdbe4039422e6b84431b0d87ed Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sun, 21 Feb 2021 00:18:42 +0900 Subject: [PATCH 8/8] Add test --- Lib/test/test_io.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index cc54d0ea0062f2..3768b625516f41 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -3767,6 +3767,33 @@ def test_del__CHUNK_SIZE_SystemError(self): with self.assertRaises(AttributeError): del t._CHUNK_SIZE + def test_internal_buffer_size(self): + # bpo-43260: TextIOWrapper's internal buffer should not store + # data larger than chunk size. + chunk_size = 8192 # default chunk size, updated later + + class MockIO(self.MockRawIO): + def write(self, data): + if len(data) > chunk_size: + raise RuntimeError + return super().write(data) + + buf = MockIO() + t = self.TextIOWrapper(buf, encoding="ascii") + chunk_size = t._CHUNK_SIZE + t.write("abc") + t.write("def") + # default chunk size is 8192 bytes so t don't write data to buf. + self.assertEqual([], buf._write_stack) + + with self.assertRaises(RuntimeError): + t.write("x"*(chunk_size+1)) + + self.assertEqual([b"abcdef"], buf._write_stack) + t.write("ghi") + t.write("x"*chunk_size) + self.assertEqual([b"abcdef", b"ghi", b"x"*chunk_size], buf._write_stack) + class PyTextIOWrapperTest(TextIOWrapperTest): io = pyio