Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit af7260e

Browse files
committed
Issue #15758: Fix FileIO.readall() so it no longer has O(n**2) complexity.
1 parent a29ac45 commit af7260e

2 files changed

Lines changed: 54 additions & 66 deletions

File tree

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ Core and Builtins
9191
Library
9292
-------
9393

94+
- Issue #15758: Fix FileIO.readall() so it no longer has O(n**2) complexity.
95+
9496
- Issue #14596: The struct.Struct() objects now use more compact implementation.
9597

9698
- Issue #17981: Closed socket on error in SysLogHandler.

Modules/_io/fileio.c

Lines changed: 52 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -556,33 +556,27 @@ fileio_readinto(fileio *self, PyObject *args)
556556
return PyLong_FromSsize_t(n);
557557
}
558558

559+
#ifndef HAVE_FSTAT
560+
561+
static PyObject *
562+
fileio_readall(fileio *self)
563+
{
564+
_Py_IDENTIFIER(readall);
565+
return _PyObject_CallMethodId((PyObject*)&PyRawIOBase_Type,
566+
&PyId_readall, "O", self);
567+
}
568+
569+
#else
570+
559571
static size_t
560-
new_buffersize(fileio *self, size_t currentsize
561-
#ifdef HAVE_FSTAT
562-
, Py_off_t pos, Py_off_t end
563-
#endif
564-
)
572+
new_buffersize(fileio *self, size_t currentsize)
565573
{
566574
size_t addend;
567-
#ifdef HAVE_FSTAT
568-
if (end != (Py_off_t)-1) {
569-
/* Files claiming a size smaller than SMALLCHUNK may
570-
actually be streaming pseudo-files. In this case, we
571-
apply the more aggressive algorithm below.
572-
*/
573-
if (end >= SMALLCHUNK && end >= pos && pos >= 0) {
574-
/* Add 1 so if the file were to grow we'd notice. */
575-
Py_off_t bufsize = currentsize + end - pos + 1;
576-
if (bufsize < PY_SSIZE_T_MAX)
577-
return (size_t)bufsize;
578-
else
579-
return PY_SSIZE_T_MAX;
580-
}
581-
}
582-
#endif
575+
583576
/* Expand the buffer by an amount proportional to the current size,
584577
giving us amortized linear-time behavior. For bigger sizes, use a
585578
less-than-double growth factor to avoid excessive allocation. */
579+
assert(currentsize <= PY_SSIZE_T_MAX);
586580
if (currentsize > 65536)
587581
addend = currentsize >> 3;
588582
else
@@ -596,25 +590,18 @@ new_buffersize(fileio *self, size_t currentsize
596590
static PyObject *
597591
fileio_readall(fileio *self)
598592
{
599-
#ifdef HAVE_FSTAT
600593
struct stat st;
601594
Py_off_t pos, end;
602-
#endif
603595
PyObject *result;
604-
Py_ssize_t total = 0;
596+
Py_ssize_t bytes_read = 0;
605597
Py_ssize_t n;
606-
size_t newsize;
598+
size_t bufsize;
607599

608600
if (self->fd < 0)
609601
return err_closed();
610602
if (!_PyVerify_fd(self->fd))
611603
return PyErr_SetFromErrno(PyExc_IOError);
612604

613-
result = PyBytes_FromStringAndSize(NULL, SMALLCHUNK);
614-
if (result == NULL)
615-
return NULL;
616-
617-
#ifdef HAVE_FSTAT
618605
#if defined(MS_WIN64) || defined(MS_WINDOWS)
619606
pos = _lseeki64(self->fd, 0L, SEEK_CUR);
620607
#else
@@ -624,44 +611,46 @@ fileio_readall(fileio *self)
624611
end = st.st_size;
625612
else
626613
end = (Py_off_t)-1;
627-
#endif
614+
615+
if (end > 0 && end >= pos && pos >= 0 && end - pos < PY_SSIZE_T_MAX) {
616+
/* This is probably a real file, so we try to allocate a
617+
buffer one byte larger than the rest of the file. If the
618+
calculation is right then we should get EOF without having
619+
to enlarge the buffer. */
620+
bufsize = (size_t)(end - pos + 1);
621+
} else {
622+
bufsize = SMALLCHUNK;
623+
}
624+
625+
result = PyBytes_FromStringAndSize(NULL, bufsize);
626+
if (result == NULL)
627+
return NULL;
628+
628629
while (1) {
629-
#ifdef HAVE_FSTAT
630-
newsize = new_buffersize(self, total, pos, end);
631-
#else
632-
newsize = new_buffersize(self, total);
633-
#endif
634-
if (newsize > PY_SSIZE_T_MAX || newsize <= 0) {
635-
PyErr_SetString(PyExc_OverflowError,
636-
"unbounded read returned more bytes "
637-
"than a Python string can hold ");
638-
Py_DECREF(result);
639-
return NULL;
640-
}
630+
if (bytes_read >= (Py_ssize_t)bufsize) {
631+
bufsize = new_buffersize(self, bytes_read);
632+
if (bufsize > PY_SSIZE_T_MAX || bufsize <= 0) {
633+
PyErr_SetString(PyExc_OverflowError,
634+
"unbounded read returned more bytes "
635+
"than a Python string can hold ");
636+
Py_DECREF(result);
637+
return NULL;
638+
}
641639

642-
if (PyBytes_GET_SIZE(result) < (Py_ssize_t)newsize) {
643-
if (_PyBytes_Resize(&result, newsize) < 0) {
644-
if (total == 0) {
645-
Py_DECREF(result);
640+
if (PyBytes_GET_SIZE(result) < (Py_ssize_t)bufsize) {
641+
if (_PyBytes_Resize(&result, bufsize) < 0)
646642
return NULL;
647-
}
648-
PyErr_Clear();
649-
break;
650643
}
651644
}
652645
Py_BEGIN_ALLOW_THREADS
653646
errno = 0;
654-
n = newsize - total;
647+
n = bufsize - bytes_read;
655648
#if defined(MS_WIN64) || defined(MS_WINDOWS)
656649
if (n > INT_MAX)
657650
n = INT_MAX;
658-
n = read(self->fd,
659-
PyBytes_AS_STRING(result) + total,
660-
(int)n);
651+
n = read(self->fd, PyBytes_AS_STRING(result) + bytes_read, (int)n);
661652
#else
662-
n = read(self->fd,
663-
PyBytes_AS_STRING(result) + total,
664-
n);
653+
n = read(self->fd, PyBytes_AS_STRING(result) + bytes_read, n);
665654
#endif
666655
Py_END_ALLOW_THREADS
667656
if (n == 0)
@@ -674,7 +663,7 @@ fileio_readall(fileio *self)
674663
}
675664
continue;
676665
}
677-
if (total > 0)
666+
if (bytes_read > 0)
678667
break;
679668
if (errno == EAGAIN) {
680669
Py_DECREF(result);
@@ -684,22 +673,19 @@ fileio_readall(fileio *self)
684673
PyErr_SetFromErrno(PyExc_IOError);
685674
return NULL;
686675
}
687-
total += n;
688-
#ifdef HAVE_FSTAT
676+
bytes_read += n;
689677
pos += n;
690-
#endif
691678
}
692679

693-
if (PyBytes_GET_SIZE(result) > total) {
694-
if (_PyBytes_Resize(&result, total) < 0) {
695-
/* This should never happen, but just in case */
696-
Py_DECREF(result);
680+
if (PyBytes_GET_SIZE(result) > bytes_read) {
681+
if (_PyBytes_Resize(&result, bytes_read) < 0)
697682
return NULL;
698-
}
699683
}
700684
return result;
701685
}
702686

687+
#endif /* HAVE_FSTAT */
688+
703689
static PyObject *
704690
fileio_read(fileio *self, PyObject *args)
705691
{

0 commit comments

Comments
 (0)