Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7a6e959

Browse files
committed
SF patch 580331 by Oren Tirosh: make file objects their own iterator.
For a file f, iter(f) now returns f (unless f is closed), and f.next() is similar to f.readline() when EOF is not reached; however, f.next() uses a readahead buffer that messes up the file position, so mixing f.next() and f.readline() (or other methods) doesn't work right. Calling f.seek() drops the readahead buffer, but other operations don't. The real purpose of this change is to reduce the confusion between objects and their iterators. By making a file its own iterator, it's made clearer that using the iterator modifies the file object's state (in particular the current position). A nice side effect is that this speeds up "for line in f:" by not having to use the xreadlines module. The f.xreadlines() method is still supported for backwards compatibility, though it is the same as iter(f) now. (I made some cosmetic changes to Oren's code, and added a test for "file closed" to file_iternext() and file_getiter().)
1 parent 3a451b1 commit 7a6e959

2 files changed

Lines changed: 136 additions & 34 deletions

File tree

Include/fileobject.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@ typedef struct {
1313
PyObject *f_name;
1414
PyObject *f_mode;
1515
int (*f_close)(FILE *);
16-
int f_softspace; /* Flag used by 'print' command */
17-
int f_binary; /* Flag which indicates whether the file is open
18-
open in binary (1) or test (0) mode */
16+
int f_softspace; /* Flag used by 'print' command */
17+
int f_binary; /* Flag which indicates whether the file is
18+
open in binary (1) or text (0) mode */
19+
char* f_buf; /* Allocated readahead buffer */
20+
char* f_bufend; /* Points after last occupied position */
21+
char* f_bufptr; /* Current buffer position */
1922
#ifdef WITH_UNIVERSAL_NEWLINES
2023
int f_univ_newline; /* Handle any newline convention */
2124
int f_newlinetypes; /* Types of newlines seen */

Objects/fileobject.c

Lines changed: 130 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
/* File object implementation */
32

43
#include "Python.h"
@@ -116,6 +115,7 @@ fill_file_fields(PyFileObject *f, FILE *fp, char *name, char *mode,
116115
f->f_close = close;
117116
f->f_softspace = 0;
118117
f->f_binary = strchr(mode,'b') != NULL;
118+
f->f_buf = NULL;
119119
#ifdef WITH_UNIVERSAL_NEWLINES
120120
f->f_univ_newline = (strchr(mode, 'U') != NULL);
121121
f->f_newlinetypes = NEWLINE_UNKNOWN;
@@ -271,6 +271,8 @@ err_closed(void)
271271
return NULL;
272272
}
273273

274+
void drop_readahead(PyFileObject *);
275+
274276
/* Methods */
275277

276278
static void
@@ -283,6 +285,7 @@ file_dealloc(PyFileObject *f)
283285
}
284286
Py_XDECREF(f->f_name);
285287
Py_XDECREF(f->f_mode);
288+
drop_readahead(f);
286289
f->ob_type->tp_free((PyObject *)f);
287290
}
288291

@@ -405,6 +408,7 @@ file_seek(PyFileObject *f, PyObject *args)
405408

406409
if (f->f_fp == NULL)
407410
return err_closed();
411+
drop_readahead(f);
408412
whence = 0;
409413
if (!PyArg_ParseTuple(args, "O|i:seek", &offobj, &whence))
410414
return NULL;
@@ -1177,28 +1181,6 @@ file_readline(PyFileObject *f, PyObject *args)
11771181
return get_line(f, n);
11781182
}
11791183

1180-
static PyObject *
1181-
file_xreadlines(PyFileObject *f)
1182-
{
1183-
static PyObject* xreadlines_function = NULL;
1184-
1185-
if (f->f_fp == NULL)
1186-
return err_closed();
1187-
if (!xreadlines_function) {
1188-
PyObject *xreadlines_module =
1189-
PyImport_ImportModule("xreadlines");
1190-
if(!xreadlines_module)
1191-
return NULL;
1192-
1193-
xreadlines_function = PyObject_GetAttrString(xreadlines_module,
1194-
"xreadlines");
1195-
Py_DECREF(xreadlines_module);
1196-
if(!xreadlines_function)
1197-
return NULL;
1198-
}
1199-
return PyObject_CallFunction(xreadlines_function, "(O)", f);
1200-
}
1201-
12021184
static PyObject *
12031185
file_readlines(PyFileObject *f, PyObject *args)
12041186
{
@@ -1462,6 +1444,15 @@ file_writelines(PyFileObject *f, PyObject *seq)
14621444
#undef CHUNKSIZE
14631445
}
14641446

1447+
static PyObject *
1448+
file_getiter(PyFileObject *f)
1449+
{
1450+
if (f->f_fp == NULL)
1451+
return err_closed();
1452+
Py_INCREF(f);
1453+
return (PyObject *)f;
1454+
}
1455+
14651456
PyDoc_STRVAR(readline_doc,
14661457
"readline([size]) -> next line from the file, as a string.\n"
14671458
"\n"
@@ -1517,10 +1508,10 @@ PyDoc_STRVAR(readlines_doc,
15171508
"total number of bytes in the lines returned.");
15181509

15191510
PyDoc_STRVAR(xreadlines_doc,
1520-
"xreadlines() -> next line from the file, as a string.\n"
1511+
"xreadlines() -> returns self.\n"
15211512
"\n"
1522-
"Equivalent to xreadlines.xreadlines(file). This is like readline(), but\n"
1523-
"often quicker, due to reading ahead internally.");
1513+
"For backward compatibility. File objects now include the performance\n"
1514+
"optimizations previously implemented in the xreadlines module.");
15241515

15251516
PyDoc_STRVAR(writelines_doc,
15261517
"writelines(sequence_of_strings) -> None. Write the strings to the file.\n"
@@ -1554,7 +1545,7 @@ static PyMethodDef file_methods[] = {
15541545
{"tell", (PyCFunction)file_tell, METH_NOARGS, tell_doc},
15551546
{"readinto", (PyCFunction)file_readinto, METH_VARARGS, readinto_doc},
15561547
{"readlines", (PyCFunction)file_readlines, METH_VARARGS, readlines_doc},
1557-
{"xreadlines", (PyCFunction)file_xreadlines, METH_NOARGS, xreadlines_doc},
1548+
{"xreadlines", (PyCFunction)file_getiter, METH_NOARGS, xreadlines_doc},
15581549
{"writelines", (PyCFunction)file_writelines, METH_O, writelines_doc},
15591550
{"flush", (PyCFunction)file_flush, METH_NOARGS, flush_doc},
15601551
{"close", (PyCFunction)file_close, METH_NOARGS, close_doc},
@@ -1617,12 +1608,120 @@ static PyGetSetDef file_getsetlist[] = {
16171608
{0},
16181609
};
16191610

1611+
void
1612+
drop_readahead(PyFileObject *f)
1613+
{
1614+
if (f->f_buf != NULL) {
1615+
PyMem_Free(f->f_buf);
1616+
f->f_buf = NULL;
1617+
}
1618+
}
1619+
1620+
/* Make sure that file has a readahead buffer with at least one byte
1621+
(unless at EOF) and no more than bufsize. Returns negative value on
1622+
error */
1623+
int readahead(PyFileObject *f, int bufsize) {
1624+
int chunksize;
1625+
1626+
if (f->f_buf != NULL) {
1627+
if( (f->f_bufend - f->f_bufptr) >= 1)
1628+
return 0;
1629+
else
1630+
drop_readahead(f);
1631+
}
1632+
if ((f->f_buf = PyMem_Malloc(bufsize)) == NULL) {
1633+
return -1;
1634+
}
1635+
Py_BEGIN_ALLOW_THREADS
1636+
errno = 0;
1637+
chunksize = Py_UniversalNewlineFread(
1638+
f->f_buf, bufsize, f->f_fp, (PyObject *)f);
1639+
Py_END_ALLOW_THREADS
1640+
if (chunksize == 0) {
1641+
if (ferror(f->f_fp)) {
1642+
PyErr_SetFromErrno(PyExc_IOError);
1643+
clearerr(f->f_fp);
1644+
drop_readahead(f);
1645+
return -1;
1646+
}
1647+
}
1648+
f->f_bufptr = f->f_buf;
1649+
f->f_bufend = f->f_buf + chunksize;
1650+
return 0;
1651+
}
1652+
1653+
/* Used by file_iternext. The returned string will start with 'skip'
1654+
uninitialized bytes followed by the remainder of the line. Don't be
1655+
horrified by the recursive call: maximum recursion depth is limited by
1656+
logarithmic buffer growth to about 50 even when reading a 1gb line. */
1657+
1658+
PyStringObject *
1659+
readahead_get_line_skip(PyFileObject *f, int skip, int bufsize) {
1660+
PyStringObject* s;
1661+
char *bufptr;
1662+
char *buf;
1663+
int len;
1664+
1665+
if (f->f_buf == NULL)
1666+
if (readahead(f, bufsize) < 0)
1667+
return NULL;
1668+
1669+
len = f->f_bufend - f->f_bufptr;
1670+
if (len == 0)
1671+
return (PyStringObject *)
1672+
PyString_FromStringAndSize(NULL, skip);
1673+
bufptr = memchr(f->f_bufptr, '\n', len);
1674+
if (bufptr != NULL) {
1675+
bufptr++; /* Count the '\n' */
1676+
len = bufptr - f->f_bufptr;
1677+
s = (PyStringObject *)
1678+
PyString_FromStringAndSize(NULL, skip+len);
1679+
if (s == NULL)
1680+
return NULL;
1681+
memcpy(PyString_AS_STRING(s)+skip, f->f_bufptr, len);
1682+
f->f_bufptr = bufptr;
1683+
if (bufptr == f->f_bufend)
1684+
drop_readahead(f);
1685+
} else {
1686+
bufptr = f->f_bufptr;
1687+
buf = f->f_buf;
1688+
f->f_buf = NULL; /* Force new readahead buffer */
1689+
s = readahead_get_line_skip(
1690+
f, skip+len, bufsize + (bufsize>>2) );
1691+
if (s == NULL) {
1692+
PyMem_Free(buf);
1693+
return NULL;
1694+
}
1695+
memcpy(PyString_AS_STRING(s)+skip, bufptr, len);
1696+
PyMem_Free(buf);
1697+
}
1698+
return s;
1699+
}
1700+
1701+
/* A larger buffer size may actually decrease performance. */
1702+
#define READAHEAD_BUFSIZE 8192
1703+
16201704
static PyObject *
1621-
file_getiter(PyObject *f)
1705+
file_iternext(PyFileObject *f)
16221706
{
1623-
return PyObject_CallMethod(f, "xreadlines", "");
1707+
PyStringObject* l;
1708+
1709+
int i;
1710+
1711+
if (f->f_fp == NULL)
1712+
return err_closed();
1713+
1714+
i = f->f_softspace;
1715+
1716+
l = readahead_get_line_skip(f, 0, READAHEAD_BUFSIZE);
1717+
if (l == NULL || PyString_GET_SIZE(l) == 0) {
1718+
Py_XDECREF(l);
1719+
return NULL;
1720+
}
1721+
return (PyObject *)l;
16241722
}
16251723

1724+
16261725
static PyObject *
16271726
file_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
16281727
{
@@ -1742,8 +1841,8 @@ PyTypeObject PyFile_Type = {
17421841
0, /* tp_clear */
17431842
0, /* tp_richcompare */
17441843
0, /* tp_weaklistoffset */
1745-
file_getiter, /* tp_iter */
1746-
0, /* tp_iternext */
1844+
(getiterfunc)file_getiter, /* tp_iter */
1845+
(iternextfunc)file_iternext, /* tp_iternext */
17471846
file_methods, /* tp_methods */
17481847
file_memberlist, /* tp_members */
17491848
file_getsetlist, /* tp_getset */

0 commit comments

Comments
 (0)