diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 17534fcc4615dc..66f36cd269df26 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -53,7 +53,7 @@ The :mod:`csv` module defines the following functions: .. index:: single: universal newlines; csv.reader function -.. function:: reader(csvfile, dialect='excel', **fmtparams) +.. function:: reader(csvfile, dialect='excel', field_size_limit=None, **fmtparams) Return a reader object which will iterate over lines in the given *csvfile*. *csvfile* can be any object which supports the :term:`iterator` protocol and returns a @@ -417,6 +417,10 @@ Reader objects have the following public attributes: The number of lines read from the source iterator. This is not the same as the number of records returned, as records can span multiple lines. +.. attribute:: csvreader.field_size_limit + + The maximum field size allowed by this reader. If :const:`None`, + the module-wide limit set with :func:`csv.field_size_limit` is used. DictReader objects have the following public attribute: diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 7a333139b5ea2c..eb405edade1568 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -319,6 +319,36 @@ def test_read_bigfield(self): finally: csv.field_size_limit(limit) + def test_override_size_limit(self): + line = ',,,' + reader = csv.reader([line]) + self.assertEqual(reader.field_size_limit, None) + reader = csv.reader([line], field_size_limit=None) + self.assertEqual(reader.field_size_limit, None) + + reader = csv.reader([line], field_size_limit=5) + self.assertEqual(reader.field_size_limit, 5) + reader.field_size_limit = None + self.assertEqual(reader.field_size_limit, None) + reader.field_size_limit = 6 + self.assertEqual(reader.field_size_limit, 6) + del reader.field_size_limit + self.assertEqual(reader.field_size_limit, None) + + with self.assertRaises(ValueError): + csv.reader([line], field_size_limit=-1) + + with self.assertRaises(TypeError): + csv.reader([line], field_size_limit="string") + 
+ line = 'long_field,3,4,5' + reader = csv.reader([line, line], field_size_limit=4) + with self.assertRaises(csv.Error): + list(reader) + reader.field_size_limit = 50 + self.assertEqual(list(reader), [['long_field', '3', '4', '5']]) + + def test_read_linenum(self): r = csv.reader(['line,1', 'line,2', 'line,3']) self.assertEqual(r.line_num, 0) diff --git a/Misc/NEWS.d/next/Library/2019-02-27-16-53-43.bpo-36121.AbiDLw.rst b/Misc/NEWS.d/next/Library/2019-02-27-16-53-43.bpo-36121.AbiDLw.rst new file mode 100644 index 00000000000000..4028e82a26d913 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-02-27-16-53-43.bpo-36121.AbiDLw.rst @@ -0,0 +1,2 @@ +``csv.field_size_limit`` can now be overridden for each csv.Reader +instance. Contributed by Rémi Lapeyre. diff --git a/Modules/_csv.c b/Modules/_csv.c index d86f63ef597610..707b7dc140c0a3 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -102,6 +102,7 @@ typedef struct { Py_ssize_t field_len; /* length of current field */ int numeric_field; /* treat field as numeric */ unsigned long line_num; /* Source-file line number */ + long field_limit; /* field limit for current reader */ } ReaderObj; static PyTypeObject Reader_Type; @@ -572,13 +573,18 @@ parse_grow_buff(ReaderObj *self) static int parse_add_char(ReaderObj *self, Py_UCS4 c) { - if (self->field_len >= _csvstate_global->field_limit) { + long limit = self->field_limit; + if (limit == -1) { + limit = _csvstate_global->field_limit; + } + if (self->field_len >= limit) { PyErr_Format(_csvstate_global->error_obj, "field larger than field limit (%ld)", - _csvstate_global->field_limit); + limit); return -1; } - if (self->field_len == self->field_size && !parse_grow_buff(self)) + if (self->field_len == self->field_size && !parse_grow_buff(self)) { return -1; + } self->field[self->field_len++] = c; return 0; } @@ -894,6 +900,45 @@ static struct PyMemberDef Reader_memberlist[] = { { NULL } }; +static PyObject * +Reader_get_field_size_limit(PyObject *self, void 
*Py_UNUSED(ignored)) +{ + ReaderObj *reader = (ReaderObj *)self; + if (reader->field_limit == -1) { // -1 is used as a flag for unset value + Py_RETURN_NONE; + } + else { + return PyLong_FromLong(reader->field_limit); + } +} + +static int +Reader_set_field_size_limit(PyObject *self, PyObject *arg, void *Py_UNUSED(ignored)) +{ + ReaderObj *reader = (ReaderObj *)self; + if (arg == NULL || arg == Py_None) { + reader->field_limit = -1; + return 0; + } + else { + long limit = PyLong_AsLong(arg); + if (limit == -1 && PyErr_Occurred()) { + return -1; + } + if (limit <= 0) { + PyErr_Format(PyExc_ValueError, "field_size_limit must greater than 0"); + return -1; + } + reader->field_limit = limit; + return 0; + } +} + +static PyGetSetDef Reader_getset[] = { + { "field_size_limit", Reader_get_field_size_limit, + Reader_set_field_size_limit, PyDoc_STR("field size limit") }, + { NULL }, +}; static PyTypeObject Reader_Type = { PyVarObject_HEAD_INIT(NULL, 0) @@ -927,18 +972,19 @@ static PyTypeObject Reader_Type = { (getiterfunc)Reader_iternext, /*tp_iternext*/ Reader_methods, /*tp_methods*/ Reader_memberlist, /*tp_members*/ - 0, /*tp_getset*/ - + Reader_getset, /*tp_getset*/ }; static PyObject * csv_reader(PyObject *module, PyObject *args, PyObject *keyword_args) { PyObject * iterator, * dialect = NULL; + PyObject * _field_size_limit = NULL; ReaderObj * self = PyObject_GC_New(ReaderObj, &Reader_Type); - if (!self) + if (!self) { return NULL; + } self->dialect = NULL; self->fields = NULL; @@ -947,30 +993,66 @@ csv_reader(PyObject *module, PyObject *args, PyObject *keyword_args) self->field_size = 0; self->line_num = 0; + _field_size_limit = PyUnicode_FromString("field_size_limit"); + if (_field_size_limit == NULL) { + goto fail; + } + PyObject *field_size_limit = NULL; + if (keyword_args != NULL) { + field_size_limit = PyDict_GetItemWithError(keyword_args, _field_size_limit); + if (PyErr_Occurred()) { + goto fail; + } + } + if (field_size_limit == NULL) { + self->field_limit = -1; 
+ } else if (field_size_limit == Py_None) { + self->field_limit = -1; + if (PyDict_DelItem(keyword_args, _field_size_limit) < 0) { + goto fail; + } + } + else { + long limit = PyLong_AsLong(field_size_limit); + if (PyErr_Occurred()) { + goto fail; + } + if (limit <= 0) { + PyErr_Format(PyExc_ValueError, "field_size_limit must greater than 0"); + goto fail; + } + if (PyDict_DelItem(keyword_args, _field_size_limit) < 0) { + goto fail; + } + self->field_limit = limit; + } + Py_CLEAR(_field_size_limit); + if (parse_reset(self) < 0) { - Py_DECREF(self); - return NULL; + goto fail; } if (!PyArg_UnpackTuple(args, "", 1, 2, &iterator, &dialect)) { - Py_DECREF(self); - return NULL; + goto fail; } self->input_iter = PyObject_GetIter(iterator); if (self->input_iter == NULL) { PyErr_SetString(PyExc_TypeError, "argument 1 must be an iterator"); - Py_DECREF(self); - return NULL; + goto fail; } self->dialect = (DialectObj *)_call_dialect(dialect, keyword_args); if (self->dialect == NULL) { - Py_DECREF(self); - return NULL; + goto fail; } PyObject_GC_Track(self); return (PyObject *)self; + +fail: + Py_XDECREF(_field_size_limit); + Py_DECREF(self); + return NULL; } /* @@ -1535,8 +1617,8 @@ PyDoc_STRVAR(csv_module_doc, " written as two quotes\n"); PyDoc_STRVAR(csv_reader_doc, -" csv_reader = reader(iterable [, dialect='excel']\n" -" [optional keyword args])\n" +" csv_reader = reader(iterable , dialect='excel',\n" +" field_size_limit=None, **fmtparams)\n" " for row in csv_reader:\n" " process(row)\n" "\n"