Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit e4d05c4

Browse files
committed
Set an upper limit on the size of the field buffer, and raise an exception
when this limit is reached. The limit defaults to 128k and can be changed with the module's set_field_limit() function. Previously, an unmatched quote character could result in the entire file being read into the field buffer, potentially exhausting virtual memory.
1 parent 29bf4e4 commit e4d05c4

3 files changed

Lines changed: 87 additions & 32 deletions

File tree

Lib/csv.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import re
77
from _csv import Error, __version__, writer, reader, register_dialect, \
88
unregister_dialect, get_dialect, list_dialects, \
9+
set_field_limit, \
910
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
1011
__doc__
1112
from _csv import Dialect as _Dialect

Lib/test/test_csv.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,17 @@ def test_read_quoting(self):
229229
quoting=csv.QUOTE_NONE, escapechar='\\')
230230

231231
def test_read_bigfield(self):
232-
# This exercises the buffer realloc functionality
233-
bigstring = 'X' * 50000
232+
# This exercises the buffer realloc functionality and field size
233+
# limits.
234+
size = 50000
235+
bigstring = 'X' * size
234236
bigline = '%s,%s' % (bigstring, bigstring)
235237
self._read_test([bigline], [[bigstring, bigstring]])
238+
csv.set_field_limit(size)
239+
self._read_test([bigline], [[bigstring, bigstring]])
240+
self.assertEqual(csv.set_field_limit(), size)
241+
csv.set_field_limit(size-1)
242+
self.assertRaises(csv.Error, self._read_test, [bigline], [])
236243

237244
class TestDialectRegistry(unittest.TestCase):
238245
def test_registry_badargs(self):

Modules/_csv.c

Lines changed: 77 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ module instead.
4444

4545
static PyObject *error_obj; /* CSV exception */
4646
static PyObject *dialects; /* Dialect registry */
47+
static long field_limit = 128 * 1024; /* max parsed field size */
4748

4849
typedef enum {
4950
START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
@@ -527,15 +528,21 @@ parse_grow_buff(ReaderObj *self)
527528
return 1;
528529
}
529530

530-
static void
531+
static int
531532
parse_add_char(ReaderObj *self, char c)
532533
{
534+
if (self->field_len >= field_limit) {
535+
PyErr_Format(error_obj, "field larger than field limit (%ld)",
536+
field_limit);
537+
return -1;
538+
}
533539
if (self->field_len == self->field_size && !parse_grow_buff(self))
534-
return;
540+
return -1;
535541
self->field[self->field_len++] = c;
542+
return 0;
536543
}
537544

538-
static void
545+
static int
539546
parse_process_char(ReaderObj *self, char c)
540547
{
541548
DialectObj *dialect = self->dialect;
@@ -574,13 +581,15 @@ parse_process_char(ReaderObj *self, char c)
574581
}
575582
else {
576583
/* begin new unquoted field */
577-
parse_add_char(self, c);
584+
if (parse_add_char(self, c) < 0)
585+
return -1;
578586
self->state = IN_FIELD;
579587
}
580588
break;
581589

582590
case ESCAPED_CHAR:
583-
parse_add_char(self, c);
591+
if (parse_add_char(self, c) < 0)
592+
return -1;
584593
self->state = IN_FIELD;
585594
break;
586595

@@ -602,15 +611,17 @@ parse_process_char(ReaderObj *self, char c)
602611
}
603612
else {
604613
/* normal character - save in field */
605-
parse_add_char(self, c);
614+
if (parse_add_char(self, c) < 0)
615+
return -1;
606616
}
607617
break;
608618

609619
case IN_QUOTED_FIELD:
610620
/* in quoted field */
611621
if (c == '\n') {
612622
/* end of line - save '\n' in field */
613-
parse_add_char(self, '\n');
623+
if (parse_add_char(self, '\n') < 0)
624+
return -1;
614625
}
615626
else if (c == dialect->escapechar) {
616627
/* Possible escape character */
@@ -629,12 +640,14 @@ parse_process_char(ReaderObj *self, char c)
629640
}
630641
else {
631642
/* normal character - save in field */
632-
parse_add_char(self, c);
643+
if (parse_add_char(self, c) < 0)
644+
return -1;
633645
}
634646
break;
635647

636648
case ESCAPE_IN_QUOTED_FIELD:
637-
parse_add_char(self, c);
649+
if (parse_add_char(self, c) < 0)
650+
return -1;
638651
self->state = IN_QUOTED_FIELD;
639652
break;
640653

@@ -643,7 +656,8 @@ parse_process_char(ReaderObj *self, char c)
643656
if (dialect->quoting != QUOTE_NONE &&
644657
c == dialect->quotechar) {
645658
/* save "" as " */
646-
parse_add_char(self, c);
659+
if (parse_add_char(self, c) < 0)
660+
return -1;
647661
self->state = IN_QUOTED_FIELD;
648662
}
649663
else if (c == dialect->delimiter) {
@@ -657,7 +671,8 @@ parse_process_char(ReaderObj *self, char c)
657671
self->state = START_RECORD;
658672
}
659673
else if (!dialect->strict) {
660-
parse_add_char(self, c);
674+
if (parse_add_char(self, c) < 0)
675+
return -1;
661676
self->state = IN_FIELD;
662677
}
663678
else {
@@ -666,10 +681,12 @@ parse_process_char(ReaderObj *self, char c)
666681
PyErr_Format(error_obj, "%c expected after %c",
667682
dialect->delimiter,
668683
dialect->quotechar);
684+
return -1;
669685
}
670686
break;
671687

672688
}
689+
return 0;
673690
}
674691

675692
/*
@@ -754,13 +771,15 @@ Reader_iternext(ReaderObj *self)
754771
return PyErr_Format(error_obj,
755772
"newline inside string");
756773
}
757-
parse_process_char(self, c);
758-
if (PyErr_Occurred()) {
759-
Py_DECREF(lineobj);
760-
return NULL;
761-
}
762-
}
763-
parse_process_char(self, '\n');
774+
if (parse_process_char(self, c) < 0) {
775+
Py_DECREF(lineobj);
776+
return NULL;
777+
}
778+
}
779+
if (parse_process_char(self, '\n') < 0) {
780+
Py_DECREF(lineobj);
781+
return NULL;
782+
}
764783
Py_DECREF(lineobj);
765784
} while (self->state != START_RECORD);
766785

@@ -1387,6 +1406,25 @@ csv_get_dialect(PyObject *module, PyObject *name_obj)
13871406
return get_dialect_from_registry(name_obj);
13881407
}
13891408

1409+
static PyObject *
1410+
csv_set_field_limit(PyObject *module, PyObject *args)
1411+
{
1412+
PyObject *new_limit = NULL;
1413+
long old_limit = field_limit;
1414+
1415+
if (!PyArg_UnpackTuple(args, "set_field_limit", 0, 1, &new_limit))
1416+
return NULL;
1417+
if (new_limit != NULL) {
1418+
if (!PyInt_Check(new_limit)) {
1419+
PyErr_Format(PyExc_TypeError,
1420+
"limit must be an integer");
1421+
return NULL;
1422+
}
1423+
field_limit = PyInt_AsLong(new_limit);
1424+
}
1425+
return PyInt_FromLong(old_limit);
1426+
}
1427+
13901428
/*
13911429
* MODULE
13921430
*/
@@ -1494,20 +1532,29 @@ PyDoc_STRVAR(csv_unregister_dialect_doc,
14941532
"Delete the name/dialect mapping associated with a string name.\n"
14951533
" csv.unregister_dialect(name)");
14961534

1535+
PyDoc_STRVAR(csv_set_field_limit_doc,
1536+
"Sets an upper limit on parsed fields.\n"
1537+
" csv.set_field_limit([limit])\n"
1538+
"\n"
1539+
"Returns old limit. If limit is not given, no new limit is set and\n"
1540+
"the old limit is returned");
1541+
14971542
static struct PyMethodDef csv_methods[] = {
1498-
{ "reader", (PyCFunction)csv_reader,
1499-
METH_VARARGS | METH_KEYWORDS, csv_reader_doc},
1500-
{ "writer", (PyCFunction)csv_writer,
1501-
METH_VARARGS | METH_KEYWORDS, csv_writer_doc},
1502-
{ "list_dialects", (PyCFunction)csv_list_dialects,
1503-
METH_NOARGS, csv_list_dialects_doc},
1504-
{ "register_dialect", (PyCFunction)csv_register_dialect,
1543+
{ "reader", (PyCFunction)csv_reader,
1544+
METH_VARARGS | METH_KEYWORDS, csv_reader_doc},
1545+
{ "writer", (PyCFunction)csv_writer,
1546+
METH_VARARGS | METH_KEYWORDS, csv_writer_doc},
1547+
{ "list_dialects", (PyCFunction)csv_list_dialects,
1548+
METH_NOARGS, csv_list_dialects_doc},
1549+
{ "register_dialect", (PyCFunction)csv_register_dialect,
15051550
METH_VARARGS | METH_KEYWORDS, csv_register_dialect_doc},
1506-
{ "unregister_dialect", (PyCFunction)csv_unregister_dialect,
1507-
METH_O, csv_unregister_dialect_doc},
1508-
{ "get_dialect", (PyCFunction)csv_get_dialect,
1509-
METH_O, csv_get_dialect_doc},
1510-
{ NULL, NULL }
1551+
{ "unregister_dialect", (PyCFunction)csv_unregister_dialect,
1552+
METH_O, csv_unregister_dialect_doc},
1553+
{ "get_dialect", (PyCFunction)csv_get_dialect,
1554+
METH_O, csv_get_dialect_doc},
1555+
{ "set_field_limit", (PyCFunction)csv_set_field_limit,
1556+
METH_VARARGS, csv_set_field_limit_doc},
1557+
{ NULL, NULL }
15111558
};
15121559

15131560
PyMODINIT_FUNC

0 commit comments

Comments
 (0)