Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit de20b0b

Browse files
committed
Issue #13149: Speed up append-only StringIO objects.
This is very similar to the "lazy strings" idea.
1 parent 9f4b1e9 commit de20b0b

3 files changed

Lines changed: 108 additions & 5 deletions

File tree

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,8 @@ Core and Builtins
365365
Library
366366
-------
367367

368+
- Issue #13149: Speed up append-only StringIO objects.
369+
368370
- Issue #13373: multiprocessing.Queue.get() could sometimes block indefinitely
369371
when called with a timeout. Patch by Arnaud Ysmal.
370372

Modules/_io/stringio.c

Lines changed: 105 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,25 @@
77
than the enclosed string, for proper functioning of _PyIO_find_line_ending.
88
*/
99

10+
#define STATE_REALIZED 1
11+
#define STATE_ACCUMULATING 2
12+
1013
typedef struct {
1114
PyObject_HEAD
1215
Py_UCS4 *buf;
1316
Py_ssize_t pos;
1417
Py_ssize_t string_size;
1518
size_t buf_size;
1619

20+
/* The stringio object can be in two states: accumulating or realized.
21+
In accumulating state, the internal buffer contains nothing and
22+
the contents are given by the embedded _PyAccu structure.
23+
In realized state, the internal buffer is meaningful and the
24+
_PyAccu is destroyed.
25+
*/
26+
int state;
27+
_PyAccu accu;
28+
1729
char ok; /* initialized? */
1830
char closed;
1931
char readuniversal;
@@ -40,6 +52,11 @@ typedef struct {
4052
return NULL; \
4153
}
4254

55+
/* Switch the stringio object to realized state, returning NULL from the
   enclosing function if realize() reports failure (a negative result).
   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement: `ENSURE_REALIZED(self);` then behaves correctly
   as the body of an unbraced if/else and cannot capture a following
   `else`. */
#define ENSURE_REALIZED(self) \
    do { \
        if (realize(self) < 0) { \
            return NULL; \
        } \
    } while (0)
59+
4360
PyDoc_STRVAR(stringio_doc,
4461
"Text I/O implementation using an in-memory buffer.\n"
4562
"\n"
@@ -102,6 +119,54 @@ resize_buffer(stringio *self, size_t size)
102119
return -1;
103120
}
104121

122+
static PyObject *
123+
make_intermediate(stringio *self)
124+
{
125+
PyObject *intermediate = _PyAccu_Finish(&self->accu);
126+
self->state = STATE_REALIZED;
127+
if (intermediate == NULL)
128+
return NULL;
129+
if (_PyAccu_Init(&self->accu) ||
130+
_PyAccu_Accumulate(&self->accu, intermediate)) {
131+
Py_DECREF(intermediate);
132+
return NULL;
133+
}
134+
self->state = STATE_ACCUMULATING;
135+
return intermediate;
136+
}
137+
138+
static int
139+
realize(stringio *self)
140+
{
141+
Py_ssize_t len;
142+
PyObject *intermediate;
143+
144+
if (self->state == STATE_REALIZED)
145+
return 0;
146+
assert(self->state == STATE_ACCUMULATING);
147+
self->state = STATE_REALIZED;
148+
149+
intermediate = _PyAccu_Finish(&self->accu);
150+
if (intermediate == NULL)
151+
return -1;
152+
153+
/* Append the intermediate string to the internal buffer.
154+
The length should be equal to the current cursor position.
155+
*/
156+
len = PyUnicode_GET_LENGTH(intermediate);
157+
if (resize_buffer(self, len) < 0) {
158+
Py_DECREF(intermediate);
159+
return -1;
160+
}
161+
if (!PyUnicode_AsUCS4(intermediate, self->buf, len, 0)) {
162+
Py_DECREF(intermediate);
163+
return -1;
164+
}
165+
166+
Py_DECREF(intermediate);
167+
return 0;
168+
}
169+
105170
/* Internal routine for writing a whole PyUnicode object to the buffer of a
106171
StringIO object. Returns 0 on success, or -1 on error. */
107172
static Py_ssize_t
@@ -136,7 +201,6 @@ write_str(stringio *self, PyObject *obj)
136201
return -1;
137202
}
138203
len = PyUnicode_GET_LENGTH(decoded);
139-
140204
assert(len >= 0);
141205

142206
/* This overflow check is not strictly necessary. However, it avoids us to
@@ -147,6 +211,17 @@ write_str(stringio *self, PyObject *obj)
147211
"new position too large");
148212
goto fail;
149213
}
214+
215+
if (self->state == STATE_ACCUMULATING) {
216+
if (self->string_size == self->pos) {
217+
if (_PyAccu_Accumulate(&self->accu, decoded))
218+
goto fail;
219+
goto success;
220+
}
221+
if (realize(self))
222+
goto fail;
223+
}
224+
150225
if (self->pos + len > self->string_size) {
151226
if (resize_buffer(self, self->pos + len) < 0)
152227
goto fail;
@@ -174,6 +249,7 @@ write_str(stringio *self, PyObject *obj)
174249
0))
175250
goto fail;
176251

252+
success:
177253
/* Set the new length of the internal string if it has changed. */
178254
self->pos += len;
179255
if (self->string_size < self->pos)
@@ -195,6 +271,8 @@ stringio_getvalue(stringio *self)
195271
{
196272
CHECK_INITIALIZED(self);
197273
CHECK_CLOSED(self);
274+
if (self->state == STATE_ACCUMULATING)
275+
return make_intermediate(self);
198276
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, self->buf,
199277
self->string_size);
200278
}
@@ -251,6 +329,14 @@ stringio_read(stringio *self, PyObject *args)
251329
size = 0;
252330
}
253331

332+
/* Optimization for seek(0); read() */
333+
if (self->state == STATE_ACCUMULATING && self->pos == 0 && size == n) {
334+
PyObject *result = make_intermediate(self);
335+
self->pos = self->string_size;
336+
return result;
337+
}
338+
339+
ENSURE_REALIZED(self);
254340
output = self->buf + self->pos;
255341
self->pos += size;
256342
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, size);
@@ -301,6 +387,7 @@ stringio_readline(stringio *self, PyObject *args)
301387
if (!PyArg_ParseTuple(args, "|O:readline", &arg))
302388
return NULL;
303389
CHECK_CLOSED(self);
390+
ENSURE_REALIZED(self);
304391

305392
if (PyNumber_Check(arg)) {
306393
limit = PyNumber_AsSsize_t(arg, PyExc_OverflowError);
@@ -322,6 +409,7 @@ stringio_iternext(stringio *self)
322409

323410
CHECK_INITIALIZED(self);
324411
CHECK_CLOSED(self);
412+
ENSURE_REALIZED(self);
325413

326414
if (Py_TYPE(self) == &PyStringIO_Type) {
327415
/* Skip method call overhead for speed */
@@ -392,6 +480,7 @@ stringio_truncate(stringio *self, PyObject *args)
392480
}
393481

394482
if (size < self->string_size) {
483+
ENSURE_REALIZED(self);
395484
if (resize_buffer(self, size) < 0)
396485
return NULL;
397486
self->string_size = size;
@@ -492,6 +581,7 @@ stringio_close(stringio *self)
492581
/* Free up some memory */
493582
if (resize_buffer(self, 0) < 0)
494583
return NULL;
584+
_PyAccu_Destroy(&self->accu);
495585
Py_CLEAR(self->readnl);
496586
Py_CLEAR(self->writenl);
497587
Py_CLEAR(self->decoder);
@@ -521,6 +611,7 @@ stringio_dealloc(stringio *self)
521611
PyMem_Free(self->buf);
522612
self->buf = NULL;
523613
}
614+
_PyAccu_Destroy(&self->accu);
524615
Py_CLEAR(self->readnl);
525616
Py_CLEAR(self->writenl);
526617
Py_CLEAR(self->decoder);
@@ -559,6 +650,7 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
559650
PyObject *value = NULL;
560651
PyObject *newline_obj = NULL;
561652
char *newline = "\n";
653+
Py_ssize_t value_len;
562654

563655
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO:__init__", kwlist,
564656
&value, &newline_obj))
@@ -600,6 +692,7 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
600692

601693
self->ok = 0;
602694

695+
_PyAccu_Destroy(&self->accu);
603696
Py_CLEAR(self->readnl);
604697
Py_CLEAR(self->writenl);
605698
Py_CLEAR(self->decoder);
@@ -636,19 +729,27 @@ stringio_init(stringio *self, PyObject *args, PyObject *kwds)
636729
/* Now everything is set up, resize buffer to size of initial value,
637730
and copy it */
638731
self->string_size = 0;
639-
if (value && value != Py_None) {
640-
Py_ssize_t len = PyUnicode_GetSize(value);
732+
if (value && value != Py_None)
733+
value_len = PyUnicode_GetSize(value);
734+
else
735+
value_len = 0;
736+
if (value_len > 0) {
641737
/* This is a heuristic, for newline translation might change
642738
the string length. */
643-
if (resize_buffer(self, len) < 0)
739+
if (resize_buffer(self, 0) < 0)
644740
return -1;
741+
self->state = STATE_REALIZED;
645742
self->pos = 0;
646743
if (write_str(self, value) < 0)
647744
return -1;
648745
}
649746
else {
747+
/* Empty stringio object, we can start by accumulating */
650748
if (resize_buffer(self, 0) < 0)
651749
return -1;
750+
if (_PyAccu_Init(&self->accu))
751+
return -1;
752+
self->state = STATE_ACCUMULATING;
652753
}
653754
self->pos = 0;
654755

Objects/unicodeobject.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2055,7 +2055,7 @@ Py_UCS4*
20552055
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
20562056
int copy_null)
20572057
{
2058-
if (target == NULL || targetsize < 1) {
2058+
if (target == NULL || targetsize < 0) {
20592059
PyErr_BadInternalCall();
20602060
return NULL;
20612061
}

0 commit comments

Comments
 (0)