Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 211b81d

Browse files
committed
Issue #11114: Fix catastrophic performance of tell() on text files (up
to 1000x faster in some cases). It is still one to two orders of magnitude slower than binary tell().
1 parent d2751fb commit 211b81d

3 files changed

Lines changed: 150 additions & 47 deletions

File tree

Lib/_pyio.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1488,6 +1488,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
14881488
self._decoded_chars_used = 0 # offset into _decoded_chars for read()
14891489
self._snapshot = None # info for reconstructing decoder state
14901490
self._seekable = self._telling = self.buffer.seekable()
1491+
self._b2cratio = 0.0
14911492

14921493
if self._seekable and self.writable():
14931494
position = self.buffer.tell()
@@ -1655,7 +1656,12 @@ def _read_chunk(self):
16551656
# Read a chunk, decode it, and put the result in self._decoded_chars.
16561657
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
16571658
eof = not input_chunk
1658-
self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
1659+
decoded_chars = self._decoder.decode(input_chunk, eof)
1660+
self._set_decoded_chars(decoded_chars)
1661+
if decoded_chars:
1662+
self._b2cratio = len(input_chunk) / len(self._decoded_chars)
1663+
else:
1664+
self._b2cratio = 0.0
16591665

16601666
if self._telling:
16611667
# At the snapshot point, len(dec_buffer) bytes before the read,
@@ -1709,20 +1715,56 @@ def tell(self):
17091715
# forward until it gives us enough decoded characters.
17101716
saved_state = decoder.getstate()
17111717
try:
1718+
# Fast search for an acceptable start point, close to our
1719+
# current pos.
1720+
# Rationale: calling decoder.decode() has a large overhead
1721+
# regardless of chunk size; we want the number of such calls to
1722+
# be O(1) in most situations (common decoders, non-crazy input).
1723+
# Actually, it will be exactly 1 for fixed-size codecs (all
1724+
# 8-bit codecs, also UTF-16 and UTF-32).
1725+
skip_bytes = int(self._b2cratio * chars_to_skip)
1726+
skip_back = 1
1727+
assert skip_bytes <= len(next_input)
1728+
while skip_bytes > 0:
1729+
decoder.setstate((b'', dec_flags))
1730+
# Decode up to tentative start point
1731+
n = len(decoder.decode(next_input[:skip_bytes]))
1732+
if n <= chars_to_skip:
1733+
b, d = decoder.getstate()
1734+
if not b:
1735+
# Before pos and no bytes buffered in decoder => OK
1736+
dec_flags = d
1737+
chars_to_skip -= n
1738+
break
1739+
# Skip back by buffered amount and reset heuristic
1740+
skip_bytes -= len(b)
1741+
skip_back = 1
1742+
else:
1743+
# We're too far ahead, skip back a bit
1744+
skip_bytes -= skip_back
1745+
skip_back = skip_back * 2
1746+
else:
1747+
skip_bytes = 0
1748+
decoder.setstate((b'', dec_flags))
1749+
17121750
# Note our initial start point.
1713-
decoder.setstate((b'', dec_flags))
1714-
start_pos = position
1715-
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
1716-
need_eof = 0
1751+
start_pos = position + skip_bytes
1752+
start_flags = dec_flags
1753+
if chars_to_skip == 0:
1754+
# We haven't moved from the start point.
1755+
return self._pack_cookie(start_pos, start_flags)
17171756

17181757
# Feed the decoder one byte at a time. As we go, note the
17191758
# nearest "safe start point" before the current location
17201759
# (a point where the decoder has nothing buffered, so seek()
17211760
# can safely start from there and advance to this location).
1722-
next_byte = bytearray(1)
1723-
for next_byte[0] in next_input:
1761+
bytes_fed = 0
1762+
need_eof = 0
1763+
# Chars decoded since `start_pos`
1764+
chars_decoded = 0
1765+
for i in range(skip_bytes, len(next_input)):
17241766
bytes_fed += 1
1725-
chars_decoded += len(decoder.decode(next_byte))
1767+
chars_decoded += len(decoder.decode(next_input[i:i+1]))
17261768
dec_buffer, dec_flags = decoder.getstate()
17271769
if not dec_buffer and chars_decoded <= chars_to_skip:
17281770
# Decoder buffer is empty, so this is a safe start point.

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ Core and Builtins
3535
Library
3636
-------
3737

38+
- Issue #11114: Fix catastrophic performance of tell() on text files (up
39+
to 1000x faster in some cases). It is still one to two orders of magnitude
40+
slower than binary tell().
41+
3842
- Issue #10882: Add os.sendfile function.
3943

4044
- Issue #10868: Allow usage of the register method of an ABC as a class

Modules/_io/textio.c

Lines changed: 96 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -678,12 +678,16 @@ typedef struct
678678
PyObject *pending_bytes; /* list of bytes objects waiting to be
679679
written, or NULL */
680680
Py_ssize_t pending_bytes_count;
681-
PyObject *snapshot;
681+
682682
/* snapshot is either None, or a tuple (dec_flags, next_input) where
683683
* dec_flags is the second (integer) item of the decoder state and
684684
* next_input is the chunk of input bytes that comes next after the
685685
* snapshot point. We use this to reconstruct decoder states in tell().
686686
*/
687+
PyObject *snapshot;
688+
/* Bytes-to-characters ratio for the current chunk. Serves as input for
689+
the heuristic in tell(). */
690+
double b2cratio;
687691

688692
/* Cache raw object if it's a FileIO object */
689693
PyObject *raw;
@@ -850,6 +854,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
850854
self->decoded_chars_used = 0;
851855
self->pending_bytes_count = 0;
852856
self->encodefunc = NULL;
857+
self->b2cratio = 0.0;
853858

854859
if (encoding == NULL) {
855860
/* Try os.device_encoding(fileno) */
@@ -1390,6 +1395,7 @@ textiowrapper_read_chunk(textio *self)
13901395
PyObject *dec_flags = NULL;
13911396
PyObject *input_chunk = NULL;
13921397
PyObject *decoded_chars, *chunk_size;
1398+
Py_ssize_t nbytes, nchars;
13931399
int eof;
13941400

13951401
/* The return value is True unless EOF was reached. The decoded string is
@@ -1435,7 +1441,8 @@ textiowrapper_read_chunk(textio *self)
14351441
goto fail;
14361442
assert(PyBytes_Check(input_chunk));
14371443

1438-
eof = (PyBytes_Size(input_chunk) == 0);
1444+
nbytes = PyBytes_Size(input_chunk);
1445+
eof = (nbytes == 0);
14391446

14401447
if (Py_TYPE(self->decoder) == &PyIncrementalNewlineDecoder_Type) {
14411448
decoded_chars = _PyIncrementalNewlineDecoder_decode(
@@ -1450,7 +1457,12 @@ textiowrapper_read_chunk(textio *self)
14501457
if (decoded_chars == NULL)
14511458
goto fail;
14521459
textiowrapper_set_decoded_chars(self, decoded_chars);
1453-
if (PyUnicode_GET_SIZE(decoded_chars) > 0)
1460+
nchars = PyUnicode_GET_SIZE(decoded_chars);
1461+
if (nchars > 0)
1462+
self->b2cratio = (double) nbytes / nchars;
1463+
else
1464+
self->b2cratio = 0.0;
1465+
if (nchars > 0)
14541466
eof = 0;
14551467

14561468
if (self->telling) {
@@ -2139,8 +2151,12 @@ textiowrapper_tell(textio *self, PyObject *args)
21392151
cookie_type cookie = {0,0,0,0,0};
21402152
PyObject *next_input;
21412153
Py_ssize_t chars_to_skip, chars_decoded;
2154+
Py_ssize_t skip_bytes, skip_back;
21422155
PyObject *saved_state = NULL;
21432156
char *input, *input_end;
2157+
char *dec_buffer;
2158+
Py_ssize_t dec_buffer_len;
2159+
int dec_flags;
21442160

21452161
CHECK_INITIALIZED(self);
21462162
CHECK_CLOSED(self);
@@ -2176,6 +2192,7 @@ textiowrapper_tell(textio *self, PyObject *args)
21762192
#else
21772193
cookie.start_pos = PyLong_AsLong(posobj);
21782194
#endif
2195+
Py_DECREF(posobj);
21792196
if (PyErr_Occurred())
21802197
goto fail;
21812198

@@ -2190,57 +2207,99 @@ textiowrapper_tell(textio *self, PyObject *args)
21902207
/* How many decoded characters have been used up since the snapshot? */
21912208
if (self->decoded_chars_used == 0) {
21922209
/* We haven't moved from the snapshot point. */
2193-
Py_DECREF(posobj);
21942210
return textiowrapper_build_cookie(&cookie);
21952211
}
21962212

21972213
chars_to_skip = self->decoded_chars_used;
21982214

2199-
/* Starting from the snapshot position, we will walk the decoder
2200-
* forward until it gives us enough decoded characters.
2201-
*/
2215+
/* Decoder state will be restored at the end */
22022216
saved_state = PyObject_CallMethodObjArgs(self->decoder,
22032217
_PyIO_str_getstate, NULL);
22042218
if (saved_state == NULL)
22052219
goto fail;
22062220

2207-
/* Note our initial start point. */
2208-
if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
2209-
goto fail;
2221+
#define DECODER_GETSTATE() do { \
2222+
PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \
2223+
_PyIO_str_getstate, NULL); \
2224+
if (_state == NULL) \
2225+
goto fail; \
2226+
if (!PyArg_Parse(_state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { \
2227+
Py_DECREF(_state); \
2228+
goto fail; \
2229+
} \
2230+
Py_DECREF(_state); \
2231+
} while (0)
2232+
2233+
/* TODO: replace assert with exception */
2234+
#define DECODER_DECODE(start, len, res) do { \
2235+
PyObject *_decoded = PyObject_CallMethod( \
2236+
self->decoder, "decode", "y#", start, len); \
2237+
if (_decoded == NULL) \
2238+
goto fail; \
2239+
assert (PyUnicode_Check(_decoded)); \
2240+
res = PyUnicode_GET_SIZE(_decoded); \
2241+
Py_DECREF(_decoded); \
2242+
} while (0)
2243+
2244+
/* Fast search for an acceptable start point, close to our
2245+
current pos */
2246+
skip_bytes = (Py_ssize_t) (self->b2cratio * chars_to_skip);
2247+
skip_back = 1;
2248+
assert(skip_back <= PyBytes_GET_SIZE(next_input));
2249+
input = PyBytes_AS_STRING(next_input);
2250+
while (skip_bytes > 0) {
2251+
/* Decode up to tentative start point */
2252+
if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
2253+
goto fail;
2254+
DECODER_DECODE(input, skip_bytes, chars_decoded);
2255+
if (chars_decoded <= chars_to_skip) {
2256+
DECODER_GETSTATE();
2257+
if (dec_buffer_len == 0) {
2258+
/* Before pos and no bytes buffered in decoder => OK */
2259+
cookie.dec_flags = dec_flags;
2260+
chars_to_skip -= chars_decoded;
2261+
break;
2262+
}
2263+
/* Skip back by buffered amount and reset heuristic */
2264+
skip_bytes -= dec_buffer_len;
2265+
skip_back = 1;
2266+
}
2267+
else {
2268+
/* We're too far ahead, skip back a bit */
2269+
skip_bytes -= skip_back;
2270+
skip_back *= 2;
2271+
}
2272+
}
2273+
if (skip_bytes <= 0) {
2274+
skip_bytes = 0;
2275+
if (_textiowrapper_decoder_setstate(self, &cookie) < 0)
2276+
goto fail;
2277+
}
22102278

2211-
/* Feed the decoder one byte at a time. As we go, note the
2212-
* nearest "safe start point" before the current location
2213-
* (a point where the decoder has nothing buffered, so seek()
2279+
/* Note our initial start point. */
2280+
cookie.start_pos += skip_bytes;
2281+
cookie.chars_to_skip = chars_to_skip;
2282+
if (chars_to_skip == 0)
2283+
goto finally;
2284+
2285+
/* We should be close to the desired position. Now feed the decoder one
2286+
* byte at a time until we reach the `chars_to_skip` target.
2287+
* As we go, note the nearest "safe start point" before the current
2288+
* location (a point where the decoder has nothing buffered, so seek()
22142289
* can safely start from there and advance to this location).
22152290
*/
22162291
chars_decoded = 0;
22172292
input = PyBytes_AS_STRING(next_input);
22182293
input_end = input + PyBytes_GET_SIZE(next_input);
2294+
input += skip_bytes;
22192295
while (input < input_end) {
2220-
PyObject *state;
2221-
char *dec_buffer;
2222-
Py_ssize_t dec_buffer_len;
2223-
int dec_flags;
2224-
2225-
PyObject *decoded = PyObject_CallMethod(
2226-
self->decoder, "decode", "y#", input, 1);
2227-
if (decoded == NULL)
2228-
goto fail;
2229-
assert (PyUnicode_Check(decoded));
2230-
chars_decoded += PyUnicode_GET_SIZE(decoded);
2231-
Py_DECREF(decoded);
2296+
Py_ssize_t n;
22322297

2298+
DECODER_DECODE(input, 1, n);
2299+
/* We got n chars for 1 byte */
2300+
chars_decoded += n;
22332301
cookie.bytes_to_feed += 1;
2234-
2235-
state = PyObject_CallMethodObjArgs(self->decoder,
2236-
_PyIO_str_getstate, NULL);
2237-
if (state == NULL)
2238-
goto fail;
2239-
if (!PyArg_Parse(state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) {
2240-
Py_DECREF(state);
2241-
goto fail;
2242-
}
2243-
Py_DECREF(state);
2302+
DECODER_GETSTATE();
22442303

22452304
if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip) {
22462305
/* Decoder buffer is empty, so this is a safe start point. */
@@ -2272,8 +2331,7 @@ textiowrapper_tell(textio *self, PyObject *args)
22722331
}
22732332
}
22742333

2275-
/* finally */
2276-
Py_XDECREF(posobj);
2334+
finally:
22772335
res = PyObject_CallMethod(self->decoder, "setstate", "(O)", saved_state);
22782336
Py_DECREF(saved_state);
22792337
if (res == NULL)
@@ -2284,8 +2342,7 @@ textiowrapper_tell(textio *self, PyObject *args)
22842342
cookie.chars_to_skip = Py_SAFE_DOWNCAST(chars_to_skip, Py_ssize_t, int);
22852343
return textiowrapper_build_cookie(&cookie);
22862344

2287-
fail:
2288-
Py_XDECREF(posobj);
2345+
fail:
22892346
if (saved_state) {
22902347
PyObject *type, *value, *traceback;
22912348
PyErr_Fetch(&type, &value, &traceback);

0 commit comments

Comments
 (0)