@@ -678,12 +678,16 @@ typedef struct
678678 PyObject * pending_bytes ; /* list of bytes objects waiting to be
679679 written, or NULL */
680680 Py_ssize_t pending_bytes_count ;
681- PyObject * snapshot ;
681+
682682 /* snapshot is either None, or a tuple (dec_flags, next_input) where
683683 * dec_flags is the second (integer) item of the decoder state and
684684 * next_input is the chunk of input bytes that comes next after the
685685 * snapshot point. We use this to reconstruct decoder states in tell().
686686 */
687+ PyObject * snapshot ;
688+ /* Bytes-to-characters ratio for the current chunk. Serves as input for
689+ the heuristic in tell(). */
690+ double b2cratio ;
687691
688692 /* Cache raw object if it's a FileIO object */
689693 PyObject * raw ;
@@ -850,6 +854,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
850854 self -> decoded_chars_used = 0 ;
851855 self -> pending_bytes_count = 0 ;
852856 self -> encodefunc = NULL ;
857+ self -> b2cratio = 0.0 ;
853858
854859 if (encoding == NULL ) {
855860 /* Try os.device_encoding(fileno) */
@@ -1390,6 +1395,7 @@ textiowrapper_read_chunk(textio *self)
13901395 PyObject * dec_flags = NULL ;
13911396 PyObject * input_chunk = NULL ;
13921397 PyObject * decoded_chars , * chunk_size ;
1398+ Py_ssize_t nbytes , nchars ;
13931399 int eof ;
13941400
13951401 /* The return value is True unless EOF was reached. The decoded string is
@@ -1435,7 +1441,8 @@ textiowrapper_read_chunk(textio *self)
14351441 goto fail ;
14361442 assert (PyBytes_Check (input_chunk ));
14371443
1438- eof = (PyBytes_Size (input_chunk ) == 0 );
1444+ nbytes = PyBytes_Size (input_chunk );
1445+ eof = (nbytes == 0 );
14391446
14401447 if (Py_TYPE (self -> decoder ) == & PyIncrementalNewlineDecoder_Type ) {
14411448 decoded_chars = _PyIncrementalNewlineDecoder_decode (
@@ -1450,7 +1457,12 @@ textiowrapper_read_chunk(textio *self)
14501457 if (decoded_chars == NULL )
14511458 goto fail ;
14521459 textiowrapper_set_decoded_chars (self , decoded_chars );
1453- if (PyUnicode_GET_SIZE (decoded_chars ) > 0 )
1460+ nchars = PyUnicode_GET_SIZE (decoded_chars );
1461+ if (nchars > 0 )
1462+ self -> b2cratio = (double ) nbytes / nchars ;
1463+ else
1464+ self -> b2cratio = 0.0 ;
1465+ if (nchars > 0 )
14541466 eof = 0 ;
14551467
14561468 if (self -> telling ) {
@@ -2139,8 +2151,12 @@ textiowrapper_tell(textio *self, PyObject *args)
21392151 cookie_type cookie = {0 ,0 ,0 ,0 ,0 };
21402152 PyObject * next_input ;
21412153 Py_ssize_t chars_to_skip , chars_decoded ;
2154+ Py_ssize_t skip_bytes , skip_back ;
21422155 PyObject * saved_state = NULL ;
21432156 char * input , * input_end ;
2157+ char * dec_buffer ;
2158+ Py_ssize_t dec_buffer_len ;
2159+ int dec_flags ;
21442160
21452161 CHECK_INITIALIZED (self );
21462162 CHECK_CLOSED (self );
@@ -2176,6 +2192,7 @@ textiowrapper_tell(textio *self, PyObject *args)
21762192#else
21772193 cookie .start_pos = PyLong_AsLong (posobj );
21782194#endif
2195+ Py_DECREF (posobj );
21792196 if (PyErr_Occurred ())
21802197 goto fail ;
21812198
@@ -2190,57 +2207,99 @@ textiowrapper_tell(textio *self, PyObject *args)
21902207 /* How many decoded characters have been used up since the snapshot? */
21912208 if (self -> decoded_chars_used == 0 ) {
21922209 /* We haven't moved from the snapshot point. */
2193- Py_DECREF (posobj );
21942210 return textiowrapper_build_cookie (& cookie );
21952211 }
21962212
21972213 chars_to_skip = self -> decoded_chars_used ;
21982214
2199- /* Starting from the snapshot position, we will walk the decoder
2200- * forward until it gives us enough decoded characters.
2201- */
2215+ /* Decoder state will be restored at the end */
22022216 saved_state = PyObject_CallMethodObjArgs (self -> decoder ,
22032217 _PyIO_str_getstate , NULL );
22042218 if (saved_state == NULL )
22052219 goto fail ;
22062220
2207- /* Note our initial start point. */
2208- if (_textiowrapper_decoder_setstate (self , & cookie ) < 0 )
2209- goto fail ;
2221+ #define DECODER_GETSTATE () do { \
2222+ PyObject *_state = PyObject_CallMethodObjArgs(self->decoder, \
2223+ _PyIO_str_getstate, NULL); \
2224+ if (_state == NULL) \
2225+ goto fail; \
2226+ if (!PyArg_Parse(_state, "(y#i)", &dec_buffer, &dec_buffer_len, &dec_flags)) { \
2227+ Py_DECREF(_state); \
2228+ goto fail; \
2229+ } \
2230+ Py_DECREF(_state); \
2231+ } while (0)
2232+
2233+ /* TODO: replace assert with exception */
2234+ #define DECODER_DECODE (start , len , res ) do { \
2235+ PyObject *_decoded = PyObject_CallMethod( \
2236+ self->decoder, "decode", "y#", start, len); \
2237+ if (_decoded == NULL) \
2238+ goto fail; \
2239+ assert (PyUnicode_Check(_decoded)); \
2240+ res = PyUnicode_GET_SIZE(_decoded); \
2241+ Py_DECREF(_decoded); \
2242+ } while (0)
2243+
2244+ /* Fast search for an acceptable start point, close to our
2245+ current pos */
2246+ skip_bytes = (Py_ssize_t ) (self -> b2cratio * chars_to_skip );
2247+ skip_back = 1 ;
2248+ assert (skip_back <= PyBytes_GET_SIZE (next_input ));
2249+ input = PyBytes_AS_STRING (next_input );
2250+ while (skip_bytes > 0 ) {
2251+ /* Decode up to tentative start point */
2252+ if (_textiowrapper_decoder_setstate (self , & cookie ) < 0 )
2253+ goto fail ;
2254+ DECODER_DECODE (input , skip_bytes , chars_decoded );
2255+ if (chars_decoded <= chars_to_skip ) {
2256+ DECODER_GETSTATE ();
2257+ if (dec_buffer_len == 0 ) {
2258+ /* Before pos and no bytes buffered in decoder => OK */
2259+ cookie .dec_flags = dec_flags ;
2260+ chars_to_skip -= chars_decoded ;
2261+ break ;
2262+ }
2263+ /* Skip back by buffered amount and reset heuristic */
2264+ skip_bytes -= dec_buffer_len ;
2265+ skip_back = 1 ;
2266+ }
2267+ else {
2268+ /* We're too far ahead, skip back a bit */
2269+ skip_bytes -= skip_back ;
2270+ skip_back *= 2 ;
2271+ }
2272+ }
2273+ if (skip_bytes <= 0 ) {
2274+ skip_bytes = 0 ;
2275+ if (_textiowrapper_decoder_setstate (self , & cookie ) < 0 )
2276+ goto fail ;
2277+ }
22102278
2211- /* Feed the decoder one byte at a time. As we go, note the
2212- * nearest "safe start point" before the current location
2213- * (a point where the decoder has nothing buffered, so seek()
2279+ /* Note our initial start point. */
2280+ cookie .start_pos += skip_bytes ;
2281+ cookie .chars_to_skip = chars_to_skip ;
2282+ if (chars_to_skip == 0 )
2283+ goto finally ;
2284+
2285+ /* We should be close to the desired position. Now feed the decoder one
2286+ * byte at a time until we reach the `chars_to_skip` target.
2287+ * As we go, note the nearest "safe start point" before the current
2288+ * location (a point where the decoder has nothing buffered, so seek()
22142289 * can safely start from there and advance to this location).
22152290 */
22162291 chars_decoded = 0 ;
22172292 input = PyBytes_AS_STRING (next_input );
22182293 input_end = input + PyBytes_GET_SIZE (next_input );
2294+ input += skip_bytes ;
22192295 while (input < input_end ) {
2220- PyObject * state ;
2221- char * dec_buffer ;
2222- Py_ssize_t dec_buffer_len ;
2223- int dec_flags ;
2224-
2225- PyObject * decoded = PyObject_CallMethod (
2226- self -> decoder , "decode" , "y#" , input , 1 );
2227- if (decoded == NULL )
2228- goto fail ;
2229- assert (PyUnicode_Check (decoded ));
2230- chars_decoded += PyUnicode_GET_SIZE (decoded );
2231- Py_DECREF (decoded );
2296+ Py_ssize_t n ;
22322297
2298+ DECODER_DECODE (input , 1 , n );
2299+ /* We got n chars for 1 byte */
2300+ chars_decoded += n ;
22332301 cookie .bytes_to_feed += 1 ;
2234-
2235- state = PyObject_CallMethodObjArgs (self -> decoder ,
2236- _PyIO_str_getstate , NULL );
2237- if (state == NULL )
2238- goto fail ;
2239- if (!PyArg_Parse (state , "(y#i)" , & dec_buffer , & dec_buffer_len , & dec_flags )) {
2240- Py_DECREF (state );
2241- goto fail ;
2242- }
2243- Py_DECREF (state );
2302+ DECODER_GETSTATE ();
22442303
22452304 if (dec_buffer_len == 0 && chars_decoded <= chars_to_skip ) {
22462305 /* Decoder buffer is empty, so this is a safe start point. */
@@ -2272,8 +2331,7 @@ textiowrapper_tell(textio *self, PyObject *args)
22722331 }
22732332 }
22742333
2275- /* finally */
2276- Py_XDECREF (posobj );
2334+ finally :
22772335 res = PyObject_CallMethod (self -> decoder , "setstate" , "(O)" , saved_state );
22782336 Py_DECREF (saved_state );
22792337 if (res == NULL )
@@ -2284,8 +2342,7 @@ textiowrapper_tell(textio *self, PyObject *args)
22842342 cookie .chars_to_skip = Py_SAFE_DOWNCAST (chars_to_skip , Py_ssize_t , int );
22852343 return textiowrapper_build_cookie (& cookie );
22862344
2287- fail :
2288- Py_XDECREF (posobj );
2345+ fail :
22892346 if (saved_state ) {
22902347 PyObject * type , * value , * traceback ;
22912348 PyErr_Fetch (& type , & value , & traceback );
0 commit comments