@@ -1180,14 +1180,14 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
11801180 self ._encoder = None
11811181 self ._decoder = None
11821182 self ._decoded_text = "" # buffer for text produced by decoder
1183- self ._decoded_text_offset = 0 # offset to text returned by read()
11841183 self ._snapshot = None # info for reconstructing decoder state
11851184 self ._seekable = self ._telling = self .buffer .seekable ()
11861185
11871186 # A word about _snapshot. This attribute is either None, or a tuple
1188- # (decoder_state, next_input) where decoder_state is the second
1189- # (integer) item of the decoder state, and next_input is the chunk
1190- # of bytes that comes after the snapshot point in the input.
1187+ # (decoder_state, input_chunk, decoded_chars) where decoder_state is
1188+ # the second (integer) item of the decoder state, input_chunk is the
1189+ # bytes fed to the decoder from the snapshot point onward, and
1190+ # decoded_chars is the number of characters decoded from those bytes.
11911191 # We use this to reconstruct intermediate decoder states in tell().
11921192
11931193 # Naming convention:
@@ -1271,10 +1271,10 @@ def _read_chunk(self):
12711271 """
12721272 Read and decode the next chunk of data from the BufferedReader.
12731273
1274- The return value is True unless EOF was reached. The decoded string
1275- is placed in self._decoded_text (replacing its previous value).
1276- (The entire input chunk is sent to the decoder, though some of it
1277- may remain buffered in the decoder, yet to be converted.)
1274+ Return a tuple of two elements: all the bytes that were read, and
1275+ the decoded string produced by the decoder. (The entire input
1276+ chunk is sent to the decoder, but some of it may remain buffered
1277+ in the decoder, yet to be converted.)
12781278 """
12791279
12801280 if self ._decoder is None :
@@ -1283,9 +1283,8 @@ def _read_chunk(self):
12831283 # No one should call tell(), so don't bother taking a snapshot.
12841284 input_chunk = self .buffer .read1 (self ._CHUNK_SIZE )
12851285 eof = not input_chunk
1286- self ._decoded_text = self ._decoder .decode (input_chunk , eof )
1287- self ._decoded_text_offset = 0
1288- return not eof
1286+ decoded = self ._decoder .decode (input_chunk , eof )
1287+ return (input_chunk , decoded )
12891288
12901289 # The cookie returned by tell() cannot include the contents of
12911290 # the decoder's buffer, so we need to snapshot a point in the
@@ -1299,15 +1298,16 @@ def _read_chunk(self):
12991298
13001299 input_chunk = self .buffer .read1 (self ._CHUNK_SIZE )
13011300 eof = not input_chunk
1302- self ._decoded_text = self ._decoder .decode (input_chunk , eof )
1303- self ._decoded_text_offset = 0
1301+ decoded = self ._decoder .decode (input_chunk , eof )
13041302
1305- # At the snapshot point, len(dec_buffer) bytes ago, the next input
1306- # to be passed to the decoder is dec_buffer + input_chunk.
1307- self ._snapshot = (dec_flags , dec_buffer + input_chunk )
1308- return not eof
1303+ # At the snapshot point, len(dec_buffer) bytes ago, the next input
1304+ # to be passed to the decoder is dec_buffer + input_chunk. Save
1305+ # len(decoded) so that later, tell() can figure out how much
1306+ # decoded data has been used up by TextIOWrapper.read().
1307+ self ._snapshot = (dec_flags , dec_buffer + input_chunk , len (decoded ))
1308+ return (input_chunk , decoded )
13091309
1310- def _pack_cookie (self , position , dec_flags = 0 ,
1310+ def _encode_tell_cookie (self , position , dec_flags = 0 ,
13111311 feed_bytes = 0 , need_eof = 0 , skip_chars = 0 ):
13121312 # The meaning of a tell() cookie is: seek to position, set the
13131313 # decoder flags to dec_flags, read feed_bytes bytes, feed them
@@ -1317,7 +1317,7 @@ def _pack_cookie(self, position, dec_flags=0,
13171317 return (position | (dec_flags << 64 ) | (feed_bytes << 128 ) |
13181318 (skip_chars << 192 ) | bool (need_eof )<< 256 )
13191319
1320- def _unpack_cookie (self , bigint ):
1320+ def _decode_tell_cookie (self , bigint ):
13211321 rest , position = divmod (bigint , 1 << 64 )
13221322 rest , dec_flags = divmod (rest , 1 << 64 )
13231323 rest , feed_bytes = divmod (rest , 1 << 64 )
@@ -1339,14 +1339,14 @@ def tell(self):
13391339 return position
13401340
13411341 # Skip backward to the snapshot point (see _read_chunk).
1342- dec_flags , next_input = self ._snapshot
1342+ dec_flags , next_input , decoded_chars = self ._snapshot
13431343 position -= len (next_input )
13441344
1345- # How many decoded characters have been returned since the snapshot?
1346- skip_chars = self ._decoded_text_offset
1345+ # How many decoded characters have been consumed since the snapshot?
1346+ skip_chars = decoded_chars - len ( self ._decoded_text )
13471347 if skip_chars == 0 :
13481348 # We haven't moved from the snapshot point.
1349- return self ._pack_cookie (position , dec_flags )
1349+ return self ._encode_tell_cookie (position , dec_flags )
13501350
13511351 # Walk the decoder forward, one byte at a time, to find the minimum
13521352 # input necessary to give us the decoded characters we need to skip.
@@ -1373,8 +1373,8 @@ def tell(self):
13731373 if decoded_chars >= skip_chars :
13741374 break
13751375 else :
1376- # We didn't get enough decoded data; signal EOF to get more.
1377- decoded = decoder .decode (b"" , final = True )
1376+ # We didn't get enough decoded data; send EOF to get more.
1377+ decoded = decoder .decode (b"" , True )
13781378 decoded_chars += len (decoded )
13791379 need_eof = 1
13801380 if decoded_chars < skip_chars :
@@ -1385,7 +1385,7 @@ def tell(self):
13851385 position += safe_fed_bytes
13861386 fed_bytes -= safe_fed_bytes
13871387 skip_chars -= safe_decoded_chars
1388- return self ._pack_cookie (
1388+ return self ._encode_tell_cookie (
13891389 position , dec_flags , fed_bytes , need_eof , skip_chars )
13901390 finally :
13911391 decoder .setstate (saved_state )
@@ -1405,7 +1405,8 @@ def seek(self, cookie, whence=0):
14051405 raise IOError ("can't do nonzero end-relative seeks" )
14061406 self .flush ()
14071407 position = self .buffer .seek (0 , 2 )
1408- self ._clear_decoded_text ()
1408+ self ._decoded_text = ""
1409+ self ._snapshot = None
14091410 if self ._decoder :
14101411 self ._decoder .reset ()
14111412 return position
@@ -1418,70 +1419,48 @@ def seek(self, cookie, whence=0):
14181419
14191420 # Seek back to the snapshot point.
14201421 position , dec_flags , feed_bytes , need_eof , skip_chars = \
1421- self ._unpack_cookie (cookie )
1422+ self ._decode_tell_cookie (cookie )
14221423 self .buffer .seek (position )
1423- self ._clear_decoded_text ()
1424+ self ._decoded_text = ""
1425+ self ._snapshot = None
14241426
14251427 if self ._decoder or dec_flags or feed_bytes or need_eof :
14261428 # Restore the decoder flags to their values from the snapshot.
14271429 self ._decoder = self ._decoder or self ._get_decoder ()
14281430 self ._decoder .setstate ((b"" , dec_flags ))
1429- self ._snapshot = (dec_flags , b'' )
14301431
14311432 if feed_bytes or need_eof :
14321433 # Feed feed_bytes bytes to the decoder.
14331434 input_chunk = self .buffer .read (feed_bytes )
1434- self . _decoded_text = self ._decoder .decode (input_chunk , need_eof )
1435- if len (self . _decoded_text ) < skip_chars :
1435+ decoded = self ._decoder .decode (input_chunk , need_eof )
1436+ if len (decoded ) < skip_chars :
14361437 raise IOError ("can't restore logical file position" )
14371438
14381439 # Skip skip_chars of the decoded characters.
1439- self ._decoded_text_offset = skip_chars
1440+ self ._decoded_text = decoded [ skip_chars :]
14401441
14411442 # Restore the snapshot.
1442- self ._snapshot = (dec_flags , input_chunk )
1443+ self ._snapshot = (dec_flags , input_chunk , len ( decoded ) )
14431444 return cookie
14441445
1445- def _clear_decoded_text (self ):
1446- """Reset the _decoded_text buffer."""
1447- self ._decoded_text = ''
1448- self ._decoded_text_offset = 0
1449- self ._snapshot = None
1450-
1451- def _emit_decoded_text (self , n = None ):
1452- """Advance into the _decoded_text buffer."""
1453- offset = self ._decoded_text_offset
1454- if n is None :
1455- text = self ._decoded_text [offset :]
1456- else :
1457- text = self ._decoded_text [offset :offset + n ]
1458- self ._decoded_text_offset += len (text )
1459- return text
1460-
1461- def _unemit_decoded_text (self , n ):
1462- """Rewind the _decoded_text buffer."""
1463- if self ._decoded_text_offset < n :
1464- raise AssertionError ("unemit out of bounds" )
1465- self ._decoded_text_offset -= n
1466-
14671446 def read (self , n = None ):
14681447 if n is None :
14691448 n = - 1
14701449 decoder = self ._decoder or self ._get_decoder ()
1450+ result = self ._decoded_text
14711451 if n < 0 :
1472- # Read everything.
1473- result = (self ._emit_decoded_text () +
1474- decoder .decode (self .buffer .read (), final = True ))
1475- self ._clear_decoded_text ()
1452+ result += decoder .decode (self .buffer .read (), True )
1453+ self ._decoded_text = ""
1454+ self ._snapshot = None
14761455 return result
14771456 else :
1478- # Keep reading chunks until we have n characters to return.
1479- eof = False
1480- result = self . _emit_decoded_text ( n )
1481- while len ( result ) < n and not eof :
1482- eof = not self . _read_chunk ()
1483- result += self ._emit_decoded_text ( n - len ( result ))
1484- return result
1457+ while len ( result ) < n :
1458+ input_chunk , decoded = self . _read_chunk ()
1459+ result += decoded
1460+ if not input_chunk :
1461+ break
1462+ self ._decoded_text = result [ n :]
1463+ return result [: n ]
14851464
14861465 def __next__ (self ):
14871466 self ._telling = False
@@ -1495,20 +1474,21 @@ def __next__(self):
14951474 def readline (self , limit = None ):
14961475 if limit is None :
14971476 limit = - 1
1477+ if limit >= 0 :
1478+ # XXX Hack to support limit argument, for backwards compatibility
1479+ line = self .readline ()
1480+ if len (line ) <= limit :
1481+ return line
1482+ line , self ._decoded_text = \
1483+ line [:limit ], line [limit :] + self ._decoded_text
1484+ return line
14981485
1499- # Grab all the decoded text (we will rewind any extra bits later).
1500- line = self ._emit_decoded_text ()
1501-
1486+ line = self ._decoded_text
15021487 start = 0
15031488 decoder = self ._decoder or self ._get_decoder ()
15041489
15051490 pos = endpos = None
15061491 while True :
1507- if limit >= 0 and len (line ) >= limit :
1508- # Length limit has been reached.
1509- endpos = limit
1510- break
1511-
15121492 if self ._readtranslate :
15131493 # Newlines are already translated, only search for \n
15141494 pos = line .find ('\n ' , start )
@@ -1558,18 +1538,20 @@ def readline(self, limit=None):
15581538
15591539 # No line ending seen yet - get more data
15601540 more_line = ''
1561- while self ._read_chunk ():
1562- if self ._decoded_text :
1541+ while True :
1542+ readahead , pending = self ._read_chunk ()
1543+ more_line = pending
1544+ if more_line or not readahead :
15631545 break
1564- if self . _decoded_text :
1565- line += self . _emit_decoded_text ()
1546+ if more_line :
1547+ line += more_line
15661548 else :
15671549 # end of file
1568- self ._clear_decoded_text ()
1550+ self ._decoded_text = ''
1551+ self ._snapshot = None
15691552 return line
15701553
1571- # Rewind _decoded_text to just after the line ending we found.
1572- self ._unemit_decoded_text (len (line ) - endpos )
1554+ self ._decoded_text = line [endpos :]
15731555 return line [:endpos ]
15741556
15751557 @property
0 commit comments