@@ -1179,20 +1179,19 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
11791179 self ._writenl = newline or os .linesep
11801180 self ._encoder = None
11811181 self ._decoder = None
1182- self ._decoded_text = "" # buffer for text produced by decoder
1183- self ._decoded_text_offset = 0 # offset to text returned by read()
1182+ self ._decoded_chars = '' # buffer for text returned from decoder
1183+ self ._decoded_chars_used = 0 # offset into _decoded_chars for read()
11841184 self ._snapshot = None # info for reconstructing decoder state
11851185 self ._seekable = self ._telling = self .buffer .seekable ()
11861186
1187- # A word about _snapshot. This attribute is either None, or a tuple
1188- # (decoder_state, next_input) where decoder_state is the second
1189- # (integer) item of the decoder state, and next_input is the chunk
1190- # of bytes that comes after the snapshot point in the input.
1191- # We use this to reconstruct intermediate decoder states in tell().
1187+ # self._snapshot is either None, or a tuple (dec_flags, next_input)
1188+ # where dec_flags is the second (integer) item of the decoder state
1189+ # and next_input is the chunk of input bytes that comes next after the
1190+ # snapshot point. We use this to reconstruct decoder states in tell().
11921191
11931192 # Naming convention:
1194- # - integer variables ending in "_bytes" count input bytes
1195- # - integer variables ending in "_chars" count decoded characters
1193+ # - "bytes_..." for integer variables that count input bytes
1194+ # - "chars_..." for integer variables that count decoded characters
11961195
11971196 def __repr__ (self ):
11981197 return '<TIOW %x>' % id (self )
@@ -1267,62 +1266,79 @@ def _get_decoder(self):
12671266 self ._decoder = decoder
12681267 return decoder
12691268
# The following three methods implement a small abstract data type (ADT)
# around _decoded_chars.  Text returned from the decoder is buffered there
# until the client requests it by calling our read() or readline() method.
def _set_decoded_chars(self, chars):
    """Set the _decoded_chars buffer.

    Stores *chars* as the new buffer of decoded text and resets the
    consumed-character offset to 0, so the whole of *chars* is pending.
    The _snapshot attribute is not touched here; callers that need to
    invalidate it must do so themselves.
    """
    self._decoded_chars = chars
    self._decoded_chars_used = 0
1276+
def _get_decoded_chars(self, n=None):
    """Advance into the _decoded_chars buffer.

    Returns the pending text starting at the current offset: everything
    that remains when *n* is None, otherwise the slice of at most *n*
    characters.  The offset is advanced by the length of the text
    actually returned, which may be shorter than *n* when the buffer
    runs out.
    """
    offset = self._decoded_chars_used
    if n is None:
        chars = self._decoded_chars[offset:]
    else:
        chars = self._decoded_chars[offset:offset + n]
    # Advance by what was really handed out, not by the requested count.
    self._decoded_chars_used += len(chars)
    return chars
1286+
def _rewind_decoded_chars(self, n):
    """Rewind the _decoded_chars buffer.

    Moves the consumed-character offset back by *n*, making the last
    *n* characters handed out pending again.

    Raises AssertionError if *n* is larger than the number of characters
    consumed so far.
    """
    if self._decoded_chars_used < n:
        raise AssertionError("rewind decoded_chars out of bounds")
    self._decoded_chars_used -= n
1292+
def _read_chunk(self):
    """
    Read and decode the next chunk of data from the BufferedReader.

    The return value is True unless EOF was reached.  The decoded string
    is placed in self._decoded_chars (replacing its previous value).
    The entire input chunk is sent to the decoder, though some of it
    may remain buffered in the decoder, yet to be converted.

    When self._telling is true, self._snapshot is also updated so that
    tell() can later reconstruct the decoder state.

    Raises ValueError if no decoder has been set up.
    """

    if self._decoder is None:
        raise ValueError("no decoder")

    if self._telling:
        # To prepare for tell(), we need to snapshot a point in the
        # file where the decoder's input buffer is empty.

        dec_buffer, dec_flags = self._decoder.getstate()
        # Given this, we know there was a valid snapshot point
        # len(dec_buffer) bytes ago with decoder state (b'', dec_flags).

    # Read a chunk, decode it, and put the result in self._decoded_chars.
    input_chunk = self.buffer.read1(self._CHUNK_SIZE)
    eof = not input_chunk  # an empty read is treated as end of file
    self._set_decoded_chars(self._decoder.decode(input_chunk, eof))

    if self._telling:
        # At the snapshot point, len(dec_buffer) bytes before the read,
        # the next input to be decoded is dec_buffer + input_chunk.
        self._snapshot = (dec_flags, dec_buffer + input_chunk)

    return not eof
13091325
def _pack_cookie(self, position, dec_flags=0,
                 bytes_to_feed=0, need_eof=0, chars_to_skip=0):
    """Pack the pieces of a tell() cookie into a single integer.

    The meaning of a tell() cookie is: seek to position, set the
    decoder flags to dec_flags, read bytes_to_feed bytes, feed them
    into the decoder with need_eof as the EOF flag, then skip
    chars_to_skip characters of the decoded result.  For most simple
    decoders, tell() will often just give a byte offset in the file.

    Layout (least significant bits first): position, dec_flags,
    bytes_to_feed and chars_to_skip each start on a 64-bit boundary
    (bits 0, 64, 128 and 192); need_eof is reduced to a single bit at
    position 256.  The values are not range-checked here, so each field
    must fit in 64 bits for _unpack_cookie() to recover it.
    """
    return (position | (dec_flags << 64) | (bytes_to_feed << 128) |
            (chars_to_skip << 192) | bool(need_eof) << 256)
13191335
def _unpack_cookie(self, bigint):
    """Split a tell() cookie back into its component fields.

    Peels successive 64-bit fields off the low end of *bigint* and
    returns the tuple
    (position, dec_flags, bytes_to_feed, need_eof, chars_to_skip).
    need_eof is whatever remains above the fourth 64-bit field.
    """
    rest, position = divmod(bigint, 1 << 64)
    rest, dec_flags = divmod(rest, 1 << 64)
    rest, bytes_to_feed = divmod(rest, 1 << 64)
    need_eof, chars_to_skip = divmod(rest, 1 << 64)
    return position, dec_flags, bytes_to_feed, need_eof, chars_to_skip
13261342
13271343 def tell (self ):
13281344 if not self ._seekable :
@@ -1333,7 +1349,7 @@ def tell(self):
13331349 position = self .buffer .tell ()
13341350 decoder = self ._decoder
13351351 if decoder is None or self ._snapshot is None :
1336- if self ._decoded_text :
1352+ if self ._decoded_chars :
13371353 # This should never happen.
13381354 raise AssertionError ("pending decoded text" )
13391355 return position
@@ -1342,51 +1358,48 @@ def tell(self):
13421358 dec_flags , next_input = self ._snapshot
13431359 position -= len (next_input )
13441360
1345- # How many decoded characters have been returned since the snapshot?
1346- skip_chars = self ._decoded_text_offset
1347- if skip_chars == 0 :
1361+ # How many decoded characters have been used up since the snapshot?
1362+ chars_to_skip = self ._decoded_chars_used
1363+ if chars_to_skip == 0 :
13481364 # We haven't moved from the snapshot point.
13491365 return self ._pack_cookie (position , dec_flags )
13501366
1351- # Walk the decoder forward, one byte at a time, to find the minimum
1352- # input necessary to give us the decoded characters we need to skip.
1353- # As we go, look for the "safe point" nearest to the current location
1354- # (i.e. a point where the decoder has nothing buffered, so we can
1355- # safely start from there when trying to return to this location).
1367+ # Starting from the snapshot position, we will walk the decoder
1368+ # forward until it gives us enough decoded characters.
13561369 saved_state = decoder .getstate ()
13571370 try :
1358- decoder .setstate ((b"" , dec_flags ))
1359- fed_bytes = 0
1360- decoded_chars = 0
1371+ # Note our initial start point.
1372+ decoder .setstate ((b'' , dec_flags ))
1373+ start_pos = position
1374+ start_flags , bytes_fed , chars_decoded = dec_flags , 0 , 0
13611375 need_eof = 0
1362- last_safe_point = (dec_flags , 0 , 0 )
13631376
1377+ # Feed the decoder one byte at a time. As we go, note the
1378+ # nearest "safe start point" before the current location
1379+ # (a point where the decoder has nothing buffered, so seek()
1380+ # can safely start from there and advance to this location).
13641381 next_byte = bytearray (1 )
13651382 for next_byte [0 ] in next_input :
1366- decoded = decoder .decode (next_byte )
1367- fed_bytes += 1
1368- decoded_chars += len (decoded )
1383+ bytes_fed += 1
1384+ chars_decoded += len (decoder .decode (next_byte ))
13691385 dec_buffer , dec_flags = decoder .getstate ()
1370- if not dec_buffer and decoded_chars <= skip_chars :
1371- # Decoder buffer is empty, so it's safe to start from here.
1372- last_safe_point = (dec_flags , fed_bytes , decoded_chars )
1373- if decoded_chars >= skip_chars :
1386+ if not dec_buffer and chars_decoded <= chars_to_skip :
1387+ # Decoder buffer is empty, so this is a safe start point.
1388+ start_pos += bytes_fed
1389+ chars_to_skip -= chars_decoded
1390+ start_flags , bytes_fed , chars_decoded = dec_flags , 0 , 0
1391+ if chars_decoded >= chars_to_skip :
13741392 break
13751393 else :
13761394 # We didn't get enough decoded data; signal EOF to get more.
1377- decoded = decoder .decode (b"" , final = True )
1378- decoded_chars += len (decoded )
1395+ chars_decoded += len (decoder .decode (b'' , final = True ))
13791396 need_eof = 1
1380- if decoded_chars < skip_chars :
1397+ if chars_decoded < chars_to_skip :
13811398 raise IOError ("can't reconstruct logical file position" )
13821399
1383- # Advance the starting position to the last safe point.
1384- dec_flags , safe_fed_bytes , safe_decoded_chars = last_safe_point
1385- position += safe_fed_bytes
1386- fed_bytes -= safe_fed_bytes
1387- skip_chars -= safe_decoded_chars
1400+ # The returned cookie corresponds to the last safe start point.
13881401 return self ._pack_cookie (
1389- position , dec_flags , fed_bytes , need_eof , skip_chars )
1402+ start_pos , start_flags , bytes_fed , need_eof , chars_to_skip )
13901403 finally :
13911404 decoder .setstate (saved_state )
13921405
@@ -1405,7 +1418,8 @@ def seek(self, cookie, whence=0):
14051418 raise IOError ("can't do nonzero end-relative seeks" )
14061419 self .flush ()
14071420 position = self .buffer .seek (0 , 2 )
1408- self ._clear_decoded_text ()
1421+ self ._set_decoded_chars ('' )
1422+ self ._snapshot = None
14091423 if self ._decoder :
14101424 self ._decoder .reset ()
14111425 return position
@@ -1416,71 +1430,54 @@ def seek(self, cookie, whence=0):
14161430 raise ValueError ("negative seek position %r" % (cookie ,))
14171431 self .flush ()
14181432
1419- # Seek back to the snapshot point.
1420- position , dec_flags , feed_bytes , need_eof , skip_chars = \
1433+ # The strategy of seek() is to go back to the safe start point
1434+ # and replay the effect of read(chars_to_skip) from there.
1435+ start_pos , dec_flags , bytes_to_feed , need_eof , chars_to_skip = \
14211436 self ._unpack_cookie (cookie )
1422- self .buffer .seek (position )
1423- self ._clear_decoded_text ()
14241437
1425- if self ._decoder or dec_flags or feed_bytes or need_eof :
1426- # Restore the decoder flags to their values from the snapshot.
1438+ # Seek back to the safe start point.
1439+ self .buffer .seek (start_pos )
1440+ self ._set_decoded_chars ('' )
1441+ self ._snapshot = None
1442+
1443+ # Restore the decoder to its state from the safe start point.
1444+ if self ._decoder or dec_flags or chars_to_skip :
14271445 self ._decoder = self ._decoder or self ._get_decoder ()
1428- self ._decoder .setstate ((b"" , dec_flags ))
1446+ self ._decoder .setstate ((b'' , dec_flags ))
14291447 self ._snapshot = (dec_flags , b'' )
14301448
1431- if feed_bytes or need_eof :
1432- # Feed feed_bytes bytes to the decoder.
1433- input_chunk = self .buffer .read (feed_bytes )
1434- self ._decoded_text = self ._decoder .decode (input_chunk , need_eof )
1435- if len (self ._decoded_text ) < skip_chars :
1436- raise IOError ("can't restore logical file position" )
1437-
1438- # Skip skip_chars of the decoded characters.
1439- self ._decoded_text_offset = skip_chars
1440-
1441- # Restore the snapshot.
1449+ if chars_to_skip :
1450+ # Just like _read_chunk, feed the decoder and save a snapshot.
1451+ input_chunk = self .buffer .read (bytes_to_feed )
1452+ self ._set_decoded_chars (
1453+ self ._decoder .decode (input_chunk , need_eof ))
14421454 self ._snapshot = (dec_flags , input_chunk )
1443- return cookie
1444-
1445- def _clear_decoded_text (self ):
1446- """Reset the _decoded_text buffer."""
1447- self ._decoded_text = ''
1448- self ._decoded_text_offset = 0
1449- self ._snapshot = None
14501455
1451- def _emit_decoded_text (self , n = None ):
1452- """Advance into the _decoded_text buffer."""
1453- offset = self ._decoded_text_offset
1454- if n is None :
1455- text = self ._decoded_text [offset :]
1456- else :
1457- text = self ._decoded_text [offset :offset + n ]
1458- self ._decoded_text_offset += len (text )
1459- return text
1456+ # Skip chars_to_skip of the decoded characters.
1457+ if len (self ._decoded_chars ) < chars_to_skip :
1458+ raise IOError ("can't restore logical file position" )
1459+ self ._decoded_chars_used = chars_to_skip
14601460
1461- def _unemit_decoded_text (self , n ):
1462- """Rewind the _decoded_text buffer."""
1463- if self ._decoded_text_offset < n :
1464- raise AssertionError ("unemit out of bounds" )
1465- self ._decoded_text_offset -= n
1461+ return cookie
14661462
def read(self, n=None):
    """Read and return decoded text.

    With *n* None or negative, returns all remaining text: whatever is
    still pending in the decoded-chars buffer followed by the decoded
    remainder of the underlying buffer.  The decoded-chars buffer and
    the tell() snapshot are then cleared.

    With *n* >= 0, returns at most *n* characters, reading and decoding
    further chunks until enough characters are available or EOF is hit.
    """
    if n is None:
        n = -1
    decoder = self._decoder or self._get_decoder()
    if n < 0:
        # Read everything.
        result = (self._get_decoded_chars() +
                  decoder.decode(self.buffer.read(), final=True))
        self._set_decoded_chars('')
        self._snapshot = None
        return result
    else:
        # Keep reading chunks until we have n characters to return.
        eof = False
        result = self._get_decoded_chars(n)
        while len(result) < n and not eof:
            eof = not self._read_chunk()
            result += self._get_decoded_chars(n - len(result))
        return result
14851482
14861483 def __next__ (self ):
@@ -1497,7 +1494,7 @@ def readline(self, limit=None):
14971494 limit = - 1
14981495
14991496 # Grab all the decoded text (we will rewind any extra bits later).
1500- line = self ._emit_decoded_text ()
1497+ line = self ._get_decoded_chars ()
15011498
15021499 start = 0
15031500 decoder = self ._decoder or self ._get_decoder ()
@@ -1558,20 +1555,21 @@ def readline(self, limit=None):
15581555 # No line ending seen yet - get more data
15591556 more_line = ''
15601557 while self ._read_chunk ():
1561- if self ._decoded_text :
1558+ if self ._decoded_chars :
15621559 break
1563- if self ._decoded_text :
1564- line += self ._emit_decoded_text ()
1560+ if self ._decoded_chars :
1561+ line += self ._get_decoded_chars ()
15651562 else :
15661563 # end of file
1567- self ._clear_decoded_text ()
1564+ self ._set_decoded_chars ('' )
1565+ self ._snapshot = None
15681566 return line
15691567
15701568 if limit >= 0 and endpos > limit :
15711569 endpos = limit # don't exceed limit
15721570
1573- # Rewind _decoded_text to just after the line ending we found.
1574- self ._unemit_decoded_text (len (line ) - endpos )
1571+ # Rewind _decoded_chars to just after the line ending we found.
1572+ self ._rewind_decoded_chars (len (line ) - endpos )
15751573 return line [:endpos ]
15761574
15771575 @property
0 commit comments