Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 593cd6b

Browse files
committed
Clean up the TextIOWrapper code; pick better names; improve documentation.
1 parent dbe28e5 commit 593cd6b

1 file changed

Lines changed: 121 additions & 123 deletions

File tree

Lib/io.py

Lines changed: 121 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,20 +1179,19 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
11791179
self._writenl = newline or os.linesep
11801180
self._encoder = None
11811181
self._decoder = None
1182-
self._decoded_text = "" # buffer for text produced by decoder
1183-
self._decoded_text_offset = 0 # offset to text returned by read()
1182+
self._decoded_chars = '' # buffer for text returned from decoder
1183+
self._decoded_chars_used = 0 # offset into _decoded_chars for read()
11841184
self._snapshot = None # info for reconstructing decoder state
11851185
self._seekable = self._telling = self.buffer.seekable()
11861186

1187-
# A word about _snapshot. This attribute is either None, or a tuple
1188-
# (decoder_state, next_input) where decoder_state is the second
1189-
# (integer) item of the decoder state, and next_input is the chunk
1190-
# of bytes that comes after the snapshot point in the input.
1191-
# We use this to reconstruct intermediate decoder states in tell().
1187+
# self._snapshot is either None, or a tuple (dec_flags, next_input)
1188+
# where dec_flags is the second (integer) item of the decoder state
1189+
# and next_input is the chunk of input bytes that comes next after the
1190+
# snapshot point. We use this to reconstruct decoder states in tell().
11921191

11931192
# Naming convention:
1194-
# - integer variables ending in "_bytes" count input bytes
1195-
# - integer variables ending in "_chars" count decoded characters
1193+
# - "bytes_..." for integer variables that count input bytes
1194+
# - "chars_..." for integer variables that count decoded characters
11961195

11971196
def __repr__(self):
11981197
return '<TIOW %x>' % id(self)
@@ -1267,62 +1266,79 @@ def _get_decoder(self):
12671266
self._decoder = decoder
12681267
return decoder
12691268

1269+
# The following three methods implement an ADT for _decoded_chars.
1270+
# Text returned from the decoder is buffered here until the client
1271+
# requests it by calling our read() or readline() method.
1272+
def _set_decoded_chars(self, chars):
1273+
"""Set the _decoded_chars buffer."""
# Replaces the buffer wholesale with `chars`; the previous contents are dropped.
1274+
self._decoded_chars = chars
1275+
self._decoded_chars_used = 0  # nothing from the new buffer has been handed out yet
1276+
1277+
def _get_decoded_chars(self, n=None):
1278+
"""Advance into the _decoded_chars buffer."""
# Returns the next unread slice of the buffer and marks it as used:
# everything that is left when n is None, otherwise at most n characters.
1279+
offset = self._decoded_chars_used
1280+
if n is None:
1281+
chars = self._decoded_chars[offset:]
1282+
else:
1283+
chars = self._decoded_chars[offset:offset + n]
# Advance by what was actually sliced off, which can be fewer than n
# characters when the buffer runs out first.
1284+
self._decoded_chars_used += len(chars)
1285+
return chars
1286+
1287+
def _rewind_decoded_chars(self, n):
1288+
"""Rewind the _decoded_chars buffer."""
# Moves the used-count back by n so that a later _get_decoded_chars() call
# returns those n characters again.
1289+
if self._decoded_chars_used < n:
1290+
raise AssertionError("rewind decoded_chars out of bounds")  # fewer than n characters have been used so far
1291+
self._decoded_chars_used -= n
1292+
12701293
def _read_chunk(self):
12711294
"""
12721295
Read and decode the next chunk of data from the BufferedReader.
12731296
12741297
The return value is True unless EOF was reached. The decoded string
1275-
is placed in self._decoded_text (replacing its previous value).
1276-
(The entire input chunk is sent to the decoder, though some of it
1277-
may remain buffered in the decoder, yet to be converted.)
1298+
is placed in self._decoded_chars (replacing its previous value).
1299+
The entire input chunk is sent to the decoder, though some of it
1300+
may remain buffered in the decoder, yet to be converted.
12781301
"""
12791302

12801303
if self._decoder is None:
12811304
raise ValueError("no decoder")
1282-
if not self._telling:
1283-
# No one should call tell(), so don't bother taking a snapshot.
1284-
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
1285-
eof = not input_chunk
1286-
self._decoded_text = self._decoder.decode(input_chunk, eof)
1287-
self._decoded_text_offset = 0
1288-
return not eof
1289-
1290-
# The cookie returned by tell() cannot include the contents of
1291-
# the decoder's buffer, so we need to snapshot a point in the
1292-
# input where the decoder has nothing in its input buffer.
1293-
1294-
dec_buffer, dec_flags = self._decoder.getstate()
1295-
# The state tuple returned by getstate() contains the decoder's
1296-
# input buffer and an integer representing any other state. Thus,
1297-
# there is a valid snapshot point len(decoder_buffer) bytes ago in
1298-
# the input, with the state tuple (b'', decoder_state).
12991305

1306+
if self._telling:
1307+
# To prepare for tell(), we need to snapshot a point in the
1308+
# file where the decoder's input buffer is empty.
1309+
1310+
dec_buffer, dec_flags = self._decoder.getstate()
1311+
# Given this, we know there was a valid snapshot point
1312+
# len(dec_buffer) bytes ago with decoder state (b'', dec_flags).
1313+
1314+
# Read a chunk, decode it, and put the result in self._decoded_chars.
13001315
input_chunk = self.buffer.read1(self._CHUNK_SIZE)
13011316
eof = not input_chunk
1302-
self._decoded_text = self._decoder.decode(input_chunk, eof)
1303-
self._decoded_text_offset = 0
1317+
self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
1318+
1319+
if self._telling:
1320+
# At the snapshot point, len(dec_buffer) bytes before the read,
1321+
# the next input to be decoded is dec_buffer + input_chunk.
1322+
self._snapshot = (dec_flags, dec_buffer + input_chunk)
13041323

1305-
# At the snapshot point, len(dec_buffer) bytes ago, the next input
1306-
# to be passed to the decoder is dec_buffer + input_chunk.
1307-
self._snapshot = (dec_flags, dec_buffer + input_chunk)
13081324
return not eof
13091325

13101326
def _pack_cookie(self, position, dec_flags=0,
1311-
feed_bytes=0, need_eof=0, skip_chars=0):
1327+
bytes_to_feed=0, need_eof=0, chars_to_skip=0):
13121328
# The meaning of a tell() cookie is: seek to position, set the
1313-
# decoder flags to dec_flags, read feed_bytes bytes, feed them
1329+
# decoder flags to dec_flags, read bytes_to_feed bytes, feed them
13141330
# into the decoder with need_eof as the EOF flag, then skip
1315-
# skip_chars characters of the decoded result. For most simple
1316-
# decoders, this should often just be the position.
1317-
return (position | (dec_flags<<64) | (feed_bytes<<128) |
1318-
(skip_chars<<192) | bool(need_eof)<<256)
1331+
# chars_to_skip characters of the decoded result. For most simple
1332+
# decoders, tell() will often just give a byte offset in the file.
1333+
return (position | (dec_flags<<64) | (bytes_to_feed<<128) |
1334+
(chars_to_skip<<192) | bool(need_eof)<<256)
13191335

13201336
def _unpack_cookie(self, bigint):
13211337
rest, position = divmod(bigint, 1<<64)
13221338
rest, dec_flags = divmod(rest, 1<<64)
1323-
rest, feed_bytes = divmod(rest, 1<<64)
1324-
need_eof, skip_chars = divmod(rest, 1<<64)
1325-
return position, dec_flags, feed_bytes, need_eof, skip_chars
1339+
rest, bytes_to_feed = divmod(rest, 1<<64)
1340+
need_eof, chars_to_skip = divmod(rest, 1<<64)
1341+
return position, dec_flags, bytes_to_feed, need_eof, chars_to_skip
13261342

13271343
def tell(self):
13281344
if not self._seekable:
@@ -1333,7 +1349,7 @@ def tell(self):
13331349
position = self.buffer.tell()
13341350
decoder = self._decoder
13351351
if decoder is None or self._snapshot is None:
1336-
if self._decoded_text:
1352+
if self._decoded_chars:
13371353
# This should never happen.
13381354
raise AssertionError("pending decoded text")
13391355
return position
@@ -1342,51 +1358,48 @@ def tell(self):
13421358
dec_flags, next_input = self._snapshot
13431359
position -= len(next_input)
13441360

1345-
# How many decoded characters have been returned since the snapshot?
1346-
skip_chars = self._decoded_text_offset
1347-
if skip_chars == 0:
1361+
# How many decoded characters have been used up since the snapshot?
1362+
chars_to_skip = self._decoded_chars_used
1363+
if chars_to_skip == 0:
13481364
# We haven't moved from the snapshot point.
13491365
return self._pack_cookie(position, dec_flags)
13501366

1351-
# Walk the decoder forward, one byte at a time, to find the minimum
1352-
# input necessary to give us the decoded characters we need to skip.
1353-
# As we go, look for the "safe point" nearest to the current location
1354-
# (i.e. a point where the decoder has nothing buffered, so we can
1355-
# safely start from there when trying to return to this location).
1367+
# Starting from the snapshot position, we will walk the decoder
1368+
# forward until it gives us enough decoded characters.
13561369
saved_state = decoder.getstate()
13571370
try:
1358-
decoder.setstate((b"", dec_flags))
1359-
fed_bytes = 0
1360-
decoded_chars = 0
1371+
# Note our initial start point.
1372+
decoder.setstate((b'', dec_flags))
1373+
start_pos = position
1374+
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
13611375
need_eof = 0
1362-
last_safe_point = (dec_flags, 0, 0)
13631376

1377+
# Feed the decoder one byte at a time. As we go, note the
1378+
# nearest "safe start point" before the current location
1379+
# (a point where the decoder has nothing buffered, so seek()
1380+
# can safely start from there and advance to this location).
13641381
next_byte = bytearray(1)
13651382
for next_byte[0] in next_input:
1366-
decoded = decoder.decode(next_byte)
1367-
fed_bytes += 1
1368-
decoded_chars += len(decoded)
1383+
bytes_fed += 1
1384+
chars_decoded += len(decoder.decode(next_byte))
13691385
dec_buffer, dec_flags = decoder.getstate()
1370-
if not dec_buffer and decoded_chars <= skip_chars:
1371-
# Decoder buffer is empty, so it's safe to start from here.
1372-
last_safe_point = (dec_flags, fed_bytes, decoded_chars)
1373-
if decoded_chars >= skip_chars:
1386+
if not dec_buffer and chars_decoded <= chars_to_skip:
1387+
# Decoder buffer is empty, so this is a safe start point.
1388+
start_pos += bytes_fed
1389+
chars_to_skip -= chars_decoded
1390+
start_flags, bytes_fed, chars_decoded = dec_flags, 0, 0
1391+
if chars_decoded >= chars_to_skip:
13741392
break
13751393
else:
13761394
# We didn't get enough decoded data; signal EOF to get more.
1377-
decoded = decoder.decode(b"", final=True)
1378-
decoded_chars += len(decoded)
1395+
chars_decoded += len(decoder.decode(b'', final=True))
13791396
need_eof = 1
1380-
if decoded_chars < skip_chars:
1397+
if chars_decoded < chars_to_skip:
13811398
raise IOError("can't reconstruct logical file position")
13821399

1383-
# Advance the starting position to the last safe point.
1384-
dec_flags, safe_fed_bytes, safe_decoded_chars = last_safe_point
1385-
position += safe_fed_bytes
1386-
fed_bytes -= safe_fed_bytes
1387-
skip_chars -= safe_decoded_chars
1400+
# The returned cookie corresponds to the last safe start point.
13881401
return self._pack_cookie(
1389-
position, dec_flags, fed_bytes, need_eof, skip_chars)
1402+
start_pos, start_flags, bytes_fed, need_eof, chars_to_skip)
13901403
finally:
13911404
decoder.setstate(saved_state)
13921405

@@ -1405,7 +1418,8 @@ def seek(self, cookie, whence=0):
14051418
raise IOError("can't do nonzero end-relative seeks")
14061419
self.flush()
14071420
position = self.buffer.seek(0, 2)
1408-
self._clear_decoded_text()
1421+
self._set_decoded_chars('')
1422+
self._snapshot = None
14091423
if self._decoder:
14101424
self._decoder.reset()
14111425
return position
@@ -1416,71 +1430,54 @@ def seek(self, cookie, whence=0):
14161430
raise ValueError("negative seek position %r" % (cookie,))
14171431
self.flush()
14181432

1419-
# Seek back to the snapshot point.
1420-
position, dec_flags, feed_bytes, need_eof, skip_chars = \
1433+
# The strategy of seek() is to go back to the safe start point
1434+
# and replay the effect of read(chars_to_skip) from there.
1435+
start_pos, dec_flags, bytes_to_feed, need_eof, chars_to_skip = \
14211436
self._unpack_cookie(cookie)
1422-
self.buffer.seek(position)
1423-
self._clear_decoded_text()
14241437

1425-
if self._decoder or dec_flags or feed_bytes or need_eof:
1426-
# Restore the decoder flags to their values from the snapshot.
1438+
# Seek back to the safe start point.
1439+
self.buffer.seek(start_pos)
1440+
self._set_decoded_chars('')
1441+
self._snapshot = None
1442+
1443+
# Restore the decoder to its state from the safe start point.
1444+
if self._decoder or dec_flags or chars_to_skip:
14271445
self._decoder = self._decoder or self._get_decoder()
1428-
self._decoder.setstate((b"", dec_flags))
1446+
self._decoder.setstate((b'', dec_flags))
14291447
self._snapshot = (dec_flags, b'')
14301448

1431-
if feed_bytes or need_eof:
1432-
# Feed feed_bytes bytes to the decoder.
1433-
input_chunk = self.buffer.read(feed_bytes)
1434-
self._decoded_text = self._decoder.decode(input_chunk, need_eof)
1435-
if len(self._decoded_text) < skip_chars:
1436-
raise IOError("can't restore logical file position")
1437-
1438-
# Skip skip_chars of the decoded characters.
1439-
self._decoded_text_offset = skip_chars
1440-
1441-
# Restore the snapshot.
1449+
if chars_to_skip:
1450+
# Just like _read_chunk, feed the decoder and save a snapshot.
1451+
input_chunk = self.buffer.read(bytes_to_feed)
1452+
self._set_decoded_chars(
1453+
self._decoder.decode(input_chunk, need_eof))
14421454
self._snapshot = (dec_flags, input_chunk)
1443-
return cookie
1444-
1445-
def _clear_decoded_text(self):
1446-
"""Reset the _decoded_text buffer."""
1447-
self._decoded_text = ''
1448-
self._decoded_text_offset = 0
1449-
self._snapshot = None
14501455

1451-
def _emit_decoded_text(self, n=None):
1452-
"""Advance into the _decoded_text buffer."""
1453-
offset = self._decoded_text_offset
1454-
if n is None:
1455-
text = self._decoded_text[offset:]
1456-
else:
1457-
text = self._decoded_text[offset:offset + n]
1458-
self._decoded_text_offset += len(text)
1459-
return text
1456+
# Skip chars_to_skip of the decoded characters.
1457+
if len(self._decoded_chars) < chars_to_skip:
1458+
raise IOError("can't restore logical file position")
1459+
self._decoded_chars_used = chars_to_skip
14601460

1461-
def _unemit_decoded_text(self, n):
1462-
"""Rewind the _decoded_text buffer."""
1463-
if self._decoded_text_offset < n:
1464-
raise AssertionError("unemit out of bounds")
1465-
self._decoded_text_offset -= n
1461+
return cookie
14661462

14671463
def read(self, n=None):
14681464
if n is None:
14691465
n = -1
14701466
decoder = self._decoder or self._get_decoder()
14711467
if n < 0:
14721468
# Read everything.
1473-
result = (self._emit_decoded_text() +
1469+
result = (self._get_decoded_chars() +
14741470
decoder.decode(self.buffer.read(), final=True))
1475-
self._clear_decoded_text()
1471+
self._set_decoded_chars('')
1472+
self._snapshot = None
14761473
return result
14771474
else:
14781475
# Keep reading chunks until we have n characters to return.
14791476
eof = False
1480-
result = self._emit_decoded_text(n)
1477+
result = self._get_decoded_chars(n)
14811478
while len(result) < n and not eof:
14821479
eof = not self._read_chunk()
1483-
result += self._emit_decoded_text(n - len(result))
1480+
result += self._get_decoded_chars(n - len(result))
14841481
return result
14851482

14861483
def __next__(self):
@@ -1497,7 +1494,7 @@ def readline(self, limit=None):
14971494
limit = -1
14981495

14991496
# Grab all the decoded text (we will rewind any extra bits later).
1500-
line = self._emit_decoded_text()
1497+
line = self._get_decoded_chars()
15011498

15021499
start = 0
15031500
decoder = self._decoder or self._get_decoder()
@@ -1558,20 +1555,21 @@ def readline(self, limit=None):
15581555
# No line ending seen yet - get more data
15591556
more_line = ''
15601557
while self._read_chunk():
1561-
if self._decoded_text:
1558+
if self._decoded_chars:
15621559
break
1563-
if self._decoded_text:
1564-
line += self._emit_decoded_text()
1560+
if self._decoded_chars:
1561+
line += self._get_decoded_chars()
15651562
else:
15661563
# end of file
1567-
self._clear_decoded_text()
1564+
self._set_decoded_chars('')
1565+
self._snapshot = None
15681566
return line
15691567

15701568
if limit >= 0 and endpos > limit:
15711569
endpos = limit # don't exceed limit
15721570

1573-
# Rewind _decoded_text to just after the line ending we found.
1574-
self._unemit_decoded_text(len(line) - endpos)
1571+
# Rewind _decoded_chars to just after the line ending we found.
1572+
self._rewind_decoded_chars(len(line) - endpos)
15751573
return line[:endpos]
15761574

15771575
@property

0 commit comments

Comments
 (0)