@@ -79,7 +79,8 @@ def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
7979 mode = "rb"
8080 mode_code = _MODE_READ
8181 self ._decompressor = BZ2Decompressor ()
82- self ._buffer = None
82+ self ._buffer = b""
83+ self ._buffer_offset = 0
8384 elif mode in ("w" , "wb" ):
8485 mode = "wb"
8586 mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@ def close(self):
124125 self ._fp = None
125126 self ._closefp = False
126127 self ._mode = _MODE_CLOSED
127- self ._buffer = None
128+ self ._buffer = b""
129+ self ._buffer_offset = 0
128130
129131 @property
130132 def closed (self ):
@@ -174,16 +176,13 @@ def _check_can_seek(self):
174176
175177 # Fill the readahead buffer if it is empty. Returns False on EOF.
176178 def _fill_buffer (self ):
179+ if self ._mode == _MODE_READ_EOF :
180+ return False
177181 # Depending on the input data, our call to the decompressor may not
178182 # return any data. In this case, try again after reading another block.
179- while True :
180- if self ._buffer :
181- return True
182-
183- if self ._decompressor .unused_data :
184- rawblock = self ._decompressor .unused_data
185- else :
186- rawblock = self ._fp .read (_BUFFER_SIZE )
183+ while self ._buffer_offset == len (self ._buffer ):
184+ rawblock = (self ._decompressor .unused_data or
185+ self ._fp .read (_BUFFER_SIZE ))
187186
188187 if not rawblock :
189188 if self ._decompressor .eof :
@@ -199,30 +198,48 @@ def _fill_buffer(self):
199198 self ._decompressor = BZ2Decompressor ()
200199
201200 self ._buffer = self ._decompressor .decompress (rawblock )
201+ self ._buffer_offset = 0
202+ return True
202203
203204 # Read data until EOF.
204205 # If return_data is false, consume the data without returning it.
205206 def _read_all (self , return_data = True ):
207+ # The loop assumes that _buffer_offset is 0. Ensure that this is true.
208+ self ._buffer = self ._buffer [self ._buffer_offset :]
209+ self ._buffer_offset = 0
210+
206211 blocks = []
207212 while self ._fill_buffer ():
208213 if return_data :
209214 blocks .append (self ._buffer )
210215 self ._pos += len (self ._buffer )
211- self ._buffer = None
216+ self ._buffer = b""
212217 if return_data :
213218 return b"" .join (blocks )
214219
215220 # Read a block of up to n bytes.
216221 # If return_data is false, consume the data without returning it.
217222 def _read_block (self , n , return_data = True ):
223+ # If we have enough data buffered, return immediately.
224+ end = self ._buffer_offset + n
225+ if end <= len (self ._buffer ):
226+ data = self ._buffer [self ._buffer_offset : end ]
227+ self ._buffer_offset = end
228+ self ._pos += len (data )
229+ return data
230+
231+ # The loop assumes that _buffer_offset is 0. Ensure that this is true.
232+ self ._buffer = self ._buffer [self ._buffer_offset :]
233+ self ._buffer_offset = 0
234+
218235 blocks = []
219236 while n > 0 and self ._fill_buffer ():
220237 if n < len (self ._buffer ):
221238 data = self ._buffer [:n ]
222- self ._buffer = self . _buffer [ n :]
239+ self ._buffer_offset = n
223240 else :
224241 data = self ._buffer
225- self ._buffer = None
242+ self ._buffer = b""
226243 if return_data :
227244 blocks .append (data )
228245 self ._pos += len (data )
@@ -238,9 +255,9 @@ def peek(self, n=0):
238255 """
239256 with self ._lock :
240257 self ._check_can_read ()
241- if self . _mode == _MODE_READ_EOF or not self ._fill_buffer ():
258+ if not self ._fill_buffer ():
242259 return b""
243- return self ._buffer
260+ return self ._buffer [ self . _buffer_offset :]
244261
245262 def read (self , size = - 1 ):
246263 """Read up to size uncompressed bytes from the file.
@@ -250,7 +267,7 @@ def read(self, size=-1):
250267 """
251268 with self ._lock :
252269 self ._check_can_read ()
253- if self . _mode == _MODE_READ_EOF or size == 0 :
270+ if size == 0 :
254271 return b""
255272 elif size < 0 :
256273 return self ._read_all ()
@@ -268,15 +285,19 @@ def read1(self, size=-1):
268285 # In this case we make multiple reads, to avoid returning b"".
269286 with self ._lock :
270287 self ._check_can_read ()
271- if (size == 0 or self ._mode == _MODE_READ_EOF or
272- not self ._fill_buffer ()):
288+ if (size == 0 or
289+ # Only call _fill_buffer() if the buffer is actually empty.
290+ # This gives a significant speedup if *size* is small.
291+ (self ._buffer_offset == len (self ._buffer ) and not self ._fill_buffer ())):
273292 return b""
274- if 0 < size < len (self ._buffer ):
275- data = self ._buffer [:size ]
276- self ._buffer = self ._buffer [size :]
293+ if size > 0 :
294+ data = self ._buffer [self ._buffer_offset :
295+ self ._buffer_offset + size ]
296+ self ._buffer_offset += len (data )
277297 else :
278- data = self ._buffer
279- self ._buffer = None
298+ data = self ._buffer [self ._buffer_offset :]
299+ self ._buffer = b""
300+ self ._buffer_offset = 0
280301 self ._pos += len (data )
281302 return data
282303
@@ -299,6 +320,14 @@ def readline(self, size=-1):
299320 raise TypeError ("Integer argument expected" )
300321 size = size .__index__ ()
301322 with self ._lock :
323+ # Shortcut for the common case - the whole line is in the buffer.
324+ if size < 0 :
325+ end = self ._buffer .find (b"\n " , self ._buffer_offset ) + 1
326+ if end > 0 :
327+ line = self ._buffer [self ._buffer_offset : end ]
328+ self ._buffer_offset = end
329+ self ._pos += len (line )
330+ return line
302331 return io .BufferedIOBase .readline (self , size )
303332
304333 def readlines (self , size = - 1 ):
@@ -345,7 +374,8 @@ def _rewind(self):
345374 self ._mode = _MODE_READ
346375 self ._pos = 0
347376 self ._decompressor = BZ2Decompressor ()
348- self ._buffer = None
377+ self ._buffer = b""
378+ self ._buffer_offset = 0
349379
350380 def seek (self , offset , whence = 0 ):
351381 """Change the file position.
@@ -385,8 +415,7 @@ def seek(self, offset, whence=0):
385415 offset -= self ._pos
386416
387417 # Read and discard data until we reach the desired position.
388- if self ._mode != _MODE_READ_EOF :
389- self ._read_block (offset , return_data = False )
418+ self ._read_block (offset , return_data = False )
390419
391420 return self ._pos
392421
0 commit comments