Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6c57318

Browse files
committed
Issue #16034: Fix performance regressions in the new BZ2File implementation.
Thanks to Victor Hooi for the bug report, and Serhiy Storchaka for the initial patch.
1 parent f23e2b6 commit 6c57318

2 files changed

Lines changed: 58 additions & 26 deletions

File tree

Lib/bz2.py

Lines changed: 55 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
7979
mode = "rb"
8080
mode_code = _MODE_READ
8181
self._decompressor = BZ2Decompressor()
82-
self._buffer = None
82+
self._buffer = b""
83+
self._buffer_offset = 0
8384
elif mode in ("w", "wb"):
8485
mode = "wb"
8586
mode_code = _MODE_WRITE
@@ -124,7 +125,8 @@ def close(self):
124125
self._fp = None
125126
self._closefp = False
126127
self._mode = _MODE_CLOSED
127-
self._buffer = None
128+
self._buffer = b""
129+
self._buffer_offset = 0
128130

129131
@property
130132
def closed(self):
@@ -174,16 +176,13 @@ def _check_can_seek(self):
174176

175177
# Fill the readahead buffer if it is empty. Returns False on EOF.
176178
def _fill_buffer(self):
179+
if self._mode == _MODE_READ_EOF:
180+
return False
177181
# Depending on the input data, our call to the decompressor may not
178182
# return any data. In this case, try again after reading another block.
179-
while True:
180-
if self._buffer:
181-
return True
182-
183-
if self._decompressor.unused_data:
184-
rawblock = self._decompressor.unused_data
185-
else:
186-
rawblock = self._fp.read(_BUFFER_SIZE)
183+
while self._buffer_offset == len(self._buffer):
184+
rawblock = (self._decompressor.unused_data or
185+
self._fp.read(_BUFFER_SIZE))
187186

188187
if not rawblock:
189188
if self._decompressor.eof:
@@ -199,30 +198,48 @@ def _fill_buffer(self):
199198
self._decompressor = BZ2Decompressor()
200199

201200
self._buffer = self._decompressor.decompress(rawblock)
201+
self._buffer_offset = 0
202+
return True
202203

203204
# Read data until EOF.
204205
# If return_data is false, consume the data without returning it.
205206
def _read_all(self, return_data=True):
207+
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
208+
self._buffer = self._buffer[self._buffer_offset:]
209+
self._buffer_offset = 0
210+
206211
blocks = []
207212
while self._fill_buffer():
208213
if return_data:
209214
blocks.append(self._buffer)
210215
self._pos += len(self._buffer)
211-
self._buffer = None
216+
self._buffer = b""
212217
if return_data:
213218
return b"".join(blocks)
214219

215220
# Read a block of up to n bytes.
216221
# If return_data is false, consume the data without returning it.
217222
def _read_block(self, n, return_data=True):
223+
# If we have enough data buffered, return immediately.
224+
end = self._buffer_offset + n
225+
if end <= len(self._buffer):
226+
data = self._buffer[self._buffer_offset : end]
227+
self._buffer_offset = end
228+
self._pos += len(data)
229+
return data
230+
231+
# The loop assumes that _buffer_offset is 0. Ensure that this is true.
232+
self._buffer = self._buffer[self._buffer_offset:]
233+
self._buffer_offset = 0
234+
218235
blocks = []
219236
while n > 0 and self._fill_buffer():
220237
if n < len(self._buffer):
221238
data = self._buffer[:n]
222-
self._buffer = self._buffer[n:]
239+
self._buffer_offset = n
223240
else:
224241
data = self._buffer
225-
self._buffer = None
242+
self._buffer = b""
226243
if return_data:
227244
blocks.append(data)
228245
self._pos += len(data)
@@ -238,9 +255,9 @@ def peek(self, n=0):
238255
"""
239256
with self._lock:
240257
self._check_can_read()
241-
if self._mode == _MODE_READ_EOF or not self._fill_buffer():
258+
if not self._fill_buffer():
242259
return b""
243-
return self._buffer
260+
return self._buffer[self._buffer_offset:]
244261

245262
def read(self, size=-1):
246263
"""Read up to size uncompressed bytes from the file.
@@ -250,7 +267,7 @@ def read(self, size=-1):
250267
"""
251268
with self._lock:
252269
self._check_can_read()
253-
if self._mode == _MODE_READ_EOF or size == 0:
270+
if size == 0:
254271
return b""
255272
elif size < 0:
256273
return self._read_all()
@@ -268,15 +285,19 @@ def read1(self, size=-1):
268285
# In this case we make multiple reads, to avoid returning b"".
269286
with self._lock:
270287
self._check_can_read()
271-
if (size == 0 or self._mode == _MODE_READ_EOF or
272-
not self._fill_buffer()):
288+
if (size == 0 or
289+
# Only call _fill_buffer() if the buffer is actually empty.
290+
# This gives a significant speedup if *size* is small.
291+
(self._buffer_offset == len(self._buffer) and not self._fill_buffer())):
273292
return b""
274-
if 0 < size < len(self._buffer):
275-
data = self._buffer[:size]
276-
self._buffer = self._buffer[size:]
293+
if size > 0:
294+
data = self._buffer[self._buffer_offset :
295+
self._buffer_offset + size]
296+
self._buffer_offset += len(data)
277297
else:
278-
data = self._buffer
279-
self._buffer = None
298+
data = self._buffer[self._buffer_offset:]
299+
self._buffer = b""
300+
self._buffer_offset = 0
280301
self._pos += len(data)
281302
return data
282303

@@ -299,6 +320,14 @@ def readline(self, size=-1):
299320
raise TypeError("Integer argument expected")
300321
size = size.__index__()
301322
with self._lock:
323+
# Shortcut for the common case - the whole line is in the buffer.
324+
if size < 0:
325+
end = self._buffer.find(b"\n", self._buffer_offset) + 1
326+
if end > 0:
327+
line = self._buffer[self._buffer_offset : end]
328+
self._buffer_offset = end
329+
self._pos += len(line)
330+
return line
302331
return io.BufferedIOBase.readline(self, size)
303332

304333
def readlines(self, size=-1):
@@ -345,7 +374,8 @@ def _rewind(self):
345374
self._mode = _MODE_READ
346375
self._pos = 0
347376
self._decompressor = BZ2Decompressor()
348-
self._buffer = None
377+
self._buffer = b""
378+
self._buffer_offset = 0
349379

350380
def seek(self, offset, whence=0):
351381
"""Change the file position.
@@ -385,8 +415,7 @@ def seek(self, offset, whence=0):
385415
offset -= self._pos
386416

387417
# Read and discard data until we reach the desired position.
388-
if self._mode != _MODE_READ_EOF:
389-
self._read_block(offset, return_data=False)
418+
self._read_block(offset, return_data=False)
390419

391420
return self._pos
392421

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ Core and Builtins
1313
Library
1414
-------
1515

16+
- Issue #16034: Fix performance regressions in the new BZ2File implementation.
17+
Initial patch by Serhiy Storchaka.
18+
1619
- Issue #15756: subprocess.poll() now properly handles errno.ECHILD to
1720
return a returncode of 0 when the child has already exited or cannot
1821
be waited on.

0 commit comments

Comments
 (0)