
Commit eae7dad

gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:

+ There is now a `gzip.READ_BUFFER_SIZE` constant that is 128 KiB. Other programs that read in 128 KiB chunks: pigz and cat. So this seems to be best practice among good programs. It is also faster than 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. This is `_bz2.BZ2Decompressor` ported to zlib. Since the `zlib.Decompress` object is better for in-memory decompression, the `_ZlibDecompressor` is hidden. It only makes sense for file decompression, and that is already implemented in the gzip library, so there is no need to bother users with it (a usage sketch follows below).
+ The `_ZlibDecompressor` uses the older CPython `arrange_output_buffer` functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; this is instead handled by the `_ZlibDecompressor` itself, which has an internal buffer. `_add_read_data` has been inlined, as it was just two calls.
+ EDIT: While I am adding improvements anyway, I figured I could add another one-liner optimization to the `python -m gzip` application. It previously read chunks of `io.DEFAULT_BUFFER_SIZE`, but has been updated to use `READ_BUFFER_SIZE` chunks.
1 parent bb38b39 commit eae7dad
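`_ZlibDecompressor` is a private API, but the loop below is a minimal sketch of the `needs_input`/`max_length` protocol the new `_GzipReader.read` drives, as exercised by the tests in this commit. It assumes an ordinary zlib-wrapped stream for simplicity; the gzip reader itself constructs the decompressor with `wbits=-zlib.MAX_WBITS` for raw deflate data.

```python
import zlib

READ_BUFFER_SIZE = 128 * 1024  # the new gzip.READ_BUFFER_SIZE value

# Illustrative input: a plain zlib-wrapped stream.
payload = b"hello world " * 10_000
data = zlib.compress(payload)

d = zlib._ZlibDecompressor()
out = []
pos = 0
while not d.eof:
    if d.needs_input:
        # Feed a fresh 128 KiB chunk only when the internal buffer runs dry.
        chunk = data[pos:pos + READ_BUFFER_SIZE]
        pos += len(chunk)
        out.append(d.decompress(chunk, 8192))
    else:
        # Drain buffered output without reading any more input.
        out.append(d.decompress(b"", 8192))

assert b"".join(out) == payload
```

Unlike `zlib.decompressobj`, there is no `unconsumed_tail` to prepend back to the file: unconsumed input stays inside the decompressor's own buffer, which is what lets `GzipFile.read` drop that bookkeeping.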

File tree

5 files changed: +850 -80

Lib/gzip.py: +12 -12
@@ -21,6 +21,8 @@
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
+READ_BUFFER_SIZE = 128 * 1024
+
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
 
 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
@@ -494,12 +496,13 @@ def read(self, size=-1):
                 self._new_member = False
 
             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)
 
-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
@@ -510,14 +513,11 @@ def read(self, size=-1):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")
 
-        self._add_read_data( uncompress )
+        self._crc = zlib.crc32(uncompress, self._crc)
+        self._stream_size += len(uncompress)
        self._pos += len(uncompress)
        return uncompress
 
-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
@@ -647,7 +647,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
             while True:
-                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+                chunk = f.read(READ_BUFFER_SIZE)
                 if not chunk:
                     break
                 g.write(chunk)
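Because `_GzipReader.read` looks `READ_BUFFER_SIZE` up at module level, the constant can be patched to compare chunk sizes directly. The following is a rough, hypothetical micro-benchmark sketch (assuming a post-commit `gzip` module where the constant exists); absolute numbers will vary by machine and zlib build.

```python
import gzip
import io
import timeit

# ~8 MiB of compressible input; wrapping it in BytesIO keeps disk I/O
# out of the measurement.
blob = gzip.compress(b"ab" * (4 * 1024 * 1024))

def read_all(chunk_size):
    # _GzipReader.read resolves READ_BUFFER_SIZE at call time, so patching
    # the module constant changes the chunk size used for this read.
    gzip.READ_BUFFER_SIZE = chunk_size
    with gzip.GzipFile(fileobj=io.BytesIO(blob)) as f:
        while f.read(1024 * 1024):
            pass

old = gzip.READ_BUFFER_SIZE
try:
    for size in (8 * 1024, 128 * 1024):
        t = timeit.timeit(lambda: read_all(size), number=10)
        print(f"{size // 1024:>4} KiB chunks: {t:.3f}s")
finally:
    gzip.READ_BUFFER_SIZE = old  # restore the module default
```

The `python -m gzip` command-line tool benefits from the same constant, since its compression loop now also reads `READ_BUFFER_SIZE` chunks.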

Lib/test/test_zlib.py: +167 -0
@@ -944,6 +944,173 @@ def choose_lines(source, number, seed=None, generator=random):
 """
 
 
+class ZlibDecompressorTest():
+    # Test adopted from test_bz2.py
+    TEXT = HAMLET_SCENE
+    DATA = zlib.compress(HAMLET_SCENE)
+    BAD_DATA = b"Not a valid deflate block"
+
+    def test_Constructor(self):
+        self.assertRaises(TypeError, zlib._ZlibDecompressor, 42)
+
+    def testDecompress(self):
+        zlibd = zlib._ZlibDecompressor()
+        self.assertRaises(TypeError, zlibd.decompress)
+        text = zlibd.decompress(self.DATA)
+        self.assertEqual(text, self.TEXT)
+
+    def testDecompressChunks10(self):
+        zlibd = zlib._ZlibDecompressor()
+        text = b''
+        n = 0
+        while True:
+            str = self.DATA[n*10:(n+1)*10]
+            if not str:
+                break
+            text += zlibd.decompress(str)
+            n += 1
+        self.assertEqual(text, self.TEXT)
+
+    def testDecompressUnusedData(self):
+        zlibd = zlib._ZlibDecompressor()
+        unused_data = b"this is unused data"
+        text = zlibd.decompress(self.DATA+unused_data)
+        self.assertEqual(text, self.TEXT)
+        self.assertEqual(zlibd.unused_data, unused_data)
+
+    def testEOFError(self):
+        zlibd = zlib._ZlibDecompressor()
+        text = zlibd.decompress(self.DATA)
+        self.assertRaises(EOFError, zlibd.decompress, b"anything")
+        self.assertRaises(EOFError, zlibd.decompress, b"")
+
+    @support.skip_if_pgo_task
+    @bigmemtest(size=_4G + 100, memuse=3.3)
+    def testDecompress4G(self, size):
+        # "Test zlib._ZlibDecompressor.decompress() with >4GiB input"
+        blocksize = 10 * 1024 * 1024
+        block = random.randbytes(blocksize)
+        try:
+            data = block * (size // blocksize + 1)
+            compressed = zlib.compress(data)
+            zlibd = zlib._ZlibDecompressor()
+            decompressed = zlibd.decompress(compressed)
+            self.assertTrue(decompressed == data)
+        finally:
+            data = None
+            compressed = None
+            decompressed = None
+
+    def testPickle(self):
+        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+            with self.assertRaises(TypeError):
+                pickle.dumps(zlib._ZlibDecompressor(), proto)
+
+    def testDecompressorChunksMaxsize(self):
+        zlibd = zlib._ZlibDecompressor()
+        max_length = 100
+        out = []
+
+        # Feed some input
+        len_ = len(self.BIG_DATA) - 64
+        out.append(zlibd.decompress(self.BIG_DATA[:len_],
+                                    max_length=max_length))
+        self.assertFalse(zlibd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+
+        # Retrieve more data without providing more input
+        out.append(zlibd.decompress(b'', max_length=max_length))
+        self.assertFalse(zlibd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+
+        # Retrieve more data while providing more input
+        out.append(zlibd.decompress(self.BIG_DATA[len_:],
+                                    max_length=max_length))
+        self.assertLessEqual(len(out[-1]), max_length)
+
+        # Retrieve remaining uncompressed data
+        while not zlibd.eof:
+            out.append(zlibd.decompress(b'', max_length=max_length))
+            self.assertLessEqual(len(out[-1]), max_length)
+
+        out = b"".join(out)
+        self.assertEqual(out, self.BIG_TEXT)
+        self.assertEqual(zlibd.unused_data, b"")
+
+    def test_decompressor_inputbuf_1(self):
+        # Test reusing input buffer after moving existing
+        # contents to beginning
+        zlibd = zlib._ZlibDecompressor()
+        out = []
+
+        # Create input buffer and fill it
+        self.assertEqual(zlibd.decompress(self.DATA[:100],
+                                          max_length=0), b'')
+
+        # Retrieve some results, freeing capacity at beginning
+        # of input buffer
+        out.append(zlibd.decompress(b'', 2))
+
+        # Add more data that fits into input buffer after
+        # moving existing data to beginning
+        out.append(zlibd.decompress(self.DATA[100:105], 15))
+
+        # Decompress rest of data
+        out.append(zlibd.decompress(self.DATA[105:]))
+        self.assertEqual(b''.join(out), self.TEXT)
+
+    def test_decompressor_inputbuf_2(self):
+        # Test reusing input buffer by appending data at the
+        # end right away
+        zlibd = zlib._ZlibDecompressor()
+        out = []
+
+        # Create input buffer and empty it
+        self.assertEqual(zlibd.decompress(self.DATA[:200],
+                                          max_length=0), b'')
+        out.append(zlibd.decompress(b''))
+
+        # Fill buffer with new data
+        out.append(zlibd.decompress(self.DATA[200:280], 2))
+
+        # Append some more data, not enough to require resize
+        out.append(zlibd.decompress(self.DATA[280:300], 2))
+
+        # Decompress rest of data
+        out.append(zlibd.decompress(self.DATA[300:]))
+        self.assertEqual(b''.join(out), self.TEXT)
+
+    def test_decompressor_inputbuf_3(self):
+        # Test reusing input buffer after extending it
+
+        zlibd = zlib._ZlibDecompressor()
+        out = []
+
+        # Create almost full input buffer
+        out.append(zlibd.decompress(self.DATA[:200], 5))
+
+        # Add even more data to it, requiring resize
+        out.append(zlibd.decompress(self.DATA[200:300], 5))
+
+        # Decompress rest of data
+        out.append(zlibd.decompress(self.DATA[300:]))
+        self.assertEqual(b''.join(out), self.TEXT)
+
+    def test_failure(self):
+        zlibd = zlib._ZlibDecompressor()
+        self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
+        # Previously, a second call could crash due to internal inconsistency
+        self.assertRaises(Exception, zlibd.decompress, self.BAD_DATA * 30)
+
+    @support.refcount_test
+    def test_refleaks_in___init__(self):
+        gettotalrefcount = support.get_attribute(sys, 'gettotalrefcount')
+        zlibd = zlib._ZlibDecompressor()
+        refs_before = gettotalrefcount()
+        for i in range(100):
+            zlibd.__init__()
+        self.assertAlmostEqual(gettotalrefcount() - refs_before, 0, delta=10)
+
+
 class CustomInt:
     def __index__(self):
         return 100
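The `testDecompressUnusedData` case above is the property `_GzipReader` relies on for multi-member files: bytes past the end of one deflate stream land in `unused_data`, and the reader prepends them to the file object so the next member's header can be parsed. A small sketch of the resulting user-visible behavior (concatenated gzip members decompress as one stream):

```python
import gzip

# Two gzip members back to back; the reader finds the second member in
# the decompressor's unused_data and continues with it.
blob = gzip.compress(b"first,") + gzip.compress(b"second")
assert gzip.decompress(blob) == b"first,second"
```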
Misc/NEWS.d/… (news entry): +1 -0

@@ -0,0 +1 @@
+:meth:`gzip.GzipFile.read` reads 10% faster.

Modules/clinic/zlibmodule.c.h: +99 -1 (generated file; not rendered by default)
