Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 55b4338

Browse files
committed
Issue #1625: BZ2File and bz2.decompress() now support multi-stream files.
Initial patch by Nir Aides.
1 parent c556e10 commit 55b4338

3 files changed

Lines changed: 161 additions & 15 deletions

File tree

Lib/bz2.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ def __init__(self, filename=None, mode="r", buffering=None,
7676
mode = "wb"
7777
mode_code = _MODE_WRITE
7878
self._compressor = BZ2Compressor()
79+
elif mode in ("a", "ab"):
80+
mode = "ab"
81+
mode_code = _MODE_WRITE
82+
self._compressor = BZ2Compressor()
7983
else:
8084
raise ValueError("Invalid mode: {!r}".format(mode))
8185

@@ -161,14 +165,25 @@ def _check_can_seek(self):
161165
def _fill_buffer(self):
162166
if self._buffer:
163167
return True
164-
if self._decompressor.eof:
165-
self._mode = _MODE_READ_EOF
166-
self._size = self._pos
167-
return False
168-
rawblock = self._fp.read(_BUFFER_SIZE)
168+
169+
if self._decompressor.unused_data:
170+
rawblock = self._decompressor.unused_data
171+
else:
172+
rawblock = self._fp.read(_BUFFER_SIZE)
173+
169174
if not rawblock:
170-
raise EOFError("Compressed file ended before the "
171-
"end-of-stream marker was reached")
175+
if self._decompressor.eof:
176+
self._mode = _MODE_READ_EOF
177+
self._size = self._pos
178+
return False
179+
else:
180+
raise EOFError("Compressed file ended before the "
181+
"end-of-stream marker was reached")
182+
183+
# Continue to next stream.
184+
if self._decompressor.eof:
185+
self._decompressor = BZ2Decompressor()
186+
172187
self._buffer = self._decompressor.decompress(rawblock)
173188
return True
174189

@@ -384,9 +399,15 @@ def decompress(data):
384399
"""
385400
if len(data) == 0:
386401
return b""
387-
decomp = BZ2Decompressor()
388-
result = decomp.decompress(data)
389-
if not decomp.eof:
390-
raise ValueError("Compressed data ended before the "
391-
"end-of-stream marker was reached")
392-
return result
402+
403+
result = b""
404+
while True:
405+
decomp = BZ2Decompressor()
406+
result += decomp.decompress(data)
407+
if not decomp.eof:
408+
raise ValueError("Compressed data ended before the "
409+
"end-of-stream marker was reached")
410+
if not decomp.unused_data:
411+
return result
412+
# There is unused data left over. Proceed to next stream.
413+
data = decomp.unused_data

Lib/test/test_bz2.py

Lines changed: 124 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ def getData(self, crlf=False):
8484
else:
8585
return self.DATA
8686

87-
def createTempFile(self, crlf=False):
87+
def createTempFile(self, crlf=False, streams=1):
8888
with open(self.filename, "wb") as f:
89-
f.write(self.getData(crlf))
89+
f.write(self.getData(crlf) * streams)
9090

9191
def testRead(self):
9292
# "Test BZ2File.read()"
@@ -95,6 +95,26 @@ def testRead(self):
9595
self.assertRaises(TypeError, bz2f.read, None)
9696
self.assertEqual(bz2f.read(), self.TEXT)
9797

98+
def testReadMultiStream(self):
99+
# "Test BZ2File.read() with a multi stream archive"
100+
self.createTempFile(streams=5)
101+
with BZ2File(self.filename) as bz2f:
102+
self.assertRaises(TypeError, bz2f.read, None)
103+
self.assertEqual(bz2f.read(), self.TEXT * 5)
104+
105+
def testReadMonkeyMultiStream(self):
106+
# "Test BZ2File.read() with a multi stream archive in which stream"
107+
# "end is aligned with internal buffer size"
108+
buffer_size = bz2._BUFFER_SIZE
109+
bz2._BUFFER_SIZE = len(self.DATA)
110+
try:
111+
self.createTempFile(streams=5)
112+
with BZ2File(self.filename) as bz2f:
113+
self.assertRaises(TypeError, bz2f.read, None)
114+
self.assertEqual(bz2f.read(), self.TEXT * 5)
115+
finally:
116+
bz2._BUFFER_SIZE = buffer_size
117+
98118
def testRead0(self):
99119
# "Test BZ2File.read(0)"
100120
self.createTempFile()
@@ -114,6 +134,18 @@ def testReadChunk10(self):
114134
text += str
115135
self.assertEqual(text, self.TEXT)
116136

137+
def testReadChunk10MultiStream(self):
138+
# "Test BZ2File.read() in chunks of 10 bytes with a multi stream archive"
139+
self.createTempFile(streams=5)
140+
with BZ2File(self.filename) as bz2f:
141+
text = b''
142+
while 1:
143+
str = bz2f.read(10)
144+
if not str:
145+
break
146+
text += str
147+
self.assertEqual(text, self.TEXT * 5)
148+
117149
def testRead100(self):
118150
# "Test BZ2File.read(100)"
119151
self.createTempFile()
@@ -151,6 +183,15 @@ def testReadLine(self):
151183
for line in sio.readlines():
152184
self.assertEqual(bz2f.readline(), line)
153185

186+
def testReadLineMultiStream(self):
187+
# "Test BZ2File.readline() with a multi stream archive"
188+
self.createTempFile(streams=5)
189+
with BZ2File(self.filename) as bz2f:
190+
self.assertRaises(TypeError, bz2f.readline, None)
191+
sio = BytesIO(self.TEXT * 5)
192+
for line in sio.readlines():
193+
self.assertEqual(bz2f.readline(), line)
194+
154195
def testReadLines(self):
155196
# "Test BZ2File.readlines()"
156197
self.createTempFile()
@@ -159,13 +200,28 @@ def testReadLines(self):
159200
sio = BytesIO(self.TEXT)
160201
self.assertEqual(bz2f.readlines(), sio.readlines())
161202

203+
def testReadLinesMultiStream(self):
204+
# "Test BZ2File.readlines() with a multi stream archive"
205+
self.createTempFile(streams=5)
206+
with BZ2File(self.filename) as bz2f:
207+
self.assertRaises(TypeError, bz2f.readlines, None)
208+
sio = BytesIO(self.TEXT * 5)
209+
self.assertEqual(bz2f.readlines(), sio.readlines())
210+
162211
def testIterator(self):
163212
# "Test iter(BZ2File)"
164213
self.createTempFile()
165214
with BZ2File(self.filename) as bz2f:
166215
sio = BytesIO(self.TEXT)
167216
self.assertEqual(list(iter(bz2f)), sio.readlines())
168217

218+
def testIteratorMultiStream(self):
219+
# "Test iter(BZ2File) with a multi stream archive"
220+
self.createTempFile(streams=5)
221+
with BZ2File(self.filename) as bz2f:
222+
sio = BytesIO(self.TEXT * 5)
223+
self.assertEqual(list(iter(bz2f)), sio.readlines())
224+
169225
def testClosedIteratorDeadlock(self):
170226
# "Test that iteration on a closed bz2file releases the lock."
171227
# http://bugs.python.org/issue3309
@@ -217,6 +273,17 @@ def testWriteMethodsOnReadOnlyFile(self):
217273
self.assertRaises(IOError, bz2f.write, b"a")
218274
self.assertRaises(IOError, bz2f.writelines, [b"a"])
219275

276+
def testAppend(self):
277+
# "Test BZ2File.write() in append mode"
278+
with BZ2File(self.filename, "w") as bz2f:
279+
self.assertRaises(TypeError, bz2f.write)
280+
bz2f.write(self.TEXT)
281+
with BZ2File(self.filename, "a") as bz2f:
282+
self.assertRaises(TypeError, bz2f.write)
283+
bz2f.write(self.TEXT)
284+
with open(self.filename, 'rb') as f:
285+
self.assertEqual(self.decompress(f.read()), self.TEXT * 2)
286+
220287
def testSeekForward(self):
221288
# "Test BZ2File.seek(150, 0)"
222289
self.createTempFile()
@@ -225,6 +292,14 @@ def testSeekForward(self):
225292
bz2f.seek(150)
226293
self.assertEqual(bz2f.read(), self.TEXT[150:])
227294

295+
def testSeekForwardMultiStream(self):
296+
# "Test BZ2File.seek(150, 0) across stream boundaries"
297+
self.createTempFile(streams=2)
298+
with BZ2File(self.filename) as bz2f:
299+
self.assertRaises(TypeError, bz2f.seek)
300+
bz2f.seek(len(self.TEXT) + 150)
301+
self.assertEqual(bz2f.read(), self.TEXT[150:])
302+
228303
def testSeekBackwards(self):
229304
# "Test BZ2File.seek(-150, 1)"
230305
self.createTempFile()
@@ -233,13 +308,30 @@ def testSeekBackwards(self):
233308
bz2f.seek(-150, 1)
234309
self.assertEqual(bz2f.read(), self.TEXT[500-150:])
235310

311+
def testSeekBackwardsMultiStream(self):
312+
# "Test BZ2File.seek(-150, 1) across stream boundaries"
313+
self.createTempFile(streams=2)
314+
with BZ2File(self.filename) as bz2f:
315+
readto = len(self.TEXT) + 100
316+
while readto > 0:
317+
readto -= len(bz2f.read(readto))
318+
bz2f.seek(-150, 1)
319+
self.assertEqual(bz2f.read(), self.TEXT[100-150:] + self.TEXT)
320+
236321
def testSeekBackwardsFromEnd(self):
237322
# "Test BZ2File.seek(-150, 2)"
238323
self.createTempFile()
239324
with BZ2File(self.filename) as bz2f:
240325
bz2f.seek(-150, 2)
241326
self.assertEqual(bz2f.read(), self.TEXT[len(self.TEXT)-150:])
242327

328+
def testSeekBackwardsFromEndMultiStream(self):
329+
# "Test BZ2File.seek(-1000, 2) across stream boundaries"
330+
self.createTempFile(streams=2)
331+
with BZ2File(self.filename) as bz2f:
332+
bz2f.seek(-1000, 2)
333+
self.assertEqual(bz2f.read(), (self.TEXT * 2)[-1000:])
334+
243335
def testSeekPostEnd(self):
244336
# "Test BZ2File.seek(150000)"
245337
self.createTempFile()
@@ -248,6 +340,14 @@ def testSeekPostEnd(self):
248340
self.assertEqual(bz2f.tell(), len(self.TEXT))
249341
self.assertEqual(bz2f.read(), b"")
250342

343+
def testSeekPostEndMultiStream(self):
344+
# "Test BZ2File.seek(150000) with a multi stream archive"
345+
self.createTempFile(streams=5)
346+
with BZ2File(self.filename) as bz2f:
347+
bz2f.seek(150000)
348+
self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
349+
self.assertEqual(bz2f.read(), b"")
350+
251351
def testSeekPostEndTwice(self):
252352
# "Test BZ2File.seek(150000) twice"
253353
self.createTempFile()
@@ -257,6 +357,15 @@ def testSeekPostEndTwice(self):
257357
self.assertEqual(bz2f.tell(), len(self.TEXT))
258358
self.assertEqual(bz2f.read(), b"")
259359

360+
def testSeekPostEndTwiceMultiStream(self):
361+
# "Test BZ2File.seek(150000) twice with a multi stream archive"
362+
self.createTempFile(streams=5)
363+
with BZ2File(self.filename) as bz2f:
364+
bz2f.seek(150000)
365+
bz2f.seek(150000)
366+
self.assertEqual(bz2f.tell(), len(self.TEXT) * 5)
367+
self.assertEqual(bz2f.read(), b"")
368+
260369
def testSeekPreStart(self):
261370
# "Test BZ2File.seek(-150, 0)"
262371
self.createTempFile()
@@ -265,6 +374,14 @@ def testSeekPreStart(self):
265374
self.assertEqual(bz2f.tell(), 0)
266375
self.assertEqual(bz2f.read(), self.TEXT)
267376

377+
def testSeekPreStartMultiStream(self):
378+
# "Test BZ2File.seek(-150, 0) with a multi stream archive"
379+
self.createTempFile(streams=2)
380+
with BZ2File(self.filename) as bz2f:
381+
bz2f.seek(-150)
382+
self.assertEqual(bz2f.tell(), 0)
383+
self.assertEqual(bz2f.read(), self.TEXT * 2)
384+
268385
def testFileno(self):
269386
# "Test BZ2File.fileno()"
270387
self.createTempFile()
@@ -510,6 +627,11 @@ def testDecompressIncomplete(self):
510627
# "Test decompress() function with incomplete data"
511628
self.assertRaises(ValueError, bz2.decompress, self.DATA[:-10])
512629

630+
def testDecompressMultiStream(self):
631+
# "Test decompress() function for data with multiple streams"
632+
text = bz2.decompress(self.DATA * 5)
633+
self.assertEqual(text, self.TEXT * 5)
634+
513635
def test_main():
514636
support.run_unittest(
515637
BZ2FileTest,

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ Core and Builtins
161161
Library
162162
-------
163163

164+
- Issue #1625: BZ2File and bz2.decompress() now support multi-stream files.
165+
Initial patch by Nir Aides.
166+
164167
- Issue #12175: BufferedReader.read(-1) now calls raw.readall() if available.
165168

166169
- Issue #12175: FileIO.readall() now only reads the file position and size

0 commit comments

Comments
 (0)