Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7b96984

Browse files
committed
Issue #1675951: Allow GzipFile to work with unseekable file objects.
Patch by Florian Festi.
1 parent dda7fdf commit 7b96984

5 files changed

Lines changed: 101 additions & 18 deletions

File tree

Doc/library/gzip.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ The module defines the following items:
7474
.. versionchanged:: 3.2
7575
Support for zero-padded files was added.
7676

77+
.. versionchanged:: 3.2
78+
Support for unseekable files was added.
79+
7780

7881
.. function:: open(filename, mode='rb', compresslevel=9)
7982

Lib/gzip.py

Lines changed: 73 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,62 @@ def open(filename, mode="rb", compresslevel=9):
4545
"""
4646
return GzipFile(filename, mode, compresslevel)
4747

48+
class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file.  Shouldn't be used outside of gzip.py, as it lacks
    essential functionality.

    Internal state:
      _buffer -- bytes that are served before reads reach the wrapped file
      _length -- len(_buffer)
      _read   -- offset of the next unread byte in _buffer, or None once
                 the buffer has been drained or discarded and every read
                 goes straight to the wrapped file
    """

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first.

        A negative or None *size* returns whatever is left in the buffer
        followed by the result of self.file.read(size).
        """
        if self._read is None:
            return self.file.read(size)
        start = self._read
        if size is not None and 0 <= size <= self._length - start:
            # The request is satisfied entirely from the buffer.
            self._read = start + size
            return self._buffer[start:self._read]
        # The request runs past the buffer: hand out the remainder and let
        # the wrapped file supply the rest.
        self._read = None
        tail = self._buffer[start:]
        if size is None or size < 0:
            # Without this guard a negative size would move the offset
            # backwards inside the buffer instead of reading everything.
            return tail + self.file.read(size)
        return tail + self.file.read(size - len(tail))

    def prepend(self, prepend=b'', readprevious=False):
        """Push *prepend* back so that later reads return it again.

        When *readprevious* is true and at least len(prepend) bytes of the
        buffer have already been consumed, the read offset is simply moved
        back by that many bytes; the bytes themselves are not compared.
        """
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            self._read -= len(prepend)
            return
        else:
            # Fixed: this used an undefined local name `read` (NameError).
            # NOTE(review): the new data is placed *after* the unread part
            # of the buffer, as in the original expression -- confirm that
            # this ordering is what callers expect.
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        """Return the part of the buffer that has not been read yet."""
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        """Seek in the combined stream; returns self.file.seek()'s result
        unless a relative seek could be served inside the buffer (None)."""
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                # The wrapped file sits *after* the unread buffered bytes,
                # so those bytes must be subtracted from the relative
                # offset (the original added them and overshot).
                offset -= self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Only called for attributes not found normally; delegate them to
        # the wrapped file.  Guard 'file' itself so that a lookup on an
        # instance without self.file cannot recurse forever.
        if name == 'file':
            raise AttributeError(name)
        # Fixed: arguments were swapped (getattr(name, self.file)).
        return getattr(self.file, name)
102+
103+
48104
class GzipFile(io.BufferedIOBase):
49105
"""The GzipFile class simulates most of the methods of a file object with
50106
the exception of the readinto() and truncate() methods.
@@ -119,6 +175,7 @@ def __init__(self, filename=None, mode=None,
119175
self.name = filename
120176
# Starts small, scales exponentially
121177
self.min_readsize = 100
178+
fileobj = _PaddedFile(fileobj)
122179

123180
elif mode[0:1] == 'w' or mode[0:1] == 'a':
124181
self.mode = WRITE
@@ -188,6 +245,9 @@ def _init_read(self):
188245

189246
def _read_gzip_header(self):
190247
magic = self.fileobj.read(2)
248+
if magic == b'':
249+
raise EOFError("Reached EOF")
250+
191251
if magic != b'\037\213':
192252
raise IOError('Not a gzipped file')
193253
method = ord( self.fileobj.read(1) )
@@ -219,6 +279,11 @@ def _read_gzip_header(self):
219279
if flag & FHCRC:
220280
self.fileobj.read(2) # Read & discard the 16-bit header CRC
221281

282+
unused = self.fileobj.unused()
283+
if unused:
284+
uncompress = self.decompress.decompress(unused)
285+
self._add_read_data(uncompress)
286+
222287
def write(self,data):
223288
if self.mode != WRITE:
224289
import errno
@@ -282,16 +347,6 @@ def _read(self, size=1024):
282347
if self._new_member:
283348
# If the _new_member flag is set, we have to
284349
# jump to the next member, if there is one.
285-
#
286-
# First, check if we're at the end of the file;
287-
# if so, it's time to stop; no more members to read.
288-
pos = self.fileobj.tell() # Save current position
289-
self.fileobj.seek(0, 2) # Seek to end of file
290-
if pos == self.fileobj.tell():
291-
raise EOFError("Reached EOF")
292-
else:
293-
self.fileobj.seek( pos ) # Return to original position
294-
295350
self._init_read()
296351
self._read_gzip_header()
297352
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
@@ -305,6 +360,9 @@ def _read(self, size=1024):
305360

306361
if buf == b"":
307362
uncompress = self.decompress.flush()
363+
# Prepend the already read bytes to the fileobj so they can be
364+
# seen by _read_eof()
365+
self.fileobj.prepend(self.decompress.unused_data, True)
308366
self._read_eof()
309367
self._add_read_data( uncompress )
310368
raise EOFError('Reached EOF')
@@ -316,10 +374,9 @@ def _read(self, size=1024):
316374
# Ending case: we've come to the end of a member in the file,
317375
# so seek back to the start of the unused data, finish up
318376
# this member, and read a new gzip header.
319-
# (The number of bytes to seek back is the length of the unused
320-
# data, minus 8 because _read_eof() will rewind a further 8 bytes)
321-
self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
322-
377+
# Prepend the already read bytes to the fileobj so they can be
378+
# seen by _read_eof() and _read_gzip_header()
379+
self.fileobj.prepend(self.decompress.unused_data, True)
323380
# Check the CRC and file size, and set the flag so we read
324381
# a new member on the next call
325382
self._read_eof()
@@ -334,12 +391,10 @@ def _add_read_data(self, data):
334391
self.size = self.size + len(data)
335392

336393
def _read_eof(self):
337-
# We've read to the end of the file, so we have to rewind in order
338-
# to reread the 8 bytes containing the CRC and the file size.
394+
# We've read to the end of the file
339395
# We check that the computed CRC and size of the
340396
# uncompressed data matches the stored values. Note that the size
341397
# stored is the true file size mod 2**32.
342-
self.fileobj.seek(-8, 1)
343398
crc32 = read32(self.fileobj)
344399
isize = read32(self.fileobj) # may exceed 2GB
345400
if crc32 != self.crc:
@@ -355,7 +410,7 @@ def _read_eof(self):
355410
while c == b"\x00":
356411
c = self.fileobj.read(1)
357412
if c:
358-
self.fileobj.seek(-1, 1)
413+
self.fileobj.prepend(c, True)
359414

360415
@property
361416
def closed(self):

Lib/test/test_gzip.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,17 @@
2222
"""
2323

2424

25+
class UnseekableIO(io.BytesIO):
    """BytesIO variant that reports itself as unseekable and raises
    io.UnsupportedOperation from seek() and tell()."""

    def seek(self, *args):
        # Repositioning is refused whatever arguments are given.
        raise io.UnsupportedOperation

    def tell(self):
        # Querying the position is refused as well.
        raise io.UnsupportedOperation

    def seekable(self):
        return False
34+
35+
2536
class TestGzip(unittest.TestCase):
2637
filename = support.TESTFN
2738

@@ -265,6 +276,16 @@ def test_zero_padded_file(self):
265276
d = f.read()
266277
self.assertEqual(d, data1 * 50, "Incorrect data in file")
267278

279+
def test_non_seekable_file(self):
    # Round-trip data through streams that refuse seek()/tell(): compress
    # into one unseekable buffer, then decompress from another.
    expected = data1 * 50
    sink = UnseekableIO()
    with gzip.GzipFile(fileobj=sink, mode="wb") as writer:
        writer.write(expected)
    source = UnseekableIO(sink.getvalue())
    with gzip.GzipFile(fileobj=source, mode="rb") as reader:
        self.assertEqual(reader.read(), expected)
288+
268289
# Testing compress/decompress shortcut functions
269290

270291
def test_compress(self):

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,7 @@ Bill Fancher
260260
Mark Favas
261261
Niels Ferguson
262262
Sebastian Fernandez
263+
Florian Festi
263264
Vincent Fiack
264265
Tomer Filiba
265266
Jeffrey Finkelstein

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ Core and Builtins
6262
Library
6363
-------
6464

65+
- Issue #1675951: Allow GzipFile to work with unseekable file objects.
66+
Patch by Florian Festi.
67+
6568
- Logging: Added QueueListener class to facilitate logging usage for
6669
performance-critical threads.
6770

0 commit comments

Comments
 (0)