Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Fix a bunch of zlib tests & update gzip.py to Python 3.13 #5563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
2 commits merged
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 68 additions & 62 deletions Lib/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@

# Gzip member header flag bits (RFC 1952 FLG field): text hint, header CRC16,
# extra field present, original filename present, comment present.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2
# NOTE(review): diff artifact — the numeric mode pair above is the old form
# being replaced by the string modes below in this change.
READ = 'rb'
WRITE = 'wb'

# Named zlib compression-level presets (0-9 scale).
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9

# Chunk size used when reading compressed input, and the size of the
# io.BufferedWriter wrapped around the write path.
READ_BUFFER_SIZE = 128 * 1024
_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
encoding=None, errors=None, newline=None):
Expand Down Expand Up @@ -118,6 +122,21 @@ class BadGzipFile(OSError):
"""Exception raised in some cases for invalid gzip files."""


class _WriteBufferStream(io.RawIOBase):
"""Minimal object to pass WriteBuffer flushes into GzipFile"""
def __init__(self, gzip_file):
self.gzip_file = gzip_file

def write(self, data):
return self.gzip_file._write_raw(data)

def seekable(self):
return False

def writable(self):
return True


class GzipFile(_compression.BaseStream):
"""The GzipFile class simulates most of the methods of a file object with
the exception of the truncate() method.
Expand Down Expand Up @@ -160,9 +179,10 @@ def __init__(self, filename=None, mode=None,
and 9 is slowest and produces the most compression. 0 is no compression
at all. The default is 9.

The mtime argument is an optional numeric timestamp to be written
to the last modification time field in the stream when compressing.
If omitted or None, the current time is used.
The optional mtime argument is the timestamp requested by gzip. The time
is in Unix format, i.e., seconds since 00:00:00 UTC, January 1, 1970.
If mtime is omitted or None, the current time is used. Use mtime = 0
to generate a compressed stream that does not depend on creation time.

"""

Expand All @@ -182,6 +202,7 @@ def __init__(self, filename=None, mode=None,
if mode is None:
mode = getattr(fileobj, 'mode', 'rb')


if mode.startswith('r'):
self.mode = READ
raw = _GzipReader(fileobj)
Expand All @@ -204,6 +225,9 @@ def __init__(self, filename=None, mode=None,
zlib.DEF_MEM_LEVEL,
0)
self._write_mtime = mtime
self._buffer_size = _WRITE_BUFFER_SIZE
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
buffer_size=self._buffer_size)
else:
raise ValueError("Invalid mode: {!r}".format(mode))

Expand All @@ -212,14 +236,6 @@ def __init__(self, filename=None, mode=None,
if self.mode == WRITE:
self._write_gzip_header(compresslevel)

@property
def filename(self):
import warnings
warnings.warn("use the name attribute", DeprecationWarning, 2)
if self.mode == WRITE and self.name[-3:] != ".gz":
return self.name + ".gz"
return self.name

@property
def mtime(self):
"""Last modification time read from stream, or None"""
Expand All @@ -237,6 +253,11 @@ def _init_write(self, filename):
self.bufsize = 0
self.offset = 0 # Current file offset for seek(), tell(), etc

def tell(self):
    """Return the current position in the uncompressed data stream."""
    self._check_not_closed()
    # Flush pending buffered writes first so the base class's offset
    # reflects everything written so far.
    self._buffer.flush()
    return super().tell()

def _write_gzip_header(self, compresslevel):
self.fileobj.write(b'\037\213') # magic header
self.fileobj.write(b'\010') # compression method
Expand Down Expand Up @@ -278,6 +299,10 @@ def write(self,data):
if self.fileobj is None:
raise ValueError("write() on closed GzipFile object")

return self._buffer.write(data)

def _write_raw(self, data):
# Called by our self._buffer underlying WriteBufferStream.
if isinstance(data, (bytes, bytearray)):
length = len(data)
else:
Expand Down Expand Up @@ -326,18 +351,19 @@ def closed(self):

def close(self):
fileobj = self.fileobj
if fileobj is None:
if fileobj is None or self._buffer.closed:
return
self.fileobj = None
try:
if self.mode == WRITE:
self._buffer.flush()
fileobj.write(self.compress.flush())
write32u(fileobj, self.crc)
# self.size may exceed 2 GiB, or even 4 GiB
write32u(fileobj, self.size & 0xffffffff)
elif self.mode == READ:
self._buffer.close()
finally:
self.fileobj = None
myfileobj = self.myfileobj
if myfileobj:
self.myfileobj = None
Expand All @@ -346,6 +372,7 @@ def close(self):
def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
    """Flush buffered data through the compressor to the underlying file.

    zlib_mode is forwarded to the compressor's flush() (Z_SYNC_FLUSH by
    default) and controls how much compressor state is emitted.
    """
    self._check_not_closed()
    if self.mode == WRITE:
        # Drain the Python-level write buffer into the compressor first,
        # then flush the compressor and the file object itself.
        self._buffer.flush()
        # Ensure the compressor's buffer is flushed
        self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()
Expand Down Expand Up @@ -376,6 +403,9 @@ def seekable(self):

def seek(self, offset, whence=io.SEEK_SET):
if self.mode == WRITE:
self._check_not_closed()
# Flush buffer to ensure validity of self.offset
self._buffer.flush()
if whence != io.SEEK_SET:
if whence == io.SEEK_CUR:
offset = self.offset + offset
Expand All @@ -384,10 +414,10 @@ def seek(self, offset, whence=io.SEEK_SET):
if offset < self.offset:
raise OSError('Negative seek in write mode')
count = offset - self.offset
chunk = b'\0' * 1024
for i in range(count // 1024):
chunk = b'\0' * self._buffer_size
for i in range(count // self._buffer_size):
self.write(chunk)
self.write(b'\0' * (count % 1024))
self.write(b'\0' * (count % self._buffer_size))
elif self.mode == READ:
self._check_not_closed()
return self._buffer.seek(offset, whence)
Expand Down Expand Up @@ -454,7 +484,7 @@ def _read_gzip_header(fp):

class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
wbits=-zlib.MAX_WBITS)
# Set flag indicating start of a new member
self._new_member = True
Expand Down Expand Up @@ -502,12 +532,13 @@ def read(self, size=-1):
self._new_member = False

# Read a chunk of data from the file
buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
if self._decompressor.needs_input:
buf = self._fp.read(READ_BUFFER_SIZE)
uncompress = self._decompressor.decompress(buf, size)
else:
uncompress = self._decompressor.decompress(b"", size)

uncompress = self._decompressor.decompress(buf, size)
if self._decompressor.unconsumed_tail != b"":
self._fp.prepend(self._decompressor.unconsumed_tail)
elif self._decompressor.unused_data != b"":
if self._decompressor.unused_data != b"":
# Prepend the already read bytes to the fileobj so they can
# be seen by _read_eof() and _read_gzip_header()
self._fp.prepend(self._decompressor.unused_data)
Expand All @@ -518,14 +549,11 @@ def read(self, size=-1):
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")

self._add_read_data( uncompress )
self._crc = zlib.crc32(uncompress, self._crc)
self._stream_size += len(uncompress)
self._pos += len(uncompress)
return uncompress

def _add_read_data(self, data):
self._crc = zlib.crc32(data, self._crc)
self._stream_size = self._stream_size + len(data)

def _read_eof(self):
# We've read to the end of the file
# We check that the computed CRC and size of the
Expand All @@ -552,43 +580,21 @@ def _rewind(self):
self._new_member = True


def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """Build a minimal gzip member header with no optional fields.

    :param compresslevel: Compression level, used only to pick the XFL byte.
    :param mtime: Modification time (anything convertible to a 32-bit int);
        defaults to the current time when omitted or None.
    :return: The 10-byte gzip header as bytes.
    """
    if mtime is None:
        mtime = time.time()
    # XFL hints at the effort used: 4 = fastest, 2 = maximum compression,
    # 0 = anything in between.
    if compresslevel == _COMPRESS_LEVEL_FAST:
        xfl = 4
    elif compresslevel == _COMPRESS_LEVEL_BEST:
        xfl = 2
    else:
        xfl = 0
    # ID1/ID2 magic, CM=8 (deflate), FLG=0 (no optional fields), MTIME,
    # XFL, and OS=255 (unknown).
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=0):
    """Compress data in one shot and return the compressed string.

    compresslevel sets the compression level in range of 0-9.
    mtime can be used to set the modification time.
    The modification time is set to 0 by default, for reproducibility.
    """
    # Wbits=31 tells zlib to emit a full gzip header and trailer itself.
    gzip_data = zlib.compress(data, level=compresslevel, wbits=31)
    if mtime is None:
        mtime = time.time()
    # Rebuild the 10-byte header from zlib's output, replacing the MTIME
    # field and forcing the OS byte to 255 (unknown) for consistency.
    # Keep only the first 4 bytes (magic, method, flags) explicitly rather
    # than relying on struct's implicit '4s' truncation of the whole blob.
    header = struct.pack("<4sLBB", gzip_data[:4], int(mtime),
                         gzip_data[8], 255)
    return header + gzip_data[10:]


def decompress(data):
Expand Down Expand Up @@ -655,7 +661,7 @@ def main():
f = builtins.open(arg, "rb")
g = open(arg + ".gz", "wb")
while True:
chunk = f.read(io.DEFAULT_BUFFER_SIZE)
chunk = f.read(READ_BUFFER_SIZE)
if not chunk:
break
g.write(chunk)
Expand Down
Loading
Loading