Support zip64 in zipimport

thatch · thatch · commit a836182b48ad · 2022-06-22T14:45:41.000-07:00
* Reads zip64 files as produced by zipfile
* Includes tests (somewhat slow, however, because of the need to create "large" zips)
* Is about as strict as zipfile when reading invalid zip files
diff --git a/Doc/library/zipimport.rst b/Doc/library/zipimport.rst
@@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for
 corresponding :file:`.pyc` file, meaning that if a ZIP archive
 doesn't contain :file:`.pyc` files, importing may be rather slow.
 
+.. versionchanged:: 3.12
+   ZIP64 is supported
+
 .. versionchanged:: 3.8
    Previously, ZIP archives with an archive comment were not supported.
 
diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py
@@ -776,6 +776,16 @@ def testLargestPossibleComment(self):
         files = {TESTMOD + ".py": (NOW, test_src)}
         self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1))
 
+    def testZip64(self):
+        # This is the simplest way to make zipfile generate the zip64 EOCD block
+        files = {f"f{n}.py": (NOW, test_src) for n in range(65537)}
+        self.doTest(".py", files, "f6")
+
+    def testZip64CruftAndComment(self):
+        # Same zip64-triggering archive as testZip64, plus a maximum-length archive comment
+        files = {f"f{n}.py": (NOW, test_src) for n in range(65537)}
+        self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))
+
 
 @support.requires_zlib()
 class CompressedZipImportTestCase(UncompressedZipImportTestCase):
diff --git a/Lib/zipimport.py b/Lib/zipimport.py
@@ -40,8 +40,14 @@ class ZipImportError(ImportError):
 _module_type = type(sys)
 
 END_CENTRAL_DIR_SIZE = 22
-STRING_END_ARCHIVE = b'PK\x05\x06'
+END_CENTRAL_DIR_SIZE_64 = 56
+END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
+STRING_END_ARCHIVE = b'PK\x05\x06'  # standard EOCD signature
+STRING_END_LOCATOR_64 = b'PK\x06\x07'  # Zip64 EOCD Locator signature
+STRING_END_ZIP_64 = b'PK\x06\x06'  # Zip64 EOCD signature
 MAX_COMMENT_LEN = (1 << 16) - 1
+MAX_UINT32 = 0xffffffff
+ZIP64_EXTRA_TAG = 0x1
 
 class zipimporter(_bootstrap_external._LoaderBasics):
     """zipimporter(archivepath) -> zipimporter object
@@ -406,49 +412,69 @@ def _read_directory(archive):
         raise ZipImportError(f"can't open Zip file: {archive!r}", path=archive)
 
     with fp:
+        # Check if there's a comment.
         try:
-            fp.seek(-END_CENTRAL_DIR_SIZE, 2)
-            header_position = fp.tell()
-            buffer = fp.read(END_CENTRAL_DIR_SIZE)
+            fp.seek(0, 2)
+            file_size = fp.tell()
         except OSError:
-            raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
-        if len(buffer) != END_CENTRAL_DIR_SIZE:
-            raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
-        if buffer[:4] != STRING_END_ARCHIVE:
-            # Bad: End of Central Dir signature
-            # Check if there's a comment.
-            try:
-                fp.seek(0, 2)
-                file_size = fp.tell()
-            except OSError:
-                raise ZipImportError(f"can't read Zip file: {archive!r}",
-                                     path=archive)
-            max_comment_start = max(file_size - MAX_COMMENT_LEN -
-                                    END_CENTRAL_DIR_SIZE, 0)
-            try:
-                fp.seek(max_comment_start)
-                data = fp.read()
-            except OSError:
-                raise ZipImportError(f"can't read Zip file: {archive!r}",
-                                     path=archive)
-            pos = data.rfind(STRING_END_ARCHIVE)
-            if pos < 0:
-                raise ZipImportError(f'not a Zip file: {archive!r}',
+            raise ZipImportError(f"can't read Zip file: {archive!r}",
+                                 path=archive)
+        max_comment_start = max(file_size - MAX_COMMENT_LEN -
+                                END_CENTRAL_DIR_SIZE - END_CENTRAL_DIR_SIZE_64 -
+                                END_CENTRAL_DIR_LOCATOR_SIZE_64, 0)
+        try:
+            fp.seek(max_comment_start)
+            data = fp.read()
+        except OSError:
+            raise ZipImportError(f"can't read Zip file: {archive!r}",
+                                 path=archive)
+        pos = data.rfind(STRING_END_ARCHIVE)
+        pos64 = data.rfind(STRING_END_ZIP_64)
+
+        if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
+            # Zip64 at "correct" offset from standard EOCD
+            buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
+            if len(buffer) != END_CENTRAL_DIR_SIZE_64:
+                raise ZipImportError(f"corrupt Zip64 file: {archive!r}",
                                      path=archive)
+            header_position = file_size - len(data) + pos64
+
+            central_directory_size = int.from_bytes(buffer[40:48], 'little')
+            central_directory_position = int.from_bytes(buffer[48:56], 'little')
+            num_entries = int.from_bytes(buffer[24:32], 'little')
+        elif pos >= 0:
             buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
             if len(buffer) != END_CENTRAL_DIR_SIZE:
                 raise ZipImportError(f"corrupt Zip file: {archive!r}",
                                      path=archive)
+
             header_position = file_size - len(data) + pos
 
-        header_size = _unpack_uint32(buffer[12:16])
-        header_offset = _unpack_uint32(buffer[16:20])
-        if header_position < header_size:
+            # Buffer now contains a valid EOCD, and header_position gives the
+            # starting position of it.
+            central_directory_size = _unpack_uint32(buffer[12:16])
+            central_directory_position = _unpack_uint32(buffer[16:20])
+            num_entries = _unpack_uint16(buffer[8:10])
+
+            # N.b. if someday you want to prefer the standard (non-zip64) EOCD,
+            # you need to adjust position by 76 for arc to be 0.
+        else:
+            raise ZipImportError(f'not a Zip file: {archive!r}',
+                                 path=archive)
+
+        # Buffer now contains a valid EOCD, and header_position gives the
+        # starting position of it.
+        # XXX: These are cursory checks but are not as exact or strict as they
+        # could be.  Checking the arc-adjusted value is probably good too.
+        if header_position < central_directory_size:
             raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
-        if header_position < header_offset:
+        if header_position < central_directory_position:
             raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
-        header_position -= header_size
-        arc_offset = header_position - header_offset
+        header_position -= central_directory_size
+        # On just-a-zipfile these values are the same and arc_offset is zero; if
+        # the file has some bytes prepended, `arc_offset` is the number of such
+        # bytes.  This is used for pex as well as self-extracting .exe.
+        arc_offset = header_position - central_directory_position
         if arc_offset < 0:
             raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)
 
@@ -465,6 +491,11 @@ def _read_directory(archive):
                 raise EOFError('EOF read where not expected')
             # Start of file header
             if buffer[:4] != b'PK\x01\x02':
+                if count != num_entries:
+                    raise ZipImportError(
+                        f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
+                        path=archive,
+                    )
                 break                                # Bad: Central Dir File Header
             if len(buffer) != 46:
                 raise EOFError('EOF read where not expected')
@@ -480,9 +511,6 @@ def _read_directory(archive):
             comment_size = _unpack_uint16(buffer[32:34])
             file_offset = _unpack_uint32(buffer[42:46])
             header_size = name_size + extra_size + comment_size
-            if file_offset > header_offset:
-                raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
-            file_offset += arc_offset
 
             try:
                 name = fp.read(name_size)
@@ -494,7 +522,10 @@ def _read_directory(archive):
             # slower than reading the data because fseek flushes stdio's
             # internal buffers.    See issue #8745.
             try:
-                if len(fp.read(header_size - name_size)) != header_size - name_size:
+                extra_data_len = header_size - name_size
+                extra_data = fp.read(extra_data_len)
+
+                if len(extra_data) != extra_data_len:
                     raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
             except OSError:
                 raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
@@ -511,6 +542,65 @@ def _read_directory(archive):
 
             name = name.replace('/', path_sep)
             path = _bootstrap_external._path_join(archive, name)
+
+            # Ordering matches unpacking below.
+            if (
+                file_size == MAX_UINT32 or
+                data_size == MAX_UINT32 or
+                file_offset == MAX_UINT32
+            ):
+                # need to decode extra_data looking for a zip64 extra (which might not
+                # be present)
+                while extra_data:
+                    if len(extra_data) < 4:
+                        raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                    tag = _unpack_uint16(extra_data[:2])
+                    size = _unpack_uint16(extra_data[2:4])
+                    if len(extra_data) < 4 + size:
+                        raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                    if tag == ZIP64_EXTRA_TAG:
+                        if (len(extra_data) - 4) % 8 != 0:
+                            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                        values = [
+                            int.from_bytes(extra_data[i:i+8], 'little')
+                            for i in range(4, len(extra_data), 8)
+                        ]
+
+                        # N.b. Here be dragons: the ordering of these is different than
+                        # the header fields, and it's really easy to get it wrong since
+                        # naturally-occurring zips that use all 3 are >4GB and not
+                        # something that would be checked-in.
+                        # The tests include a binary-edited zip that uses zip64
+                        # (unnecessarily) for all three.
+                        if file_size == MAX_UINT32:
+                            file_size = values.pop(0)
+                        if data_size == MAX_UINT32:
+                            data_size = values.pop(0)
+                        if file_offset == MAX_UINT32:
+                            file_offset = values.pop(0)
+
+                        if values:
+                            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+
+                        break
+
+                    # For a typical zip, this bytes-slicing only happens 2-3 times, on
+                    # small data like timestamps and filesizes.
+                    extra_data = extra_data[4+size:]
+                else:
+                    _bootstrap._verbose_message(
+                        "zipimport: suspected zip64 but no zip64 extra for {!r}",
+                        path,
+                    )
+            # XXX These two statements seem swapped because `header_offset` is a
+            # position within the actual file, but `file_offset` (when compared) is
+            # as encoded in the entry, not adjusted for this file.
+            # N.b. this must be after we've potentially read the zip64 extra which can
+            # change `file_offset`.
+            if file_offset > central_directory_position:
+                raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
+            file_offset += arc_offset
+
             t = (path, compress, data_size, file_size, file_offset, time, date, crc)
             files[name] = t
             count += 1
diff --git a/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst b/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst
@@ -0,0 +1 @@
+The ``zipimport`` module can now read ZIP64 files.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+The ``zipimport`` module can now read ZIP64 files.