Thanks to visit codestin.com
Credit goes to github.com

Skip to content

gh-89739: gh-77140: Support zip64 in zipimport #94146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Doc/library/zipimport.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for
corresponding :file:`.pyc` file, meaning that if a ZIP archive
doesn't contain :file:`.pyc` files, importing may be rather slow.

.. versionchanged:: 3.13
ZIP64 is supported

.. versionchanged:: 3.8
Previously, ZIP archives with an archive comment were not supported.

Expand Down
6 changes: 6 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,12 @@ xml.etree.ElementTree
:func:`~xml.etree.ElementTree.iterparse` for explicit cleaning up.
(Contributed by Serhiy Storchaka in :gh:`69893`.)

zipimport
---------

* Gains support for ZIP64 format files. Everybody loves huge code right?
(Contributed by Tim Hatch in :gh:`94146`.)


Optimizations
=============
Expand Down
5 changes: 5 additions & 0 deletions Lib/importlib/_bootstrap_external.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ def _pack_uint32(x):
return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little')


def _unpack_uint64(data):
"""Convert 8 bytes in little-endian to an integer."""
assert len(data) == 8
return int.from_bytes(data, 'little')

def _unpack_uint32(data):
"""Convert 4 bytes in little-endian to an integer."""
assert len(data) == 4
Expand Down
12 changes: 12 additions & 0 deletions Lib/test/test_zipimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ def makeZip(self, files, zipName=TEMP_ZIP, **kw):
f.write(stuff)
f.write(data)

def getZip64Files(self):
# This is the simplest way to make zipfile generate the zip64 EOCD block
return {f"f{n}.py": (NOW, test_src) for n in range(65537)}

def doTest(self, expected_ext, files, *modules, **kw):
self.makeZip(files, **kw)

Expand Down Expand Up @@ -798,6 +802,14 @@ def testLargestPossibleComment(self):
files = {TESTMOD + ".py": (NOW, test_src)}
self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1))

def testZip64(self):
files = self.getZip64Files()
self.doTest(".py", files, "f6")

def testZip64CruftAndComment(self):
files = self.getZip64Files()
self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))


@support.requires_zlib()
class CompressedZipImportTestCase(UncompressedZipImportTestCase):
Expand Down
166 changes: 127 additions & 39 deletions Lib/zipimport.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#from importlib import _bootstrap_external
#from importlib import _bootstrap # for _verbose_message
import _frozen_importlib_external as _bootstrap_external
from _frozen_importlib_external import _unpack_uint16, _unpack_uint32
from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64
import _frozen_importlib as _bootstrap # for _verbose_message
import _imp # for check_hash_based_pycs
import _io # for open
Expand All @@ -40,8 +40,14 @@ class ZipImportError(ImportError):
_module_type = type(sys)

END_CENTRAL_DIR_SIZE = 22
STRING_END_ARCHIVE = b'PK\x05\x06'
END_CENTRAL_DIR_SIZE_64 = 56
END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature
STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature
STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature
MAX_COMMENT_LEN = (1 << 16) - 1
MAX_UINT32 = 0xffffffff
ZIP64_EXTRA_TAG = 0x1

class zipimporter(_bootstrap_external._LoaderBasics):
"""zipimporter(archivepath) -> zipimporter object
Expand Down Expand Up @@ -356,49 +362,72 @@ def _read_directory(archive):
# to not cause problems when some runs 'python3 /dev/fd/9 9<some_script'
start_offset = fp.tell()
try:
# Check if there's a comment.
try:
fp.seek(-END_CENTRAL_DIR_SIZE, 2)
header_position = fp.tell()
buffer = fp.read(END_CENTRAL_DIR_SIZE)
fp.seek(0, 2)
file_size = fp.tell()
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
if len(buffer) != END_CENTRAL_DIR_SIZE:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
if buffer[:4] != STRING_END_ARCHIVE:
# Bad: End of Central Dir signature
# Check if there's a comment.
try:
fp.seek(0, 2)
file_size = fp.tell()
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}",
path=archive)
max_comment_start = max(file_size - MAX_COMMENT_LEN -
END_CENTRAL_DIR_SIZE, 0)
try:
fp.seek(max_comment_start)
data = fp.read()
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}",
path=archive)
pos = data.rfind(STRING_END_ARCHIVE)
if pos < 0:
raise ZipImportError(f'not a Zip file: {archive!r}',
path=archive)
raise ZipImportError(f"can't read Zip file: {archive!r}",
path=archive)
max_comment_plus_dirs_size = (
MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64)
max_comment_start = max(file_size - max_comment_plus_dirs_size, 0)
try:
fp.seek(max_comment_start)
data = fp.read(max_comment_plus_dirs_size)
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}",
path=archive)
pos = data.rfind(STRING_END_ARCHIVE)
pos64 = data.rfind(STRING_END_ZIP_64)

if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
# Zip64 at "correct" offset from standard EOCD
buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
if len(buffer) != END_CENTRAL_DIR_SIZE_64:
raise ZipImportError(
f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte "
f"zip64 central directory, but read {len(buffer)} bytes.",
path=archive)
header_position = file_size - len(data) + pos64

central_directory_size = _unpack_uint64(buffer[40:48])
central_directory_position = _unpack_uint64(buffer[48:56])
num_entries = _unpack_uint64(buffer[24:32])
elif pos >= 0:
buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
if len(buffer) != END_CENTRAL_DIR_SIZE:
raise ZipImportError(f"corrupt Zip file: {archive!r}",
path=archive)

header_position = file_size - len(data) + pos

header_size = _unpack_uint32(buffer[12:16])
header_offset = _unpack_uint32(buffer[16:20])
if header_position < header_size:
# Buffer now contains a valid EOCD, and header_position gives the
# starting position of it.
central_directory_size = _unpack_uint32(buffer[12:16])
central_directory_position = _unpack_uint32(buffer[16:20])
num_entries = _unpack_uint16(buffer[8:10])

# N.b. if someday you want to prefer the standard (non-zip64) EOCD,
# you need to adjust position by 76 for arc to be 0.
else:
raise ZipImportError(f'not a Zip file: {archive!r}',
path=archive)

# Buffer now contains a valid EOCD, and header_position gives the
# starting position of it.
# XXX: These are cursory checks but are not as exact or strict as they
# could be. Checking the arc-adjusted value is probably good too.
if header_position < central_directory_size:
raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
if header_position < header_offset:
if header_position < central_directory_position:
raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
header_position -= header_size
arc_offset = header_position - header_offset
header_position -= central_directory_size
# On just-a-zipfile these values are the same and arc_offset is zero; if
# the file has some bytes prepended, `arc_offset` is the number of such
# bytes. This is used for pex as well as self-extracting .exe.
arc_offset = header_position - central_directory_position
if arc_offset < 0:
raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)

Expand All @@ -415,6 +444,11 @@ def _read_directory(archive):
raise EOFError('EOF read where not expected')
# Start of file header
if buffer[:4] != b'PK\x01\x02':
if count != num_entries:
raise ZipImportError(
f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
path=archive,
)
break # Bad: Central Dir File Header
if len(buffer) != 46:
raise EOFError('EOF read where not expected')
Expand All @@ -430,9 +464,6 @@ def _read_directory(archive):
comment_size = _unpack_uint16(buffer[32:34])
file_offset = _unpack_uint32(buffer[42:46])
header_size = name_size + extra_size + comment_size
if file_offset > header_offset:
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
file_offset += arc_offset

try:
name = fp.read(name_size)
Expand All @@ -444,7 +475,10 @@ def _read_directory(archive):
# slower than reading the data because fseek flushes stdio's
# internal buffers. See issue #8745.
try:
if len(fp.read(header_size - name_size)) != header_size - name_size:
extra_data_len = header_size - name_size
extra_data = memoryview(fp.read(extra_data_len))

if len(extra_data) != extra_data_len:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
except OSError:
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
Expand All @@ -461,6 +495,60 @@ def _read_directory(archive):

name = name.replace('/', path_sep)
path = _bootstrap_external._path_join(archive, name)

# Ordering matches unpacking below.
if (
file_size == MAX_UINT32 or
data_size == MAX_UINT32 or
file_offset == MAX_UINT32
):
# need to decode extra_data looking for a zip64 extra (which might not
# be present)
while extra_data:
if len(extra_data) < 4:
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
tag = _unpack_uint16(extra_data[:2])
size = _unpack_uint16(extra_data[2:4])
if len(extra_data) < 4 + size:
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
if tag == ZIP64_EXTRA_TAG:
if (len(extra_data) - 4) % 8 != 0:
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
num_extra_values = (len(extra_data) - 4) // 8
if num_extra_values > 3:
raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
Copy link
Contributor

@jsirois jsirois Apr 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where does this symbol come from? I see no struct module import in this file. My Pex too-big-zip tests also find the following under 3.13.0a6:

Failed checking if argv[0] is an import path entry
Traceback (most recent call last):
  File "<frozen zipimport>", line 98, in __init__
  File "<frozen zipimport>", line 520, in _read_directory
NameError: name 'struct' is not defined. Did you forget to import 'struct'?
  File "/tmp/pytest-of-jsirois/pytest-10/test_check0/too-big.pyz", line 1
    PK-
SyntaxError: source code cannot contain null bytes

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A quick experiment seems to indicate just adding the import gets things working. I'll try my hand at sending up a patch.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch! too bad the tests didn't catch it. let me know if you need help with getting that PR going.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'm good, thank you. Filed #118107 to reference in the forthcoming PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix here hopefully: #118108

extra_data, offset=4)

# N.b. Here be dragons: the ordering of these is different than
# the header fields, and it's really easy to get it wrong since
# naturally-occuring zips that use all 3 are >4GB
if file_size == MAX_UINT32:
file_size = values.pop(0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These were problematic as well: values is a tuple; not a list. The new test caught this issue.

if data_size == MAX_UINT32:
data_size = values.pop(0)
if file_offset == MAX_UINT32:
file_offset = values.pop(0)

break

# For a typical zip, this bytes-slicing only happens 2-3 times, on
# small data like timestamps and filesizes.
extra_data = extra_data[4+size:]
else:
_bootstrap._verbose_message(
"zipimport: suspected zip64 but no zip64 extra for {!r}",
path,
)
# XXX These two statements seem swapped because `central_directory_position`
# is a position within the actual file, but `file_offset` (when compared) is
# as encoded in the entry, not adjusted for this file.
# N.b. this must be after we've potentially read the zip64 extra which can
# change `file_offset`.
if file_offset > central_directory_position:
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
file_offset += arc_offset

t = (path, compress, data_size, file_size, file_offset, time, date, crc)
files[name] = t
count += 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The :mod:`zipimport` module can now read ZIP64 files.