diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst
index 6a4fa67332e179..4f566168938056 100644
--- a/Doc/library/zipfile.rst
+++ b/Doc/library/zipfile.rst
@@ -518,6 +518,23 @@ ZipFile Objects
    .. versionadded:: 3.11
 
 
+.. method:: ZipFile.remove(zinfo_or_arcname)
+
+   Removes a member from the archive. *zinfo_or_arcname* is either the full
+   path of the member, or a :class:`ZipInfo` instance.
+
+   The archive must be opened with mode ``'a'``.
+
+   Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`.
+
+   .. note::
+
+      Removing a member in an archive may involve a move of many internal data
+      records, which can be I/O intensive for a large ZIP file.
+
+   .. versionadded:: next
+
+
 The following data attributes are also available:
 
 .. attribute:: ZipFile.filename
diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py
index ada96813709aea..0b457bda6b3347 100644
--- a/Lib/test/test_zipfile/test_core.py
+++ b/Lib/test/test_zipfile/test_core.py
@@ -1360,6 +1360,230 @@ class LzmaWriterTests(AbstractWriterTests, unittest.TestCase):
 class ZstdWriterTests(AbstractWriterTests, unittest.TestCase):
     compression = zipfile.ZIP_ZSTANDARD
 
+class AbstractRemoveTests:
+
+    def _test_removing_members(self, test_files, indexes, force_zip64=False):
+        """Test underlying _remove_members() for removing members at given
+        indexes."""
+        # calculate the expected results
+        expected_files = []
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            for i, (file, data) in enumerate(test_files):
+                if i not in indexes:
+                    with zh.open(file, 'w', force_zip64=force_zip64) as fh:
+                        fh.write(data)
+                    expected_files.append(file)
+        expected_size = os.path.getsize(TESTFN)
+
+        # prepare the test zip
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            for file, data in test_files:
+                with zh.open(file, 'w', force_zip64=force_zip64) as fh:
+                    fh.write(data)
+
+        # do the removal and check the result
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            members = {zh.infolist()[i] for i in indexes}
+            zh._remove_members(members)
+
+            # make sure internal caches have reflected the change
+            # and are consistent
+            self.assertEqual(zh.namelist(), expected_files)
+            for file, _ in test_files:
+                if file in zh.namelist():
+                    self.assertEqual(zh.getinfo(file).filename, file)
+                else:
+                    with self.assertRaises(KeyError):
+                        zh.getinfo(file)
+
+            self.assertIsNone(zh.testzip())
+        self.assertEqual(os.path.getsize(TESTFN), expected_size)
+
+    def _test_removing_combinations(self, test_files, n=None):
+        """Test underlying _remove_members() for removing random combinations
+        of members."""
+        ln = len(test_files)
+        for n in (range(1, ln + 1) if n is None else (n,)):
+            for indexes in itertools.combinations(range(ln), n):
+                with self.subTest(remove=indexes):
+                    self._test_removing_members(test_files, indexes)
+
+    def test_basic(self):
+        """Test underlying _remove_members() for removing random combinations
+        of members."""
+        test_files = [
+            ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'),
+            ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'),
+            ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'),
+        ]
+
+        self._test_removing_combinations(test_files)
+
+    def test_duplicated_arcname(self):
+        """Test underlying _remove_members() for removing any one of random
+        duplicated members."""
+        import warnings
+
+        dupl_file = 'file.txt'
+        test_files = [
+            ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'),
+            ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'),
+            ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'),
+        ]
+
+        ln = len(test_files)
+        for n in range(2, ln + 1):
+            for dups in itertools.combinations(range(ln), n):
+                files = []
+                for i, (file, data) in enumerate(test_files):
+                    file_ = dupl_file if i in dups else file
+                    files.append((file_, data))
+
+                for index in dups:
+                    indexes = [index]
+                    with self.subTest(dups=dups, remove=indexes):
+                        with warnings.catch_warnings():
+                            warnings.simplefilter("ignore")
+                            self._test_removing_members(files, indexes)
+
+    def test_non_physical(self):
+        """Test underlying _remove_members() for non-physical removing."""
+        test_files = [
+            ('file0.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'),
+            ('file1.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'),
+            ('file2.txt', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'),
+        ]
+
+        ln = len(test_files)
+        for n in range(1, ln + 1):
+            for indexes in itertools.combinations(range(ln), n):
+                with self.subTest(remove=indexes):
+                    # prepare the test zip
+                    expected = {}
+                    with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+                        for i, (file, data) in enumerate(test_files):
+                            zh.writestr(file, data)
+                            if i not in indexes:
+                                expected[file] = zh.getinfo(file).header_offset
+
+                    # do the removal and check the result
+                    with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+                        members = {zh.infolist()[i] for i in indexes}
+                        zh._remove_members(members, remove_physical=False)
+                        self.assertEqual(zh.namelist(), list(expected))
+                        for file, offset in expected.items():
+                            self.assertEqual(zh.getinfo(file).header_offset, offset)
+                        self.assertIsNone(zh.testzip())
+
+    def test_verify(self):
+        """Test if params are passed to underlying _remove_members() correctly,
+        or never passed if conditions not met."""
+        file0 = 'file0.txt'
+        file = 'datafile.txt'
+        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
+
+        # closed: error and do nothing
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            zh.writestr(file, data)
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            zh.close()
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with self.assertRaises(ValueError):
+                    zh.remove(file)
+                mock_fn.assert_not_called()
+
+        # writing: error and do nothing
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            zh.writestr(file, data)
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with zh.open(file0, 'w') as fh:
+                    with self.assertRaises(ValueError):
+                        zh.remove(file)
+                mock_fn.assert_not_called()
+
+        # mode 'r': error and do nothing
+        with zipfile.ZipFile(TESTFN, 'r', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with self.assertRaises(ValueError):
+                    zh.remove(file)
+                mock_fn.assert_not_called()
+
+        # mode 'w': error and do nothing
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            zh.writestr(file, data)
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with self.assertRaises(ValueError):
+                    zh.remove(file)
+                mock_fn.assert_not_called()
+
+        # mode 'x': error and do nothing
+        os.remove(TESTFN)
+        with zipfile.ZipFile(TESTFN, 'x', self.compression) as zh:
+            zh.writestr(file, data)
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with self.assertRaises(ValueError):
+                    zh.remove(file)
+                mock_fn.assert_not_called()
+
+        # mode 'a': the most general use case
+        with zipfile.ZipFile(TESTFN, 'w', self.compression) as zh:
+            zh.writestr(file, data)
+        # -- remove with arcname
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                zh.remove(file)
+                mock_fn.assert_called_once_with({zh.getinfo(file)})
+        # -- remove with zinfo
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                zinfo = zh.getinfo(file)
+                zh.remove(zinfo)
+                mock_fn.assert_called_once_with({zinfo})
+        # -- remove with nonexist arcname
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                with self.assertRaises(KeyError):
+                    zh.remove('nonexist.file')
+                mock_fn.assert_not_called()
+        # -- remove with nonexist zinfo (even if same name)
+        with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
+            with mock.patch('zipfile.ZipFile._remove_members') as mock_fn:
+                zinfo = zipfile.ZipInfo(file)
+                with self.assertRaises(KeyError):
+                    zh.remove(zinfo)
+                mock_fn.assert_not_called()
+
+    def test_zip64(self):
+        """Test if members use zip64."""
+        test_files = [
+            ('pre.txt', b'Lorem ipsum dolor sit amet, consectetur adipiscing elit'),
+            ('datafile', b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'),
+            ('post.txt', b'Duis aute irure dolor in reprehenderit in voluptate velit esse'),
+        ]
+
+        self._test_removing_members(test_files, [1], force_zip64=True)
+
+class StoredRemoveTests(AbstractRemoveTests, unittest.TestCase):
+    compression = zipfile.ZIP_STORED
+
+@requires_zlib()
+class DeflateRemoveTests(AbstractRemoveTests, unittest.TestCase):
+    compression = zipfile.ZIP_DEFLATED
+
+@requires_bz2()
+class Bzip2RemoveTests(AbstractRemoveTests, unittest.TestCase):
+    compression = zipfile.ZIP_BZIP2
+
+@requires_lzma()
+class LzmaRemoveTests(AbstractRemoveTests, unittest.TestCase):
+    compression = zipfile.ZIP_LZMA
+
+@requires_zstd()
+class ZstdRemoveTests(AbstractRemoveTests, unittest.TestCase):
+    compression = zipfile.ZIP_ZSTANDARD
+
+
 class PyZipFileTests(unittest.TestCase):
     def assertCompiledIn(self, name, namelist):
         if name + 'o' not in namelist:
diff --git a/Lib/test/test_zipfile64.py b/Lib/test/test_zipfile64.py
index 2e1affe0252858..84d1862f08bf31 100644
--- a/Lib/test/test_zipfile64.py
+++ b/Lib/test/test_zipfile64.py
@@ -87,6 +87,70 @@ def tearDown(self):
         os_helper.unlink(TESTFN2)
 
 
+class TestRemove(unittest.TestCase):
+    def setUp(self):
+        # Create test data.
+        line_gen = ("Test of zipfile line %d." % i for i in range(1000000))
+        self.data = '\n'.join(line_gen).encode('ascii')
+
+    def _write_large_file(self, fh):
+        # It will contain enough copies of self.data to reach about 8 GiB.
+        filecount = 8*1024**3 // len(self.data)
+
+        next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
+        for num in range(filecount):
+            fh.write(self.data)
+            # Print still working message since this test can be really slow
+            if next_time <= time.monotonic():
+                next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
+                print((
+                ' writing %d of %d, be patient...' %
+                (num, filecount)), file=sys.__stdout__)
+                sys.__stdout__.flush()
+
+    def test_remove_large_file(self):
+        # Try the temp file. If we do TESTFN2, then it hogs
+        # gigabytes of disk space for the duration of the test.
+        with TemporaryFile() as f:
+            self._test_remove_large_file(f)
+            self.assertFalse(f.closed)
+
+    def _test_remove_large_file(self, f):
+        file = 'datafile.txt'
+        file1 = 'dummy.txt'
+        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
+        with zipfile.ZipFile(f, 'w') as zh:
+            with zh.open(file1, 'w', force_zip64=True) as fh:
+                self._write_large_file(fh)
+            zh.writestr(file, data)
+
+        with zipfile.ZipFile(f, 'a') as zh:
+            zh.remove(file1)
+            self.assertEqual(zh.namelist(), [file])
+            self.assertIsNone(zh.testzip())
+
+    def test_remove_before_large_file(self):
+        # Try the temp file. If we do TESTFN2, then it hogs
+        # gigabytes of disk space for the duration of the test.
+        with TemporaryFile() as f:
+            self._test_remove_before_large_file(f)
+            self.assertFalse(f.closed)
+
+    def _test_remove_before_large_file(self, f):
+        file = 'datafile.txt'
+        file1 = 'dummy.txt'
+        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
+        with zipfile.ZipFile(f, 'w') as zh:
+            zh.writestr(file, data)
+            with zh.open(file1, 'w', force_zip64=True) as fh:
+                self._write_large_file(fh)
+
+        with zipfile.ZipFile(f, 'a') as zh:
+            zh.remove(file)
+            self.assertEqual(zh.namelist(), [file1])
+            self.assertIsNone(zh.testzip())
+
+
 class OtherTests(unittest.TestCase):
     def testMoreThan64kFiles(self):
         # This test checks that more than 64k files can be added to an archive,
diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py
index 18caeb3e04a2b5..02e80b390eec7f 100644
--- a/Lib/zipfile/__init__.py
+++ b/Lib/zipfile/__init__.py
@@ -1866,6 +1866,44 @@ def extractall(self, path=None, members=None, pwd=None):
         for zipinfo in members:
             self._extract_member(zipinfo, path, pwd)
 
+    def remove(self, zinfo_or_arcname):
+        """Remove a member from the archive.
+
+        *zinfo_or_arcname* is either the full path of the member or a
+        ZipInfo instance that belongs to this archive.
+
+        The archive must be open with mode 'a', since mode 'w'/'x' may be used
+        on an unseekable file buffer, which disallows truncation.
+
+        Raise ValueError if the archive is closed, is not open with mode 'a',
+        or has an open writing handle.  Raise KeyError if the member is not
+        in the archive.
+        """
+
+        if self.mode != 'a':
+            raise ValueError("remove() requires mode 'a'")
+        if not self.fp:
+            raise ValueError(
+                "Attempt to write to ZIP archive that was already closed")
+        if self._writing:
+            raise ValueError(
+                "Can't write to ZIP archive while an open writing handle exists."
+            )
+
+        # Make sure we have an existing info object
+        if isinstance(zinfo_or_arcname, ZipInfo):
+            zinfo = zinfo_or_arcname
+            # make sure zinfo exists
+            if zinfo not in self.filelist:
+                raise KeyError(
+                    'There is no item %r in the archive' % zinfo_or_arcname)
+        else:
+            # get the info object
+            zinfo = self.getinfo(zinfo_or_arcname)
+
+        with self._lock:
+            return self._remove_members({zinfo})
+
     @classmethod
     def _sanitize_windows_name(cls, arcname, pathsep):
         """Replace bad characters and remove trailing dots from parts."""
@@ -1930,6 +1968,82 @@ def _extract_member(self, member, targetpath, pwd):
 
         return targetpath
 
+    def _remove_members(self, members, *, remove_physical=True, chunk_size=2**20):
+        """Remove members in a zip file.
+
+        *members* is a collection (normally a set) of ZipInfo objects.  All
+        of them should exist in the zip; otherwise the zip file will
+        erroneously end in an inconsistent state.
+
+        If *remove_physical* is true, the local entries that follow a removed
+        member are moved forward in chunks of at most *chunk_size* bytes to
+        overwrite it; otherwise only the in-memory member lists are updated
+        and the entry data is left in place.
+
+        Raise EOFError if the archive ends before all the data of a moved
+        entry could be read.
+        """
+        fp = self.fp
+        entry_offset = 0
+        member_seen = False
+
+        # get a sorted filelist by header offset, in case the dir order
+        # doesn't match the actual entry order
+        filelist = sorted(self.filelist, key=lambda x: x.header_offset)
+        for i, info in enumerate(filelist):
+            is_member = info in members
+
+            if not (member_seen or is_member):
+                continue
+
+            # get the total size of the entry
+            try:
+                offset = filelist[i + 1].header_offset
+            except IndexError:
+                offset = self.start_dir
+            entry_size = offset - info.header_offset
+
+            if is_member:
+                member_seen = True
+                entry_offset += entry_size
+
+                # update caches
+                self.filelist.remove(info)
+                try:
+                    del self.NameToInfo[info.filename]
+                except KeyError:
+                    pass
+                continue
+
+            # update the header and move entry data to the new position
+            if remove_physical:
+                old_header_offset = info.header_offset
+                info.header_offset -= entry_offset
+                if info._end_offset is not None:
+                    info._end_offset -= entry_offset
+                read_size = 0
+                while read_size < entry_size:
+                    fp.seek(old_header_offset + read_size)
+                    data = fp.read(min(entry_size - read_size, chunk_size))
+                    if not data:
+                        # A truncated archive would otherwise loop forever.
+                        raise EOFError(
+                            'Unexpected end of file while moving entry data')
+                    fp.seek(info.header_offset + read_size)
+                    fp.write(data)
+                    fp.flush()
+                    read_size += len(data)
+
+        # Avoid missing entry if entries have a duplicated name.
+        # Reverse the order as NameToInfo normally stores the last added one.
+        for info in reversed(self.filelist):
+            self.NameToInfo.setdefault(info.filename, info)
+
+        # update state
+        if remove_physical:
+            self.start_dir -= entry_offset
+        self._didModify = True
+
     def _writecheck(self, zinfo):
         """Check for errors before writing a file to the archive."""
         if zinfo.filename in self.NameToInfo:
diff --git a/Misc/NEWS.d/next/Library/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst b/Misc/NEWS.d/next/Library/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst
new file mode 100644
index 00000000000000..6a696828991836
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-05-22-12-52-35.gh-issue-51067.tJxGGF.rst
@@ -0,0 +1 @@
+Add ``ZipFile.remove()``