From affec4f90bce84eaa0f68103a9cb7fc8cd93c9f8 Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Tue, 21 Feb 2023 20:13:59 +0000 Subject: [PATCH 1/7] gh-102120: Added iter_no_cache to TarFile --- Lib/tarfile.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index d686435d90ad1b..efec472b5bdb0f 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -2477,6 +2477,68 @@ def __iter__(self): index += 1 yield tarinfo + def iter_no_cache(self): + """Provide an iterator object that does not cache files + for systems low on memory. + """ + self._check("ra") + if self.firstmember is not None: + m = self.firstmember + self.firstmember = None + yield m + + # Advance the file pointer. + if self.offset != self.fileobj.tell(): + if self.offset == 0: + return None + self.fileobj.seek(self.offset - 1) + if not self.fileobj.read(1): + raise ReadError("unexpected end of data") + + # Read the next block. + while True: + tarinfo = None + # Advance the file pointer. 
+ if self.offset != self.fileobj.tell(): + self.fileobj.seek(self.offset - 1) + if not self.fileobj.read(1): + break + try: + tarinfo = self.tarinfo.fromtarfile(self) + except EOFHeaderError as e: + if self.ignore_zeros: + self._dbg(2, "0x%X: %s" % (self.offset, e)) + self.offset += BLOCKSIZE + continue + except InvalidHeaderError as e: + if self.ignore_zeros: + self._dbg(2, "0x%X: %s" % (self.offset, e)) + self.offset += BLOCKSIZE + continue + elif self.offset == 0: + raise ReadError(str(e)) from None + except EmptyHeaderError: + if self.offset == 0: + raise ReadError("empty file") from None + except TruncatedHeaderError as e: + if self.offset == 0: + raise ReadError(str(e)) from None + except SubsequentHeaderError as e: + raise ReadError(str(e)) from None + except Exception as e: + try: + import zlib + if isinstance(e, zlib.error): + raise ReadError(f'zlib error: {e}') from None + else: + raise e + except ImportError: + raise e + if tarinfo is not None: + yield tarinfo + else: + break + def _dbg(self, level, msg): """Write debugging output to sys.stderr. """ From 2741ad5e4109ea6617caecc2bd1963d2c39f876c Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Fri, 24 Feb 2023 21:22:00 +0000 Subject: [PATCH 2/7] gh-102120: Added a stream attribute to TarFile --- Lib/tarfile.py | 79 +++++++------------------------------------------- 1 file changed, 11 insertions(+), 68 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index efec472b5bdb0f..0b5959d938cc76 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -1465,7 +1465,7 @@ class TarFile(object): def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, errors="surrogateescape", pax_headers=None, debug=None, - errorlevel=None, copybufsize=None): + errorlevel=None, copybufsize=None, stream=False): """Open an (uncompressed) tar archive `name'. 
`mode' is either 'r' to read from an existing archive, 'a' to append data to an existing file or 'w' to create a new file overwriting an existing one. `mode' @@ -1497,6 +1497,8 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None, self.name = os.path.abspath(name) if name else None self.fileobj = fileobj + self.stream = stream + # Init attributes. if format is not None: self.format = format @@ -2378,7 +2380,9 @@ def next(self): break if tarinfo is not None: - self.members.append(tarinfo) + # if streaming the file we do not want to cache the tarinfo + if not self.stream: + self.members.append(tarinfo) else: self._loaded = True @@ -2412,11 +2416,12 @@ def _getmember(self, name, tarinfo=None, normalize=False): def _load(self): """Read through the entire archive file and look for readable - members. + members. This should not run if the file is set to stream. """ - while self.next() is not None: - pass - self._loaded = True + if not self.stream: + while self.next() is not None: + pass + self._loaded = True def _check(self, mode=None): """Check if TarFile is still open, and if the operation's mode @@ -2477,68 +2482,6 @@ def __iter__(self): index += 1 yield tarinfo - def iter_no_cache(self): - """Provide an iterator object that does not cache files - for systems low on memory. - """ - self._check("ra") - if self.firstmember is not None: - m = self.firstmember - self.firstmember = None - yield m - - # Advance the file pointer. - if self.offset != self.fileobj.tell(): - if self.offset == 0: - return None - self.fileobj.seek(self.offset - 1) - if not self.fileobj.read(1): - raise ReadError("unexpected end of data") - - # Read the next block. - while True: - tarinfo = None - # Advance the file pointer. 
- if self.offset != self.fileobj.tell(): - self.fileobj.seek(self.offset - 1) - if not self.fileobj.read(1): - break - try: - tarinfo = self.tarinfo.fromtarfile(self) - except EOFHeaderError as e: - if self.ignore_zeros: - self._dbg(2, "0x%X: %s" % (self.offset, e)) - self.offset += BLOCKSIZE - continue - except InvalidHeaderError as e: - if self.ignore_zeros: - self._dbg(2, "0x%X: %s" % (self.offset, e)) - self.offset += BLOCKSIZE - continue - elif self.offset == 0: - raise ReadError(str(e)) from None - except EmptyHeaderError: - if self.offset == 0: - raise ReadError("empty file") from None - except TruncatedHeaderError as e: - if self.offset == 0: - raise ReadError(str(e)) from None - except SubsequentHeaderError as e: - raise ReadError(str(e)) from None - except Exception as e: - try: - import zlib - if isinstance(e, zlib.error): - raise ReadError(f'zlib error: {e}') from None - else: - raise e - except ImportError: - raise e - if tarinfo is not None: - yield tarinfo - else: - break - def _dbg(self, level, msg): """Write debugging output to sys.stderr. 
""" From f901e29184ec29daa722a2b53cbc5b3b959f645e Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Wed, 8 Mar 2023 19:23:54 +0000 Subject: [PATCH 3/7] gh-102120: Added tests for tarfile stream mode --- Lib/test/test_tarfile.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index f15a800976681c..6c1666cc170417 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -95,6 +95,14 @@ def setUp(self): def tearDown(self): self.tar.close() +class StreamModeTest(ReadTest): + + # Only needs to change how the tarfile is opened to set + # stream mode + def setUp(self): + self.tar = tarfile.open(self.tarname, mode=self.mode, + encoding="iso8859-1", + stream=True) class UstarReadTest(ReadTest, unittest.TestCase): @@ -837,6 +845,21 @@ class Bz2StreamReadTest(Bz2Test, StreamReadTest): class LzmaStreamReadTest(LzmaTest, StreamReadTest): pass +class TarStreamModeReadTest(StreamModeTest, unittest.TestCase): + + def test_stream_mode_no_cache(self): + for _ in self.tar: + pass + self.assertEqual(self.tar.members, []) + +class GzipStreamModeReadTest(GzipTest, TarStreamModeReadTest): + pass + +class Bz2StreamModeReadTest(Bz2Test, TarStreamModeReadTest): + pass + +class LzmaStreamModeReadTest(LzmaTest, TarStreamModeReadTest): + pass class DetectReadTest(TarTest, unittest.TestCase): def _testfunc_file(self, name, mode): From 62416fe96db6812d6c145521fd3c253ff87d53f0 Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Wed, 8 Mar 2023 19:24:48 +0000 Subject: [PATCH 4/7] gh-102120: Added to tarfile docs --- Doc/library/tarfile.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 741d40da152101..52ba9100b98865 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -279,7 +279,7 @@ be finalized; only the internally used file object will be closed. See the .. 
versionadded:: 3.2 Added support for the context management protocol. -.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1) +.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1, stream=False) All following arguments are optional and can be accessed as instance attributes as well. @@ -333,6 +333,9 @@ be finalized; only the internally used file object will be closed. See the The *pax_headers* argument is an optional dictionary of strings which will be added as a pax global header if *format* is :const:`PAX_FORMAT`. + If *stream* is set to :const:`True`, then while reading the archive, information + about files in the archive is not cached, saving memory. + .. versionchanged:: 3.2 Use ``'surrogateescape'`` as the default for the *errors* argument. @@ -342,6 +345,8 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *name* parameter accepts a :term:`path-like object`. + .. versionchange:: 3.12 + The *stream* parameter has been added. .. classmethod:: TarFile.open(...) 
From c8f57d47e651888271bc3f01bf4f1c171ec8219a Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Wed, 8 Mar 2023 19:30:55 +0000 Subject: [PATCH 5/7] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst diff --git a/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst b/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst new file mode 100644 index 00000000000000..ca50242fdbe293 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-03-08-19-30-53.gh-issue-102120.xkQ5Wr.rst @@ -0,0 +1,2 @@ +Added a stream mode to ``tarfile`` that allows for reading +archives without caching info about the inner files. From 7e9a5f9ad63b0c0b0fc0fbd6ace60ba72faed95c Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Wed, 8 Mar 2023 19:33:39 +0000 Subject: [PATCH 6/7] gh-102120: Fixed documentation error --- Doc/library/tarfile.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 52ba9100b98865..3630e3cf29c4de 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -345,7 +345,7 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *name* parameter accepts a :term:`path-like object`. - .. versionchange:: 3.12 + .. versionchanged:: 3.12 The *stream* parameter has been added. .. classmethod:: TarFile.open(...) 
From 3ce2b8a55dfa710a11f2ed739c5a603f2cd067e6 Mon Sep 17 00:00:00 2001 From: Robert O'Shea Date: Tue, 23 May 2023 21:07:57 +0100 Subject: [PATCH 7/7] Update wording on Doc/library/tarfile.rst Co-authored-by: Ethan Furman --- Doc/library/tarfile.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/tarfile.rst b/Doc/library/tarfile.rst index 61fb73c26cb236..2f330f018a48be 100644 --- a/Doc/library/tarfile.rst +++ b/Doc/library/tarfile.rst @@ -381,8 +381,8 @@ be finalized; only the internally used file object will be closed. See the .. versionchanged:: 3.6 The *name* parameter accepts a :term:`path-like object`. - .. versionchanged:: 3.12 - The *stream* parameter has been added. + .. versionchanged:: 3.13 + Add the *stream* parameter. .. classmethod:: TarFile.open(...)