From 2119f17f3f3e6cb62040c56cecd26666bd284f10 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Thu, 18 Jan 2024 22:30:54 +0000 Subject: [PATCH 1/5] gh-101438: Avoid reference cycle in ElementTree.iterparse. Refactor IterParseIterator to avoid a reference cycle between the iterator() function and the IterParseIterator() instance. This leads to more prompt clean-up of the "source" file if the returned iterator is not exhausted and not otherwise part of a reference cycle. This also avoids a test failure in the GC implementation for the free-threaded build: if the "source" file is finalized before the "iterator()" generator, a ResourceWarning is issued leading to a failure in test_iterparse(). In theory, this warning can occur in the default build as well, but is much less likely because it would require an unlucky scheduling of the GC between creation of the generator and the file object in order to change the order of finalization. --- Lib/xml/etree/ElementTree.py | 18 ++++++++++++++---- ...4-01-18-22-29-28.gh-issue-101438.1-uUi_.rst | 4 ++++ 2 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 42574eefd81beb..e3aaed0feecc61 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1222,6 +1222,7 @@ def iterparse(source, events=None, parser=None): # Use the internal, undocumented _parser argument for now; When the # parser argument of iterparse is removed, this can be killed. pullparser = XMLPullParser(events=events, _parser=parser) + _root = None def iterator(source): close_source = False @@ -1239,15 +1240,24 @@ def iterator(source): pullparser.feed(data) root = pullparser._close_and_return_root() yield from pullparser.read_events() - it.root = root + nonlocal _root + _root = root finally: if close_source: source.close() class IterParseIterator(collections.abc.Iterator): - __next__ = iterator(source).__next__ - it = IterParseIterator() - it.root = None + def __init__(self, it): + self.it = it + + def __next__(self): + return next(self.it) + + @property + def root(self): + return _root + + it = IterParseIterator(iterator(source)) del iterator, IterParseIterator next(it) diff --git a/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst b/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst new file mode 100644 index 00000000000000..56af65a323d477 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst @@ -0,0 +1,4 @@ +Avoid reference cycle in ElementTree.iterparse. The iterator returned by +``ElementTree.iterparse`` may hold on to a file descriptor. The reference +cycle prevented prompt clean-up of the file decsriptor if the returned +iterator was not exhausted. From 8fabc7c6f644918a0c823012a63e0b8b5faec348 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 19 Jan 2024 19:41:30 +0000 Subject: [PATCH 2/5] Avoid regression in bm_xml_etree performance. This avoids the `__next__` wrapper and the `root` property, both of which had a performance impact on the iterparse benchmark in bm_xml_etree. --- Lib/xml/etree/ElementTree.py | 37 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index e3aaed0feecc61..5618bf0ec955cc 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -99,6 +99,7 @@ import collections import collections.abc import contextlib +import weakref from . import ElementPath @@ -1222,15 +1223,15 @@ def iterparse(source, events=None, parser=None): # Use the internal, undocumented _parser argument for now; When the # parser argument of iterparse is removed, this can be killed. pullparser = XMLPullParser(events=events, _parser=parser) - _root = None - def iterator(source): + if not hasattr(source, "read"): + source = open(source, "rb") + close_source = True + else: close_source = False + + def iterator(source): try: - if not hasattr(source, "read"): - source = open(source, "rb") - close_source = True - yield None while True: yield from pullparser.read_events() # load event buffer @@ -1240,27 +1241,23 @@ def iterator(source): pullparser.feed(data) root = pullparser._close_and_return_root() yield from pullparser.read_events() - nonlocal _root - _root = root + iterator = wr() + if iterator: + iterator.root = root finally: if close_source: source.close() class IterParseIterator(collections.abc.Iterator): - def __init__(self, it): - self.it = it + __next__ = iterator(source).__next__ - def __next__(self): - return next(self.it) - - @property - def root(self): - return _root - - it = IterParseIterator(iterator(source)) - del iterator, IterParseIterator + def __del__(self): + if close_source: + source.close() - next(it) + it = IterParseIterator() + wr = weakref.ref(it) + del IterParseIterator return it From 1ae917e729bef6b322972add2a626920912ef635 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 19 Jan 2024 19:48:19 +0000 Subject: [PATCH 3/5] Minor simplification --- Lib/xml/etree/ElementTree.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 5618bf0ec955cc..2045d3dceaa28f 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1241,9 +1241,8 @@ def iterator(source): pullparser.feed(data) root = pullparser._close_and_return_root() yield from pullparser.read_events() - iterator = wr() - if iterator: - iterator.root = root + if it := wr(): + it.root = root finally: if close_source: source.close() From 05baaad71636387fdecb495f6f9921f0e411148f Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Tue, 23 Jan 2024 13:30:57 -0500 Subject: [PATCH 4/5] Update Lib/xml/etree/ElementTree.py Co-authored-by: Serhiy Storchaka --- Lib/xml/etree/ElementTree.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 2045d3dceaa28f..ae6575028be11c 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1241,7 +1241,8 @@ def iterator(source): pullparser.feed(data) root = pullparser._close_and_return_root() yield from pullparser.read_events() - if it := wr(): + it = wr() + if it is not None: it.root = root finally: if close_source: From 307b375ded8db92dbe4fca0af3a3f601756aa522 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 23 Jan 2024 21:51:23 +0200 Subject: [PATCH 5/5] Update 2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst --- .../next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst b/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst index 56af65a323d477..9b69b5deb1b5a0 100644 --- a/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst +++ b/Misc/NEWS.d/next/Library/2024-01-18-22-29-28.gh-issue-101438.1-uUi_.rst @@ -1,4 +1,4 @@ Avoid reference cycle in ElementTree.iterparse. The iterator returned by ``ElementTree.iterparse`` may hold on to a file descriptor. The reference -cycle prevented prompt clean-up of the file decsriptor if the returned +cycle prevented prompt clean-up of the file descriptor if the returned iterator was not exhausted.