From 99f4691399229be2daacfacdba09ab4d4805f5c4 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Sat, 26 Nov 2022 18:52:03 -0500 Subject: [PATCH 01/12] gh-89727: Fix os.walk RecursionError on deep trees Use a stack to implement os.walk iteratively instead of recursively to avoid hitting recursion limits on deeply nested trees. --- Lib/os.py | 158 ++++++++++-------- ...2-11-29-20-44-54.gh-issue-89727.UJZjkk.rst | 3 + 2 files changed, 87 insertions(+), 74 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst diff --git a/Lib/os.py b/Lib/os.py index fd1e774fdcbcfa..5dff593260fe30 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -343,86 +343,96 @@ def walk(top, topdown=True, onerror=None, followlinks=False): return _walk(fspath(top), topdown, onerror, followlinks) def _walk(top, topdown, onerror, followlinks): - dirs = [] - nondirs = [] - walk_dirs = [] - - # We may not have read permission for top, in which case we can't - # get a list of the files the directory contains. os.walk - # always suppressed the exception then, rather than blow up for a - # minor reason when (say) a thousand readable directories are still - # left to visit. That logic is copied here. - try: - # Note that scandir is global in this module due - # to earlier import-*. - scandir_it = scandir(top) - except OSError as error: - if onerror is not None: - onerror(error) - return + stack = [(False, top)] + while stack: + is_result, top = stack.pop() + if is_result: + yield top + continue - with scandir_it: - while True: - try: + dirs = [] + nondirs = [] + walk_dirs = [] + + # We may not have read permission for top, in which case we can't + # get a list of the files the directory contains. os.walk + # always suppressed the exception then, rather than blow up for a + # minor reason when (say) a thousand readable directories are still + # left to visit. That logic is copied here. + try: + # Note that scandir is global in this module due + # to earlier import-*. 
+ scandir_it = scandir(top) + except OSError as error: + if onerror is not None: + onerror(error) + continue + + cont = False + with scandir_it: + while True: try: - entry = next(scandir_it) - except StopIteration: + try: + entry = next(scandir_it) + except StopIteration: + break + except OSError as error: + if onerror is not None: + onerror(error) + cont = True break - except OSError as error: - if onerror is not None: - onerror(error) - return - try: - is_dir = entry.is_dir() - except OSError: - # If is_dir() raises an OSError, consider that the entry is not - # a directory, same behaviour than os.path.isdir(). - is_dir = False - - if is_dir: - dirs.append(entry.name) - else: - nondirs.append(entry.name) + try: + is_dir = entry.is_dir() + except OSError: + # If is_dir() raises an OSError, consider that the entry is not + # a directory, same behaviour than os.path.isdir(). + is_dir = False - if not topdown and is_dir: - # Bottom-up: recurse into sub-directory, but exclude symlinks to - # directories if followlinks is False - if followlinks: - walk_into = True + if is_dir: + dirs.append(entry.name) else: - try: - is_symlink = entry.is_symlink() - except OSError: - # If is_symlink() raises an OSError, consider that the - # entry is not a symbolic link, same behaviour than - # os.path.islink(). - is_symlink = False - walk_into = not is_symlink - - if walk_into: - walk_dirs.append(entry.path) - - # Yield before recursion if going top down - if topdown: - yield top, dirs, nondirs - - # Recurse into sub-directories - islink, join = path.islink, path.join - for dirname in dirs: - new_path = join(top, dirname) - # Issue #23605: os.path.islink() is used instead of caching - # entry.is_symlink() result during the loop on os.scandir() because - # the caller can replace the directory entry during the "yield" - # above. 
- if followlinks or not islink(new_path): - yield from _walk(new_path, topdown, onerror, followlinks) - else: - # Recurse into sub-directories - for new_path in walk_dirs: - yield from _walk(new_path, topdown, onerror, followlinks) - # Yield after recursion if going bottom up - yield top, dirs, nondirs + nondirs.append(entry.name) + + if not topdown and is_dir: + # Bottom-up: traverse into sub-directory, but exclude symlinks to + # directories if followlinks is False + if followlinks: + walk_into = True + else: + try: + is_symlink = entry.is_symlink() + except OSError: + # If is_symlink() raises an OSError, consider that the + # entry is not a symbolic link, same behaviour than + # os.path.islink(). + is_symlink = False + walk_into = not is_symlink + + if walk_into: + walk_dirs.append(entry.path) + if cont: + continue + + # Yield before sub-directory traversal if going top down + if topdown: + yield top, dirs, nondirs + # Traverse into sub-directories + islink, join = path.islink, path.join + for dirname in reversed(dirs): + new_path = join(top, dirname) + # Issue #23605: os.path.islink() is used instead of caching + # entry.is_symlink() result during the loop on os.scandir() because + # the caller can replace the directory entry during the "yield" + # above. 
+ if followlinks or not islink(new_path): + stack.append((False, new_path)) + else: + # Yield after sub-directory traversal if going bottom up + stack.append((True, (top, dirs, nondirs))) + # Traverse into sub-directories + for new_path in reversed(walk_dirs): + stack.append((False, new_path)) __all__.append("walk") diff --git a/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst b/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst new file mode 100644 index 00000000000000..9b86f0cbaec2ad --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst @@ -0,0 +1,3 @@ +Fix issue with :func:`os.walk` where a :exc:`RecursionError` would occur on +deep directory structures by adjusting the implementation of +:func:`os._walk` to be iterative instead of recursive. From 3552d4989724e9086774e76b21c6871fc83f820f Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Thu, 15 Dec 2022 21:15:59 -0500 Subject: [PATCH 02/12] add unit test to ensure os.walk is not affected by recursion limit --- Lib/test/support/__init__.py | 16 ++++++++++------ Lib/test/test_os.py | 9 +++++++++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index a631bfc80cfaf0..589655fed3cbf1 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -2178,19 +2178,23 @@ def check_disallow_instantiation(testcase, tp, *args, **kwds): testcase.assertRaisesRegex(TypeError, msg, tp, *args, **kwds) @contextlib.contextmanager +def temp_recursion_limit(limit): + """Temporarily change the recursion limit.""" + original_limit = sys.getrecursionlimit() + try: + sys.setrecursionlimit(limit) + yield + finally: + sys.setrecursionlimit(original_limit) + def infinite_recursion(max_depth=75): """Set a lower limit for tests that interact with infinite recursions (e.g test_ast.ASTHelpers_Test.test_recursion_direct) since on some debug windows builds, due to not enough functions 
being inlined the stack size might not handle the default recursion limit (1000). See bpo-11105 for details.""" + return temp_recursion_limit(max_depth) - original_depth = sys.getrecursionlimit() - try: - sys.setrecursionlimit(max_depth) - yield - finally: - sys.setrecursionlimit(original_depth) def ignore_deprecations_from(module: str, *, like: str) -> object: token = object() diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index e0577916428a08..0e05aff4efb6fa 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -33,6 +33,7 @@ from test.support import import_helper from test.support import os_helper from test.support import socket_helper +from test.support import temp_recursion_limit from test.support import warnings_helper from platform import win32_is_iot @@ -1471,6 +1472,12 @@ def test_walk_many_open_files(self): self.assertEqual(next(it), expected) p = os.path.join(p, 'd') + def test_walk_above_recursion_limit(self): + os.makedirs(os.path.join(self.walk_path, *(['d'] * 50))) + with temp_recursion_limit(50): + all = list(self.walk(self.walk_path)) + self.assertEqual(len(all), 54) + @unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()") class FwalkTests(WalkTests): @@ -1545,6 +1552,8 @@ def test_fd_leak(self): # fwalk() keeps file descriptors open test_walk_many_open_files = None + # fwalk() still uses recursion + test_walk_above_recursion_limit = None class BytesWalkTests(WalkTests): From 3078ea601feae31baecb36fc2eecfe5e0c19040f Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Thu, 15 Dec 2022 21:42:46 -0500 Subject: [PATCH 03/12] move os._walk code to os.walk --- Lib/os.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 5dff593260fe30..21057e130553e4 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -340,13 +340,11 @@ def walk(top, topdown=True, onerror=None, followlinks=False): """ sys.audit("os.walk", top, topdown, onerror, followlinks) - return _walk(fspath(top), topdown, onerror, 
followlinks) -def _walk(top, topdown, onerror, followlinks): - stack = [(False, top)] + stack = [(False, fspath(top))] while stack: - is_result, top = stack.pop() - if is_result: + must_yield, top = stack.pop() + if must_yield: yield top continue From f37bbe8aaa3ffd91b879a17bc5d0daee2cbd20e2 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Thu, 15 Dec 2022 21:52:38 -0500 Subject: [PATCH 04/12] fix blurb to mention os.walk --- .../next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst b/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst index 9b86f0cbaec2ad..8a5fdb64b87f82 100644 --- a/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst +++ b/Misc/NEWS.d/next/Library/2022-11-29-20-44-54.gh-issue-89727.UJZjkk.rst @@ -1,3 +1,3 @@ Fix issue with :func:`os.walk` where a :exc:`RecursionError` would occur on deep directory structures by adjusting the implementation of -:func:`os._walk` to be iterative instead of recursive. +:func:`os.walk` to be iterative instead of recursive. 
From f26a5b88b8636d2ffd27f5410d94a795b8ec0d4d Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Thu, 15 Dec 2022 21:53:00 -0500 Subject: [PATCH 05/12] use deque in os.walk --- Lib/os.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/os.py b/Lib/os.py index 21057e130553e4..31de865c9b7fdc 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -23,6 +23,7 @@ #' import abc +from collections import deque import sys import stat as st @@ -341,7 +342,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): """ sys.audit("os.walk", top, topdown, onerror, followlinks) - stack = [(False, fspath(top))] + stack = deque([(False, fspath(top))]) while stack: must_yield, top = stack.pop() if must_yield: From ef46eda965d1c8dc1a63dbf464a79ab19fc8d80d Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Fri, 16 Dec 2022 10:56:15 -0500 Subject: [PATCH 06/12] move deque import inside os.walk --- Lib/os.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/os.py b/Lib/os.py index 31de865c9b7fdc..2f3b40cbb4bc70 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -23,7 +23,6 @@ #' import abc -from collections import deque import sys import stat as st @@ -342,6 +341,8 @@ def walk(top, topdown=True, onerror=None, followlinks=False): """ sys.audit("os.walk", top, topdown, onerror, followlinks) + from collections import deque + stack = deque([(False, fspath(top))]) while stack: must_yield, top = stack.pop() From e261b9fd1b2628f578ceff19b86df9bc02206601 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Fri, 16 Dec 2022 11:02:05 -0500 Subject: [PATCH 07/12] rename temp_recursion_limit to set_recursion_limit --- Lib/test/support/__init__.py | 4 ++-- Lib/test/test_os.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index 589655fed3cbf1..b7186057990ac1 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -2178,7 +2178,7 @@ def check_disallow_instantiation(testcase, 
tp, *args, **kwds): testcase.assertRaisesRegex(TypeError, msg, tp, *args, **kwds) @contextlib.contextmanager -def temp_recursion_limit(limit): +def set_recursion_limit(limit): """Temporarily change the recursion limit.""" original_limit = sys.getrecursionlimit() try: @@ -2193,7 +2193,7 @@ def infinite_recursion(max_depth=75): debug windows builds, due to not enough functions being inlined the stack size might not handle the default recursion limit (1000). See bpo-11105 for details.""" - return temp_recursion_limit(max_depth) + return set_recursion_limit(max_depth) def ignore_deprecations_from(module: str, *, like: str) -> object: diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 0e05aff4efb6fa..b8565bc306dac4 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -33,7 +33,7 @@ from test.support import import_helper from test.support import os_helper from test.support import socket_helper -from test.support import temp_recursion_limit +from test.support import set_recursion_limit from test.support import warnings_helper from platform import win32_is_iot @@ -1474,7 +1474,7 @@ def test_walk_many_open_files(self): def test_walk_above_recursion_limit(self): os.makedirs(os.path.join(self.walk_path, *(['d'] * 50))) - with temp_recursion_limit(50): + with set_recursion_limit(50): all = list(self.walk(self.walk_path)) self.assertEqual(len(all), 54) From 507b650a7d8dc01b6115f434591238699ef8ad38 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Fri, 16 Dec 2022 12:10:50 -0500 Subject: [PATCH 08/12] switch back to list instead of deque for os.walk stack --- Lib/os.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 2f3b40cbb4bc70..21057e130553e4 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -341,9 +341,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): """ sys.audit("os.walk", top, topdown, onerror, followlinks) - from collections import deque - - stack = deque([(False, fspath(top))]) + stack = 
[(False, fspath(top))] while stack: must_yield, top = stack.pop() if must_yield: From 1c356102d80ac154b9379c86603d975d2dfe20e0 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Sun, 18 Dec 2022 17:45:22 -0500 Subject: [PATCH 09/12] Apply suggestions from code review Co-authored-by: Jelle Zijlstra --- Lib/os.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 21057e130553e4..1dda6e616273af 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -353,13 +353,11 @@ def walk(top, topdown=True, onerror=None, followlinks=False): walk_dirs = [] # We may not have read permission for top, in which case we can't - # get a list of the files the directory contains. os.walk - # always suppressed the exception then, rather than blow up for a + # get a list of the files the directory contains. + # We suppressed the exception here, rather than blow up for a # minor reason when (say) a thousand readable directories are still - # left to visit. That logic is copied here. + # left to visit. try: - # Note that scandir is global in this module due - # to earlier import-*. scandir_it = scandir(top) except OSError as error: if onerror is not None: @@ -384,7 +382,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): is_dir = entry.is_dir() except OSError: # If is_dir() raises an OSError, consider that the entry is not - # a directory, same behaviour than os.path.isdir(). + # a directory, same behaviour as os.path.isdir(). is_dir = False if is_dir: @@ -419,7 +417,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): islink, join = path.islink, path.join for dirname in reversed(dirs): new_path = join(top, dirname) - # Issue #23605: os.path.islink() is used instead of caching + # bpo-23605: os.path.islink() is used instead of caching # entry.is_symlink() result during the loop on os.scandir() because # the caller can replace the directory entry during the "yield" # above. 
From 73138a6363e5b8276e10d4639e66788bcd7323f4 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Sun, 18 Dec 2022 18:01:54 -0500 Subject: [PATCH 10/12] clean up os.walk comments --- Lib/os.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Lib/os.py b/Lib/os.py index 1dda6e616273af..d2ddd71174a96f 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -354,7 +354,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): # We may not have read permission for top, in which case we can't # get a list of the files the directory contains. - # We suppressed the exception here, rather than blow up for a + # We suppress the exception here, rather than blow up for a # minor reason when (say) a thousand readable directories are still # left to visit. try: @@ -381,8 +381,8 @@ def walk(top, topdown=True, onerror=None, followlinks=False): try: is_dir = entry.is_dir() except OSError: - # If is_dir() raises an OSError, consider that the entry is not - # a directory, same behaviour as os.path.isdir(). + # If is_dir() raises an OSError, consider the entry not to + # be a directory, same behaviour as os.path.isdir(). is_dir = False if is_dir: @@ -391,17 +391,17 @@ def walk(top, topdown=True, onerror=None, followlinks=False): nondirs.append(entry.name) if not topdown and is_dir: - # Bottom-up: traverse into sub-directory, but exclude symlinks to - # directories if followlinks is False + # Bottom-up: traverse into sub-directory, but exclude + # symlinks to directories if followlinks is False if followlinks: walk_into = True else: try: is_symlink = entry.is_symlink() except OSError: - # If is_symlink() raises an OSError, consider that the - # entry is not a symbolic link, same behaviour than - # os.path.islink(). + # If is_symlink() raises an OSError, consider the + # entry not to be a symbolic link, same behaviour + # as os.path.islink(). 
is_symlink = False walk_into = not is_symlink @@ -410,8 +410,8 @@ def walk(top, topdown=True, onerror=None, followlinks=False): if cont: continue - # Yield before sub-directory traversal if going top down if topdown: + # Yield before sub-directory traversal if going top down yield top, dirs, nondirs # Traverse into sub-directories islink, join = path.islink, path.join From 2814cc5f728a054fe463dd8aa65ccfd0b14c6f74 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Sun, 18 Dec 2022 17:26:35 -0500 Subject: [PATCH 11/12] test correct value in test_walk_above_recursion_limit --- Lib/test/test_os.py | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index b8565bc306dac4..e6e25b507de051 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -1473,10 +1473,44 @@ def test_walk_many_open_files(self): p = os.path.join(p, 'd') def test_walk_above_recursion_limit(self): - os.makedirs(os.path.join(self.walk_path, *(['d'] * 50))) - with set_recursion_limit(50): + depth = 50 + os.makedirs(os.path.join(self.walk_path, *(['d'] * depth))) + with set_recursion_limit(depth - 5): all = list(self.walk(self.walk_path)) - self.assertEqual(len(all), 54) + + sub2_path = self.sub2_tree[0] + for root, dirs, files in all: + if root == sub2_path: + dirs.sort() + files.sort() + + d_entries = [] + d_path = self.walk_path + for _ in range(depth): + d_path = os.path.join(d_path, "d") + d_entries.append((d_path, ["d"], [])) + d_entries[-1][1].clear() + + # Sub-sequences where the order is known + sections = { + "SUB1": [ + (self.sub1_path, ["SUB11"], ["tmp2"]), + (self.sub11_path, [], []), + ], + "SUB2": [self.sub2_tree], + "d": d_entries, + } + + # The ordering of sub-dirs is arbitrary but determines the order in + # which sub-sequences appear + dirs = all[0][1] + expected = [(self.walk_path, dirs, ["tmp1"])] + for d in dirs: + expected.extend(sections[d]) + + self.assertEqual(len(all), depth + 4) + 
self.assertEqual(sorted(dirs), ["SUB1", "SUB2", "d"]) + self.assertEqual(all, expected) @unittest.skipUnless(hasattr(os, 'fwalk'), "Test needs os.fwalk()") From 37f3cc77d8cc9380561280bf49f4979c68437657 Mon Sep 17 00:00:00 2001 From: Jon Burdo Date: Sun, 18 Dec 2022 18:10:13 -0500 Subject: [PATCH 12/12] set islink and join before loop in os.walk --- Lib/os.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/os.py b/Lib/os.py index d2ddd71174a96f..73a5442ee8b83f 100644 --- a/Lib/os.py +++ b/Lib/os.py @@ -342,6 +342,7 @@ def walk(top, topdown=True, onerror=None, followlinks=False): sys.audit("os.walk", top, topdown, onerror, followlinks) stack = [(False, fspath(top))] + islink, join = path.islink, path.join while stack: must_yield, top = stack.pop() if must_yield: @@ -414,7 +415,6 @@ def walk(top, topdown=True, onerror=None, followlinks=False): # Yield before sub-directory traversal if going top down yield top, dirs, nondirs # Traverse into sub-directories - islink, join = path.islink, path.join for dirname in reversed(dirs): new_path = join(top, dirname) # bpo-23605: os.path.islink() is used instead of caching