From f1b56038f0e49a873584966f3da89f065a19a8a3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 26 Feb 2024 21:39:44 +0000 Subject: [PATCH 1/6] GH-115060: Speed up `pathlib.Path.glob()` by skipping directory scanning. For ordinary literal pattern segments (e.g. `foo/bar` in `foo/bar/../**`), skip calling `_scandir()` on each segment, and instead call `exists()` or `is_dir()` as necessary to exclude missing paths. This only applies when *case_sensitive* is `None` (the default); otherwise we can't guarantee case sensitivity or realness with this approach. If *follow_symlinks* is `False` we also need to exclude symlinks from intermediate segments. This restores an optimization that was removed in da1980a by some eejit. It's actually even faster because we don't `stat()` intermediate directories, and in some cases we can skip all filesystem access when expanding a literal part (e.g. when it's followed by a non-recursive wildcard segment). --- Lib/pathlib/_abc.py | 78 ++++++++++++++++++- Lib/test/test_pathlib/test_pathlib_abc.py | 6 +- ...-02-29-20-42-48.gh-issue-115060.fofNVf.rst | 2 + 3 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 44fea525b6cac7..153c6c32537ec4 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -36,6 +36,12 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +def _is_wildcard_pattern(pat): + """Whether this pattern needs actual matching using fnmatch, or can be + looked up directly as a file.""" + return "*" in pat or "?" in pat or "[" in pat + + @functools.cache def _is_case_sensitive(pathmod): return pathmod.normcase('Aa') == 'Aa' @@ -60,12 +66,42 @@ def _compile_pattern(pat, sep, case_sensitive, recursive=True): return re.compile(regex, flags=flags).match -def _select_special(paths, part): - """Yield special literal children of the given paths.""" +def _select_literal(paths, part): + """Yield literal children of the given paths.""" for path in paths: yield path._make_child_relpath(part) +def _select_directories(paths): + """Yield the given paths, filtering out non-directories.""" + for path in paths: + try: + if path.is_dir(): + yield path + except OSError: + pass + + +def _deselect_missing(paths): + """Yield the given paths, filtering out missing files.""" + for path in paths: + try: + path.stat(follow_symlinks=False) + yield path + except OSError: + pass + + +def _deselect_symlinks(paths): + """Yield the given paths, filtering out symlinks.""" + for path in paths: + try: + if not path.is_symlink(): + yield path + except OSError: + pass + + def _select_children(parent_paths, dir_only, follow_symlinks, match): """Yield direct children of given paths, filtering by name and type.""" if follow_symlinks is None: @@ -799,8 +835,18 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # TODO: evaluate case-sensitivity of each directory in _select_children(). case_sensitive = _is_case_sensitive(self.pathmod) + # User doesn't care about case sensitivity, so for non-wildcard + # patterns like "foo/bar" we can stat() once rather than scandir() + # twice. Returned paths may not match real filesystem case. + case_preserving = False + else: + # Explicit case sensitivity choice provided. We must use scandir() + # to retrieve and match filenames with real filesystem case. + case_preserving = True + stack = pattern._pattern_stack specials = ('', '.', '..') + check_paths = False deduplicate_paths = False sep = self.pathmod.sep paths = iter([self] if self.is_dir() else []) @@ -808,7 +854,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): part = stack.pop() if part in specials: # Join special component (e.g. '..') onto paths. - paths = _select_special(paths, part) + paths = _select_literal(paths, part) elif part == '**': # Consume following '**' components, which have no effect. @@ -826,6 +872,11 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # re.Pattern object based on those components. match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None + # Ensure directories exist. + if check_paths: + paths = _select_directories(paths) + check_paths = False + # Recursively walk directories, filtering by type and regex. paths = _select_recursive(paths, bool(stack), follow_symlinks, match) @@ -837,13 +888,32 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: + elif case_preserving or _is_wildcard_pattern(part): # If the pattern component isn't '*', compile an re.Pattern # object based on the component. match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None # Iterate over directories' children filtering by type and regex. paths = _select_children(paths, bool(stack), follow_symlinks, match) + + # Paths are known to exist: they're directory children from _scandir() + check_paths = False + + else: + # Join non-wildcard component onto paths. + paths = _select_literal(paths, part) + + # Filter out non-symlinks if requested. + if follow_symlinks is False: + paths = _deselect_symlinks(paths) + + # Paths might not exist; mark them to be checked. + check_paths = True + + if check_paths: + # Filter out paths that don't exist. + paths = _deselect_missing(paths) + return paths def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 5bfb76f85c7909..dfaaa6dbde3cec 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1431,10 +1431,10 @@ def __repr__(self): return "{}({!r})".format(self.__class__.__name__, self.as_posix()) def stat(self, *, follow_symlinks=True): - if follow_symlinks: - path = str(self.resolve()) + if follow_symlinks or self.name == '..': + path = str(self.resolve(strict=True)) else: - path = str(self.parent.resolve() / self.name) + path = str(self.parent.resolve(strict=True) / self.name) if path in self._files: st_mode = stat.S_IFREG elif path in self._directories: diff --git a/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst b/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst new file mode 100644 index 00000000000000..6e612cb0d0ed2d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-29-20-42-48.gh-issue-115060.fofNVf.rst @@ -0,0 +1,2 @@ +Speed up handling of non-wildcard pattern segments in +:meth:`pathlib.Path.glob`. From cbc7002edec6ea8c2d8f86c23fa07ea1406ad028 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 29 Feb 2024 21:09:24 +0000 Subject: [PATCH 2/6] Also skip initial is_dir() --- Doc/library/pathlib.rst | 4 +--- Lib/pathlib/_abc.py | 4 ++-- Lib/test/test_pathlib/test_pathlib_abc.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 4b461a5d4a2949..3f4d2c16f2eb2b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1004,9 +1004,7 @@ call fails (for example because the path doesn't exist). .. seealso:: :ref:`pathlib-pattern-language` documentation. - This method calls :meth:`Path.is_dir` on the top-level directory and - propagates any :exc:`OSError` exception that is raised. Subsequent - :exc:`OSError` exceptions from scanning directories are suppressed. + This method suppresses :exc:`OSError` exceptions. By default, or when the *case_sensitive* keyword-only argument is set to ``None``, this method matches paths using platform-specific casing rules: diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 153c6c32537ec4..f3882eaafe9fca 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -846,10 +846,10 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): stack = pattern._pattern_stack specials = ('', '.', '..') - check_paths = False + check_paths = True deduplicate_paths = False sep = self.pathmod.sep - paths = iter([self] if self.is_dir() else []) + paths = iter([self]) while stack: part = stack.pop() if part in specials: diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index dfaaa6dbde3cec..78573d0cd7576c 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1431,7 +1431,7 @@ def __repr__(self): return "{}({!r})".format(self.__class__.__name__, self.as_posix()) def stat(self, *, follow_symlinks=True): - if follow_symlinks or self.name == '..': + if follow_symlinks or not self.name or self.name == '.' or self.name == '..': path = str(self.resolve(strict=True)) else: path = str(self.parent.resolve(strict=True) / self.name) From d1f8cae2a773e602f873a28e70122ae440ade789 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 29 Feb 2024 21:44:03 +0000 Subject: [PATCH 3/6] Fix tests --- Lib/test/test_pathlib/test_pathlib_abc.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 78573d0cd7576c..7ee58036f0d2ae 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1752,10 +1752,10 @@ def test_glob_posix(self): def test_glob_windows(self): P = self.cls p = P(self.base) - self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") }) + self.assertEqual(set(p.glob("FILEa")), { P(self.base, "FILEa") }) self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") }) self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") }) - self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"}) + self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\FILEa"}) self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"}) def test_glob_empty_pattern(self): @@ -1903,9 +1903,9 @@ def test_rglob_posix(self): def test_rglob_windows(self): P = self.cls p = P(self.base, "dirC") - self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") }) + self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/FILEd") }) self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") }) - self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"}) + self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\FILEd"}) @needs_symlinks def test_rglob_follow_symlinks_common(self): @@ -1993,9 +1993,20 @@ def test_glob_dotdot(self): self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") }) self.assertEqual(set(p.glob("dirA/../file*/..")), set()) self.assertEqual(set(p.glob("../xyzzy")), set()) - self.assertEqual(set(p.glob("xyzzy/..")), set()) self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)}) + @needs_posix + def test_glob_dotdot_posix(self): + p = self.cls(self.base) + self.assertEqual(set(p.glob("xyzzy/..")), set()) + + @needs_windows + def test_glob_dotdot_windows(self): + # '..' segments are resolved first on Windows, so + # 'xyzzy' doesn't need to exist. + p = self.cls(self.base) + self.assertEqual(set(p.glob("xyzzy/..")), { p }) + @needs_symlinks def test_glob_permissions(self): # See bpo-38894 From 80e384c38cd492a9e0473ace936f64f06ae4535e Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 29 Feb 2024 22:02:25 +0000 Subject: [PATCH 4/6] Fix tests, attempt 2 --- Lib/test/test_pathlib/test_pathlib_abc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 7ee58036f0d2ae..08e5ecf8a20a71 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1997,15 +1997,17 @@ def test_glob_dotdot(self): @needs_posix def test_glob_dotdot_posix(self): - p = self.cls(self.base) + P = self.cls + p = P(self.base) self.assertEqual(set(p.glob("xyzzy/..")), set()) @needs_windows def test_glob_dotdot_windows(self): # '..' segments are resolved first on Windows, so # 'xyzzy' doesn't need to exist. - p = self.cls(self.base) - self.assertEqual(set(p.glob("xyzzy/..")), { p }) + P = self.cls + p = P(self.base) + self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") }) @needs_symlinks def test_glob_permissions(self): From 0b98008012e194404e2fd818410f3dcde5a62925 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 29 Feb 2024 22:26:09 +0000 Subject: [PATCH 5/6] Fix handling of final literal symlink --- Lib/pathlib/_abc.py | 20 ++++++++++++-------- Lib/test/test_pathlib/test_pathlib_abc.py | 5 +++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index f3882eaafe9fca..e7321ab7685f4f 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -92,14 +92,19 @@ def _deselect_missing(paths): pass -def _deselect_symlinks(paths): +def _deselect_symlinks(paths, dir_only, follow_symlinks): """Yield the given paths, filtering out symlinks.""" + if follow_symlinks is None: + follow_symlinks = True for path in paths: - try: - if not path.is_symlink(): - yield path - except OSError: - pass + if follow_symlinks or not dir_only: + yield path + else: + try: + if not path.is_symlink(): + yield path + except OSError: + pass def _select_children(parent_paths, dir_only, follow_symlinks, match): @@ -904,8 +909,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): paths = _select_literal(paths, part) # Filter out non-symlinks if requested. - if follow_symlinks is False: - paths = _deselect_symlinks(paths) + paths = _deselect_symlinks(paths, bool(stack), follow_symlinks) # Paths might not exist; mark them to be checked. check_paths = True diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 08e5ecf8a20a71..8b0a1ed2baba38 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1843,6 +1843,11 @@ def _check(path, glob, expected): _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD/"]) _check(p, "*/dirD/**", ["dirC/dirD/", "dirC/dirD/fileD"]) _check(p, "*/dirD/**/", ["dirC/dirD/"]) + _check(p, "linkA", ["linkA"]) + _check(p, "linkB", ["linkB"]) + _check(p, "linkB/fileB", []) + _check(p, "dirA/linkC", ["dirA/linkC"]) + _check(p, "dirA/linkC/fileB", []) def test_rglob_common(self): def _check(glob, expected): From dfc4766f3b80154236c3111ec3e84cdb508ed05e Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 29 Feb 2024 22:31:07 +0000 Subject: [PATCH 6/6] Make follow_symlinks behaviour a bit clearer --- Lib/pathlib/_abc.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index e7321ab7685f4f..599229c3bb2f05 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -94,8 +94,6 @@ def _deselect_missing(paths): def _deselect_symlinks(paths, dir_only, follow_symlinks): """Yield the given paths, filtering out symlinks.""" - if follow_symlinks is None: - follow_symlinks = True for path in paths: if follow_symlinks or not dir_only: yield path @@ -109,8 +107,6 @@ def _deselect_symlinks(paths, dir_only, follow_symlinks): def _select_children(parent_paths, dir_only, follow_symlinks, match): """Yield direct children of given paths, filtering by name and type.""" - if follow_symlinks is None: - follow_symlinks = True for parent_path in parent_paths: try: # We must close the scandir() object before proceeding to @@ -137,8 +133,6 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): """Yield given paths and all their children, recursively, filtering by string and type. """ - if follow_symlinks is None: - follow_symlinks = False for parent_path in parent_paths: if match is not None: # If we're filtering paths through a regex, record the length of @@ -849,6 +843,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # to retrieve and match filenames with real filesystem case. case_preserving = True + if follow_symlinks is None: + # Legacy behaviour: follow symlinks unless we're expanding '**'. + follow_symlinks = True + follow_symlinks_recursive = False + else: + follow_symlinks_recursive = follow_symlinks + stack = pattern._pattern_stack specials = ('', '.', '..') check_paths = True @@ -869,7 +870,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # Consume following non-special components, provided we're # treating symlinks consistently. Each component is joined # onto 'part', which is used to generate an re.Pattern object. - if follow_symlinks is not None: + if follow_symlinks == follow_symlinks_recursive: while stack and stack[-1] not in specials: part += sep + stack.pop() @@ -883,7 +884,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): check_paths = False # Recursively walk directories, filtering by type and regex. - paths = _select_recursive(paths, bool(stack), follow_symlinks, match) + paths = _select_recursive(paths, bool(stack), follow_symlinks_recursive, match) # De-duplicate if we've already seen a '**' component. if deduplicate_paths: