From 06477e49f1f45aa5110a834b01e14fd37bcb33ad Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 13 Oct 2024 21:44:55 +0100 Subject: [PATCH 1/3] GH-125413: Add `pathlib.Path.dir_entry` attribute Add a `Path.dir_entry` attribute. In any path object generated by `Path.iterdir()`, it stores an `os.DirEntry` object corresponding to the path; in other cases it is `None`. This can be used to retrieve the file type and attributes of directory children without necessarily incurring further system calls. Under the hood, we use `dir_entry` in our implementations of `PathBase.glob()`, `PathBase.walk()` and `PathBase.copy()`, the last of which also provides the implementation of `Path.copy()`, resulting in a modest speedup when copying local directory trees. --- Doc/library/pathlib.rst | 23 +++++++++++ Doc/whatsnew/3.14.rst | 6 +++ Lib/glob.py | 13 ++---- Lib/pathlib/_abc.py | 28 +++++++------ Lib/pathlib/_local.py | 12 +++--- Lib/pathlib/_os.py | 3 +- Lib/test/test_pathlib/test_pathlib_abc.py | 50 ++++++++++++++++++++--- 7 files changed, 102 insertions(+), 33 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 30d0d385d0539c..087bf33f2d111b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1289,6 +1289,29 @@ Reading directories raised. +.. attribute:: Path.dir_entry + + In a path yielded from :meth:`Path.iterdir`, this attribute stores a + :class:`os.DirEntry` object corresponding to the path; in other cases it is + ``None``. This can be used to retrieve the file type and attributes of + directory children without necessarily incurring further system calls:: + + >>> p = Path('docs') + >>> for child in p.iterdir(): + ... entry = child.dir_entry + ... if entry.is_dir(): + ... child + ... + PosixPath('docs/_templates') + PosixPath('docs/_build') + PosixPath('docs/_static') + + For technical reasons, this attribute is also available from + :class:`PurePath` objects, where its value is always ``None``. + + .. 
versionadded:: 3.14 + + .. method:: Path.glob(pattern, *, case_sensitive=None, recurse_symlinks=False) Glob the given relative *pattern* in the directory represented by this path, diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index c62a3ca5872eef..37402342fb32c8 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -343,6 +343,12 @@ pathlib (Contributed by Barney Gale in :gh:`73991`.) +* Add a :attr:`.Path.dir_entry` attribute. In a path object generated by + :meth:`.Path.iterdir`, it stores a :class:`os.DirEntry` object corresponding + to the path; in other cases it is ``None``. + + (Contributed by Barney Gale in :gh:`125413`.) + pdb --- diff --git a/Lib/glob.py b/Lib/glob.py index 574e5ad51b601d..ce9b3698888dd9 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -364,12 +364,6 @@ def concat_path(path, text): """ raise NotImplementedError - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir(). - """ - raise NotImplementedError - # High-level methods def compile(self, pat): @@ -438,6 +432,7 @@ def select_wildcard(path, exists=False): except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: if match is None or match(entry.name): if dir_only: @@ -446,7 +441,7 @@ def select_wildcard(path, exists=False): continue except OSError: continue - entry_path = self.parse_entry(entry) + entry_path = self.concat_path(prefix, entry.name) if dir_only: yield from select_next(entry_path, exists=True) else: @@ -495,6 +490,7 @@ def select_recursive_step(stack, match_pos): except OSError: pass else: + prefix = self.add_slash(path) for entry in entries: is_dir = False try: @@ -504,7 +500,7 @@ def select_recursive_step(stack, match_pos): pass if is_dir or not dir_only: - entry_path = self.parse_entry(entry) + entry_path = self.concat_path(prefix, entry.name) if match is None or match(str(entry_path), match_pos): if dir_only: yield from select_next(entry_path, exists=True) @@ -533,7 +529,6 @@ class 
_StringGlobber(_GlobberBase): """ lexists = staticmethod(os.path.lexists) scandir = staticmethod(os.scandir) - parse_entry = operator.attrgetter('path') concat_path = operator.add if os.name == 'nt': diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 11c8018b28f26b..d426049019b8bc 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -101,18 +101,13 @@ def scandir(path): a context manager. This method is called by walk() and glob(). """ import contextlib - return contextlib.nullcontext(path.iterdir()) + return contextlib.nullcontext(child.dir_entry for child in path.iterdir()) @staticmethod def concat_path(path, text): """Appends text to the given path.""" return path.with_segments(path._raw_path + text) - @staticmethod - def parse_entry(entry): - """Returns the path of an entry yielded from scandir().""" - return entry - class PurePathBase: """Base class for pure path objects. @@ -132,6 +127,12 @@ class PurePathBase: # is being processed by `PathBase.resolve()`. This prevents duplicate # work from occurring when `resolve()` calls `stat()` or `readlink()`. '_resolving', + + # The 'dir_entry' slot stores an `os.DirEntry`-like object or `None`. + # It is available for paths generated from `PathBase.iterdir()`. It is + # defined here rather than in `PathBase` to avoid a class layout + # conflict in `Path`. + 'dir_entry', ) parser = ParserBase() _globber = PathGlobber @@ -142,6 +143,7 @@ def __init__(self, path, *paths): raise TypeError( f"path should be a str, not {type(self._raw_path).__name__!r}") self._resolving = False + self.dir_entry = None def with_segments(self, *pathsegments): """Construct a new path object from any number of path-like objects. 
@@ -696,15 +698,16 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): paths.append((path, dirnames, filenames)) try: for child in path.iterdir(): + entry = child.dir_entry try: - if child.is_dir(follow_symlinks=follow_symlinks): + if entry.is_dir(follow_symlinks=follow_symlinks): if not top_down: paths.append(child) - dirnames.append(child.name) + dirnames.append(entry.name) else: - filenames.append(child.name) + filenames.append(entry.name) except OSError: - filenames.append(child.name) + filenames.append(entry.name) except OSError as error: if on_error is not None: on_error(error) @@ -875,11 +878,12 @@ def copy(self, target, *, follow_symlinks=True, dirs_exist_ok=False, stack = [(self, target)] while stack: src, dst = stack.pop() - if not follow_symlinks and src.is_symlink(): + entry = src.dir_entry or src + if not follow_symlinks and entry.is_symlink(): dst._symlink_to_target_of(src) if preserve_metadata: src._copy_metadata(dst, follow_symlinks=False) - elif src.is_dir(): + elif entry.is_dir(): children = src.iterdir() dst.mkdir(exist_ok=dirs_exist_ok) stack.extend((child, dst.joinpath(child.name)) diff --git a/Lib/pathlib/_local.py b/Lib/pathlib/_local.py index a78997179820b1..971722f5997284 100644 --- a/Lib/pathlib/_local.py +++ b/Lib/pathlib/_local.py @@ -137,6 +137,7 @@ def __init__(self, *args): paths.append(path) # Avoid calling super().__init__, as an optimisation self._raw_paths = paths + self.dir_entry = None def joinpath(self, *pathsegments): """Combine this path with one or several arguments, and return a @@ -622,11 +623,12 @@ def iterdir(self): special entries '.' and '..' are not included. """ root_dir = str(self) - with os.scandir(root_dir) as scandir_it: - paths = [entry.path for entry in scandir_it] - if root_dir == '.': - paths = map(self._remove_leading_dot, paths) - return map(self._from_parsed_string, paths) + str_attr = 'name' if root_dir == '.' 
else 'path' + def parse(entry): + path = self._from_parsed_string(getattr(entry, str_attr)) + path.dir_entry = entry + return path + return map(parse, list(os.scandir(root_dir))) def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False): """Iterate over this subtree and yield all existing files (of any diff --git a/Lib/pathlib/_os.py b/Lib/pathlib/_os.py index 642b3a57c59a1d..3e5830901e8843 100644 --- a/Lib/pathlib/_os.py +++ b/Lib/pathlib/_os.py @@ -192,7 +192,8 @@ def read_file_metadata(path, keys=None, *, follow_symlinks=True): if err.errno not in (EPERM, ENOTSUP, ENODATA, EINVAL, EACCES): raise continue - st = os.stat(path, follow_symlinks=follow_symlinks) + entry = path.dir_entry or path + st = entry.stat(follow_symlinks=follow_symlinks) if key == 'mode': result['mode'] = stat.S_IMODE(st.st_mode) elif key == 'times_ns': diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index 08355a71453807..9341ccd18acdff 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -1424,6 +1424,21 @@ def close(self): 'st_mode st_ino st_dev st_nlink st_uid st_gid st_size st_atime st_mtime st_ctime') +class DummyDirEntry: + __slots__ = ('name', '_is_symlink', '_is_dir') + + def __init__(self, name, is_symlink, is_dir): + self.name = name + self._is_symlink = is_symlink + self._is_dir = is_dir + + def is_symlink(self): + return self._is_symlink + + def is_dir(self, *, follow_symlinks=True): + return self._is_dir and (follow_symlinks or not self._is_symlink) + + class DummyPath(PathBase): """ Simple implementation of PathBase that keeps files and directories in @@ -1492,13 +1507,22 @@ def open(self, mode='r', buffering=-1, encoding=None, return stream def iterdir(self): - path = str(self.resolve()) - if path in self._files: - raise NotADirectoryError(errno.ENOTDIR, "Not a directory", path) - elif path in self._directories: - return iter([self / name for name in 
self._directories[path]]) + path = self.resolve() + path_str = str(path) + if path_str in self._files: + raise NotADirectoryError(errno.ENOTDIR, "Not a directory", path_str) + elif path_str in self._directories: + return iter([self._make_dir_child(path, name) for name in self._directories[path_str]]) else: - raise FileNotFoundError(errno.ENOENT, "File not found", path) + raise FileNotFoundError(errno.ENOENT, "File not found", path_str) + + def _make_dir_child(self, resolved_self, name): + path = self.joinpath(name) + path_str = str(resolved_self.joinpath(name)) + is_symlink = path_str in self._symlinks + is_directory = path_str in self._directories if not is_symlink else path.is_dir() + path.dir_entry = DummyDirEntry(name, is_symlink, is_directory) + return path def mkdir(self, mode=0o777, parents=False, exist_ok=False): path = str(self.parent.resolve() / self.name) @@ -2187,6 +2211,20 @@ def test_iterdir_nodir(self): self.assertIn(cm.exception.errno, (errno.ENOTDIR, errno.ENOENT, errno.EINVAL)) + def test_dir_entry(self): + p = self.cls(self.base) + self.assertIsNone(p.dir_entry) + for child in p.iterdir(): + entry = child.dir_entry + self.assertIsNotNone(entry) + self.assertEqual(entry.name, child.name) + self.assertEqual(entry.is_symlink(), + child.is_symlink()) + self.assertEqual(entry.is_dir(follow_symlinks=False), + child.is_dir(follow_symlinks=False)) + if entry.name != 'brokenLinkLoop': + self.assertEqual(entry.is_dir(), child.is_dir()) + def test_glob_common(self): def _check(glob, expected): self.assertEqual(set(glob), { P(self.base, q) for q in expected }) From 5b0ada9c6a4029cc995a0f7a1f2666d82e88d005 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Fri, 18 Oct 2024 03:15:43 +0100 Subject: [PATCH 2/3] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Doc/library/pathlib.rst | 4 ++-- Doc/whatsnew/3.14.rst 
| 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 087bf33f2d111b..d270dada082add 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -1291,7 +1291,7 @@ Reading directories .. attribute:: Path.dir_entry - In a path yielded from :meth:`Path.iterdir`, this attribute stores a + In a path yielded from :meth:`Path.iterdir`, this attribute stores an :class:`os.DirEntry` object corresponding to the path; in other cases it is ``None``. This can be used to retrieve the file type and attributes of directory children without necessarily incurring further system calls:: @@ -1300,7 +1300,7 @@ Reading directories >>> for child in p.iterdir(): ... entry = child.dir_entry ... if entry.is_dir(): - ... child + ... print(child) ... PosixPath('docs/_templates') PosixPath('docs/_build') diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 37402342fb32c8..e215f9ddf8151e 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -343,9 +343,9 @@ pathlib (Contributed by Barney Gale in :gh:`73991`.) -* Add a :attr:`.Path.dir_entry` attribute. In a path object generated by - :meth:`.Path.iterdir`, it stores a :class:`os.DirEntry` object corresponding - to the path; in other cases it is ``None``. +* Add the :attr:`.Path.dir_entry` attribute. In a path object generated by + :meth:`Path.iterdir <pathlib.Path.iterdir>`, it stores an :class:`os.DirEntry` + object corresponding to the path; in other cases it is ``None``. (Contributed by Barney Gale in :gh:`125413`.) 
From 71082a39ee9fabe977df19557f26d7cbeaadf7e1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 18 Oct 2024 04:06:27 +0100 Subject: [PATCH 3/3] Add news blurb --- .../Library/2024-10-18-04-06-22.gh-issue-125413.EpzLWg.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-10-18-04-06-22.gh-issue-125413.EpzLWg.rst diff --git a/Misc/NEWS.d/next/Library/2024-10-18-04-06-22.gh-issue-125413.EpzLWg.rst b/Misc/NEWS.d/next/Library/2024-10-18-04-06-22.gh-issue-125413.EpzLWg.rst new file mode 100644 index 00000000000000..d939b713046cdb --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-18-04-06-22.gh-issue-125413.EpzLWg.rst @@ -0,0 +1,4 @@ +Add the :attr:`.Path.dir_entry` attribute. In a path object generated by +:meth:`Path.iterdir <pathlib.Path.iterdir>`, it stores an +:class:`os.DirEntry` object corresponding to the path; in other cases it is +``None``.