diff --git a/CHANGELOG.md b/CHANGELOG.md index 5be8f09b..a91bdf61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.2] +### Added +- upath.registry: provide `available_implementations()` and `register_implementation()` (#134). +- upath: add `UPath.storage_options` and `UPath.protocol` (#135). + +### Fixed +- upath: fix `UPath.as_uri()` (#133). + ## [0.1.1] ### Fixed - restore `._kwargs` and `._url` on `PosixUPath` and `WindowsUPath` subclasses (#131). @@ -71,7 +79,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - started a changelog to keep track of significant changes -[Unreleased]: https://github.com/fsspec/universal_pathlib/compare/v0.1.1...HEAD +[Unreleased]: https://github.com/fsspec/universal_pathlib/compare/v0.1.2...HEAD +[0.1.2]: https://github.com/fsspec/universal_pathlib/compare/v0.1.1...v0.1.2 [0.1.1]: https://github.com/fsspec/universal_pathlib/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/fsspec/universal_pathlib/compare/v0.0.24...v0.1.0 [0.0.24]: https://github.com/fsspec/universal_pathlib/compare/v0.0.23...v0.0.24 diff --git a/README.md b/README.md index baf9eda7..160588e0 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,62 @@ If a local path is provided, `UPath` will return a `PosixUPath` or `WindowsUPath These two subclasses are 100% compatible with the `PosixPath` and `WindowsPath` classes of their specific Python version, and are tested against all relevant tests of the CPython pathlib test-suite. 
+### UPath public class API + +`UPath`'s public class interface is identical to `pathlib.Path` with the addition of the following attributes: + +- `UPath(...).protocol: str` the filesystem_spec protocol _(note: for `PosixUPath` and `WindowsUPath` it's an empty string)_ +- `UPath(...).storage_options: dict[str, Any]` the storage options for instantiating the filesystem_spec class +- `UPath(...).path: str` the filesystem_spec compatible path for use with filesystem instances +- `UPath(...).fs: AbstractFileSystem` convenience attribute to access an instantiated filesystem + +The first three provide a public interface to access a file via fsspec as follows: + +```python +from upath import UPath +from fsspec import filesystem + +p = UPath("s3://bucket/file.txt", anon=True) + +fs = filesystem(p.protocol, **p.storage_options) # equivalent to p.fs +with fs.open(p.path) as f: + data = f.read() +``` + +### Register custom UPath implementations + +In case you develop a custom UPath implementation, feel free to open an issue to discuss integrating it +in `universal_pathlib`. You can dynamically register your implementation too! Here are your options: + +#### Dynamic registration from Python + +```python +# for example: my_module/submodule.py +from upath import UPath +from upath.registry import register_implementation + +my_protocol = "myproto" +class MyPath(UPath): + ... # your custom implementation + +register_implementation(my_protocol, MyPath) +``` + +#### Registration via entry points + +```toml +# pyproject.toml +[project.entry-points."universal_pathlib.implementations"] +myproto = "my_module.submodule:MyPath" +``` + +```ini +# setup.cfg +[options.entry_points] +universal_pathlib.implementations = + myproto = my_module.submodule:MyPath +``` + ## Contributing Contributions are very welcome. 
diff --git a/upath/core.py b/upath/core.py index 9e07e8db..62f2d2c2 100644 --- a/upath/core.py +++ b/upath/core.py @@ -44,7 +44,7 @@ def __init__(self, parsed_url: SplitResult | None, **kwargs: Any) -> None: self._fs = cls(**url_kwargs) def _format_path(self, path: UPath) -> str: - return path.path + return path._path def open(self, path, mode="r", *args, **kwargs): return self._fs.open(self._format_path(path), mode, *args, **kwargs) @@ -206,6 +206,44 @@ def __new__(cls: type[PT], *args: str | PathLike, **kwargs: Any) -> PT: args_list, url=parsed_url, **kwargs ) + @property + def protocol(self) -> str: + """The filesystem_spec protocol + + For local paths protocol is either 'file' if the UPath instance + is backed by fsspec or '' if it's backed by stdlib pathlib. For + both `fsspec.get_filesystem_class` returns `LocalFileSystem`. + """ + if self._url is None: + return "" + return self._url.scheme + + @property + def storage_options(self) -> dict[str, Any]: + """The filesystem_spec storage options dictionary + + Accessing `.storage_options` does not instantiate the + corresponding fsspec filesystem class. + """ + return { + key: value + for key, value in self._kwargs.items() + if key not in {"scheme", "netloc", "url"} + } + + @property + def fs(self) -> AbstractFileSystem: + """The filesystem_spec filesystem instance""" + return self._accessor._fs + + @property + def path(self) -> str: + """The filesystem_spec path for use with a filesystem instance + + Note: for some file systems this can be prefixed by the protocol. 
+ """ + return self._path + def __getattr__(self, item: str) -> Any: if item == "_accessor": # cache the _accessor attribute on first access @@ -258,7 +296,7 @@ def _format_parsed_parts( return formatted @property - def path(self) -> str: + def _path(self) -> str: if self._parts: join_parts = self._parts[1:] if self._parts[0] == "/" else self._parts path: str = self._flavour.join(join_parts) @@ -349,7 +387,7 @@ def rglob(self: PT, pattern: str) -> Generator[PT, None, None]: def _sub_path(self, name): # only want the path name with iterdir - sp = self.path + sp = self._path return re.sub(f"^({sp}|{sp[1:]})/", "", name) def absolute(self: PT) -> PT: @@ -631,10 +669,6 @@ def __str__(self) -> str: ) return self._str - @property - def fs(self) -> AbstractFileSystem: - return self._accessor._fs - def __truediv__(self: PT, key: str | PathLike) -> PT: # Add `/` root if not present if len(self._parts) == 0: @@ -722,6 +756,9 @@ def parents(self) -> _UPathParents: """A sequence of this upath's logical parents.""" return _UPathParents(self) + def as_uri(self) -> str: + return str(self) + class _UPathParents(Sequence[UPath]): """This object provides sequence-like access to the logical ancestors diff --git a/upath/implementations/cloud.py b/upath/implementations/cloud.py index b3fffd6d..d03388f4 100644 --- a/upath/implementations/cloud.py +++ b/upath/implementations/cloud.py @@ -10,7 +10,7 @@ def _format_path(self, path): """ netloc has already been set to project via `CloudPath._from_parts` """ - return f"{path._url.netloc}/{path.path.lstrip('/')}" + return f"{path._url.netloc}/{path._path.lstrip('/')}" def mkdir(self, path, create_parents=True, **kwargs): _path = self._format_path(path) @@ -49,7 +49,7 @@ def _sub_path(self, name): `listdir` and `glob`. However, in `iterdir` and `glob` we only want the relative path to `self`. 
""" - sp = re.escape(self.path) + sp = re.escape(self._path) netloc = self._url.netloc return re.sub( f"^({netloc})?/?({sp}|{sp[1:]})/?", @@ -71,6 +71,12 @@ def joinpath(self, *args): self._kwargs["bucket"] = bucket return super().joinpath(*tuple(args_list)) + @property + def path(self) -> str: + if self._url is None: + raise RuntimeError(str(self)) + return f"{self._url.netloc}{super()._path}" + class GCSPath(CloudPath): pass diff --git a/upath/implementations/http.py b/upath/implementations/http.py index 14c14b3f..6f215d93 100644 --- a/upath/implementations/http.py +++ b/upath/implementations/http.py @@ -1,5 +1,7 @@ from __future__ import annotations +from urllib.parse import urlunsplit + from fsspec.asyn import sync import upath.core @@ -43,7 +45,7 @@ def _sub_path(self, name): relative path to `self`. """ complete_address = self._format_parsed_parts( - None, None, [self.path], url=self._url, **self._kwargs + None, None, [self._path], url=self._url, **self._kwargs ) if name.startswith(complete_address): @@ -83,3 +85,10 @@ def resolve( break return resolved_path + + @property + def path(self) -> str: + # http filesystems use the full url as path + if self._url is None: + raise RuntimeError(str(self)) + return urlunsplit(self._url) diff --git a/upath/implementations/webdav.py b/upath/implementations/webdav.py index d665dce1..434f0444 100644 --- a/upath/implementations/webdav.py +++ b/upath/implementations/webdav.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import Any from urllib.parse import ParseResult from urllib.parse import urlunsplit @@ -49,3 +50,20 @@ def _sub_path(self, name): name = name.strip("/") return name + + @property + def protocol(self) -> str: + if self._url is None: + raise RuntimeError(str(self)) + return self._url.scheme.split("+")[0] + + @property + def storage_options(self) -> dict[str, Any]: + if self._url is None: + raise RuntimeError(str(self)) + sopts = super().storage_options + http_protocol = 
self._url.scheme.split("+")[1] + assert http_protocol in {"http", "https"} + base_url = urlunsplit(self._url._replace(scheme=http_protocol, path="")) + sopts["base_url"] = base_url + return sopts diff --git a/upath/registry.py b/upath/registry.py index b30fd6d1..43bf3ccc 100644 --- a/upath/registry.py +++ b/upath/registry.py @@ -1,22 +1,63 @@ +"""upath.registry -- registry for file system specific implementations + +Retrieve UPath implementations via `get_upath_class`. +Register custom UPath subclasses in one of two ways: + +### directly from Python + +>>> from upath import UPath +>>> from upath.registry import register_implementation +>>> my_protocol = "myproto" +>>> class MyPath(UPath): +... pass +>>> register_implementation(my_protocol, MyPath) + +### via entry points + +```toml +# pyproject.toml +[project.entry-points."universal_pathlib.implementations"] +myproto = "my_module.submodule:MyPath" +``` + +```ini +# setup.cfg +[options.entry_points] +universal_pathlib.implementations = + myproto = my_module.submodule:MyPath +``` +""" from __future__ import annotations -import importlib import os +import re +import sys import warnings +from collections import ChainMap from functools import lru_cache -from typing import TYPE_CHECKING +from importlib import import_module +from importlib.metadata import entry_points +from typing import Iterator +from typing import MutableMapping from fsspec.core import get_filesystem_class +from fsspec.registry import available_protocols -if TYPE_CHECKING: - from upath.core import UPath +import upath.core __all__ = [ "get_upath_class", + "available_implementations", + "register_implementation", ] -class _Registry: +_ENTRY_POINT_GROUP = "universal_pathlib.implementations" + + +class _Registry(MutableMapping[str, "type[upath.core.UPath]"]): + """internal registry for UPath subclasses""" + known_implementations: dict[str, str] = { "abfs": "upath.implementations.cloud.AzurePath", "abfss": "upath.implementations.cloud.AzurePath", @@ -35,26 
+76,118 @@ class _Registry: "webdav+https": "upath.implementations.webdav.WebdavPath", } - def __getitem__(self, item: str) -> type[UPath] | None: - try: - fqn = self.known_implementations[item] - except KeyError: - return None - module_name, name = fqn.rsplit(".", 1) - mod = importlib.import_module(module_name) - return getattr(mod, name) # type: ignore + def __init__(self) -> None: + if sys.version_info >= (3, 10): + eps = entry_points(group=_ENTRY_POINT_GROUP) + else: + eps = entry_points().get(_ENTRY_POINT_GROUP, []) + self._entries = {ep.name: ep for ep in eps} + self._m = ChainMap({}, self.known_implementations) # type: ignore + + def __contains__(self, item: object) -> bool: + return item in set().union(self._m, self._entries) + + def __getitem__(self, item: str) -> type[upath.core.UPath]: + fqn = self._m.get(item) + if fqn is None: + if item in self._entries: + fqn = self._m[item] = self._entries[item].load() + if fqn is None: + raise KeyError(f"{item} not in registry") + if isinstance(fqn, str): + module_name, name = fqn.rsplit(".", 1) + mod = import_module(module_name) + cls = getattr(mod, name) # type: ignore + else: + cls = fqn + return cls + + def __setitem__(self, item: str, value: type[upath.core.UPath] | str) -> None: + if not ( + (isinstance(value, type) and issubclass(value, upath.core.UPath)) + or isinstance(value, str) + ): + raise ValueError( + f"expected UPath subclass or FQN-string, got: {type(value).__name__!r}" + ) + self._m[item] = value + + def __delitem__(self, __v: str) -> None: + raise NotImplementedError("removal is unsupported") + + def __len__(self) -> int: + return len(set().union(self._m, self._entries)) + + def __iter__(self) -> Iterator[str]: + return iter(set().union(self._m, self._entries)) _registry = _Registry() -@lru_cache -def get_upath_class(protocol: str) -> type[UPath] | None: - """Return the upath cls for the given protocol.""" - cls: type[UPath] | None = _registry[protocol] - if cls is not None: - return cls +def 
available_implementations(*, fallback: bool = False) -> list[str]: + """return a list of protocols for available implementations + + Parameters + ---------- + fallback: + If True, also return protocols for fsspec filesystems without + an implementation in universal_pathlib. + """ + impl = list(_registry) + if not fallback: + return impl else: + return list({*impl, *available_protocols()}) + + +def register_implementation( + protocol: str, + cls: type[upath.core.UPath] | str, + *, + clobber: bool = False, +) -> None: + """register a UPath implementation with a protocol + + Parameters + ---------- + protocol: + Protocol name to associate with the class + cls: + The UPath subclass for the protocol or a str representing the + full path to an implementation class like package.module.class. + clobber: + Whether to overwrite a protocol with the same name; if False, + will raise instead. + """ + if not re.match(r"^[a-z][a-z0-9+_.]+$", protocol): + raise ValueError(f"{protocol!r} is not a valid URI scheme") + if not clobber and protocol in _registry: + raise ValueError(f"{protocol!r} is already in registry and clobber is False!") + _registry[protocol] = cls + + +@lru_cache +def get_upath_class( + protocol: str, + *, + fallback: bool = True, +) -> type[upath.core.UPath] | None: + """Return the upath cls for the given protocol. + + Returns `None` if no matching protocol can be found. + + Parameters + ---------- + protocol: + The protocol string + fallback: + If fallback is False, don't return UPath instances for fsspec + filesystems that don't have an implementation registered. 
+ """ + try: + return _registry[protocol] + except KeyError: if not protocol: if os.name == "nt": from upath.implementations.local import WindowsUPath @@ -64,6 +197,8 @@ def get_upath_class(protocol: str) -> type[UPath] | None: from upath.implementations.local import PosixUPath return PosixUPath + if not fallback: + return None try: _ = get_filesystem_class(protocol) except ValueError: @@ -76,5 +211,4 @@ def get_upath_class(protocol: str) -> type[UPath] | None: UserWarning, stacklevel=2, ) - mod = importlib.import_module("upath.core") - return mod.UPath # type: ignore + return upath.core.UPath diff --git a/upath/tests/cases.py b/upath/tests/cases.py index ba0fbcb8..3b553bd2 100644 --- a/upath/tests/cases.py +++ b/upath/tests/cases.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest +from fsspec import filesystem from upath import UPath @@ -408,4 +409,32 @@ def test_private_url_attr_in_sync(self): p1 = self.path.joinpath("c") p2 = self.path / "c" assert p1._url == p2._url - assert p1 != p._url + assert p1._url != p._url + + def test_as_uri(self): + # test that we can reconstruct the path from the uri + p0 = self.path + uri = p0.as_uri() + p1 = UPath(uri, **p0.fs.storage_options) + assert p0 == p1 + + def test_protocol(self): + protocol = self.path.protocol + protocols = [p] if isinstance((p := type(self.path.fs).protocol), str) else p + print(protocol, protocols) + assert protocol in protocols + + def test_storage_options(self): + storage_options = self.path.storage_options + assert storage_options == self.path.fs.storage_options + + def test_read_with_fsspec(self): + p = self.path.joinpath("file2.txt") + + protocol = p.protocol + storage_options = p.storage_options + path = p.path + + fs = filesystem(protocol, **storage_options) + with fs.open(path) as f: + assert f.read() == b"hello world" diff --git a/upath/tests/implementations/test_azure.py b/upath/tests/implementations/test_azure.py index 74be18f1..ececfae3 100644 --- 
a/upath/tests/implementations/test_azure.py +++ b/upath/tests/implementations/test_azure.py @@ -44,3 +44,8 @@ def test_makedirs_exist_ok_false(self): def test_rglob(self, pathlib_base): return super().test_rglob(pathlib_base) + + def test_protocol(self): + # test all valid protocols for azure... + protocol = self.path.protocol + assert protocol in ["abfs", "abfss", "adl", "az"] diff --git a/upath/tests/implementations/test_webdav.py b/upath/tests/implementations/test_webdav.py index 0b534112..e9a9678e 100644 --- a/upath/tests/implementations/test_webdav.py +++ b/upath/tests/implementations/test_webdav.py @@ -12,3 +12,11 @@ def path(self, webdav_fixture): def test_fsspec_compat(self): pass + + def test_storage_options(self): + # we need to add base_url to storage options for webdav filesystems, + # to be able to serialize the http protocol to string... + storage_options = self.path.storage_options + base_url = storage_options.pop("base_url") + assert storage_options == self.path.fs.storage_options + assert base_url == self.path.fs.client.base_url diff --git a/upath/tests/test_registry.py b/upath/tests/test_registry.py new file mode 100644 index 00000000..148f0324 --- /dev/null +++ b/upath/tests/test_registry.py @@ -0,0 +1,126 @@ +import pytest +from fsspec.registry import available_protocols + +from upath import UPath +from upath.registry import available_implementations +from upath.registry import get_upath_class +from upath.registry import register_implementation + +IMPLEMENTATIONS = { + "abfs", + "abfss", + "adl", + "az", + "file", + "gcs", + "gs", + "hdfs", + "http", + "https", + "memory", + "s3", + "s3a", + "webdav+http", + "webdav+https", +} + + +@pytest.fixture(autouse=True) +def reset_registry(): + from upath.registry import _registry + + try: + yield + finally: + _registry._m.maps[0].clear() # type: ignore + + +@pytest.fixture() +def fake_entrypoint(): + from importlib.metadata import EntryPoint + + from upath.registry import _registry + + ep = EntryPoint( 
+ name="myeps", + value="upath.core:UPath", + group="universal_pathlib.implementations", + ) + old_registry = _registry._entries.copy() + + try: + _registry._entries["myeps"] = ep + yield + finally: + _registry._entries.clear() + _registry._entries.update(old_registry) + + +def test_available_implementations(): + impl = available_implementations() + assert len(impl) == len(set(impl)) + assert set(impl) == IMPLEMENTATIONS + + +def test_available_implementations_with_fallback(): + impl = available_implementations(fallback=True) + assert set(impl) == IMPLEMENTATIONS.union(available_protocols()) + + +def test_available_implementations_with_entrypoint(fake_entrypoint): + impl = available_implementations() + assert set(impl) == IMPLEMENTATIONS.union({"myeps"}) + + +def test_register_implementation(): + class MyProtoPath(UPath): + pass + + register_implementation("myproto", MyProtoPath) + + assert get_upath_class("myproto") is MyProtoPath + + +def test_register_implementation_wrong_input(): + with pytest.raises(TypeError): + register_implementation(None, UPath) # type: ignore + with pytest.raises(ValueError): + register_implementation("incorrect**protocol", UPath) + with pytest.raises(ValueError): + register_implementation("myproto", object, clobber=True) # type: ignore + with pytest.raises(ValueError): + register_implementation("file", UPath, clobber=False) + assert set(available_implementations()) == IMPLEMENTATIONS + + +@pytest.mark.parametrize("protocol", IMPLEMENTATIONS) +def test_get_upath_class(protocol): + upath_cls = get_upath_class("file") + assert issubclass(upath_cls, UPath) + + +def test_get_upath_class_without_implementation(clear_registry): + with pytest.warns( + UserWarning, match="UPath 'mock' filesystem not explicitly implemented." 
+ ): + upath_cls = get_upath_class("mock") + assert issubclass(upath_cls, UPath) + + +def test_get_upath_class_without_implementation_no_fallback(clear_registry): + assert get_upath_class("mock", fallback=False) is None + + +def test_get_upath_class_unknown_protocol(clear_registry): + assert get_upath_class("doesnotexist") is None + + +def test_get_upath_class_from_entrypoint(fake_entrypoint): + assert issubclass(get_upath_class("myeps"), UPath) + + +@pytest.mark.parametrize( + "protocol", [pytest.param("", id="empty-str"), pytest.param(None, id="none")] +) +def test_get_upath_class_falsey_protocol(protocol): + assert issubclass(get_upath_class(protocol), UPath)