From 3646cd2c4fa35b1c5b593663509dadeb903967e3 Mon Sep 17 00:00:00 2001 From: Andreas Poehlmann Date: Sun, 3 Mar 2024 23:14:02 +0100 Subject: [PATCH] upath: implement poc for flavour base vendoring (#200) * upath: implement poc for flavour base vendoring * update sources generator * upath: update flavour implementation * upath.implementations: adjust for codechanges * upath: fix resolving issue * upath: provide default flavour * upath.implementations: cleanup * upath.core: fix prefix issue with glob on windows * upath._flavour: for file/local get drive on windows * upath._flavour: move _deprecated to upath._compat * upath._flavour: use local_file attribute in splitdrive * upath._flavour: use os.path for local_file in isabs * readme: fix toml entrypoint spelling * upath: fallback classmethod for UPath._parse_path and UPath._format_parsed_parts * flavours: fix reproducibility in flavour generate script * upath._flavour: refactor flavour settings * tests: test stat * upath: move flavour specializations to subclasses * tests: adjust resolve test for http paths * upath: ensure support for __fspath__ args * upath.implementations.local: suppress warning * test: adjust fspath test for windows * upath.local: WindowsUPath.path should return the posix version * upath.local: fix WindowsUPath.path --- .pre-commit-config.yaml | 2 +- README.md | 2 +- dev/generate_flavours.py | 350 +++++++++++++ upath/_compat.py | 43 ++ upath/_flavour.py | 542 ++++++++++++-------- upath/_flavour_sources.py | 866 ++++++++++++++++++++++++++++++++ upath/_protocol.py | 28 +- upath/_stat.py | 2 + upath/core.py | 150 ++++-- upath/implementations/cloud.py | 14 +- upath/implementations/http.py | 30 +- upath/implementations/local.py | 10 +- upath/implementations/webdav.py | 13 + upath/tests/test_core.py | 23 +- upath/tests/test_stat.py | 99 ++++ 15 files changed, 1856 insertions(+), 318 deletions(-) create mode 100644 dev/generate_flavours.py create mode 100644 upath/_flavour_sources.py create mode 100644 
upath/tests/test_stat.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f9c9bb2b..ce4f9932 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ default_language_version: python: python3 -exclude: ^upath/tests/pathlib/test_pathlib.*\.py|^upath/tests/pathlib/_test_support\.py +exclude: ^upath/tests/pathlib/test_pathlib.*\.py|^upath/tests/pathlib/_test_support\.py|^upath/_flavour_sources\.py repos: - repo: https://github.com/psf/black rev: 24.1.1 diff --git a/README.md b/README.md index 4d6654ad..f46feefa 100644 --- a/README.md +++ b/README.md @@ -346,7 +346,7 @@ If you distribute your implementation in your own Python package, you can inform ``` # pyproject.toml -[project.entry-points."unversal_pathlib.implementations"] +[project.entry-points."universal_pathlib.implementations"] myproto = "my_module.submodule:MyPath" ``` diff --git a/dev/generate_flavours.py b/dev/generate_flavours.py new file mode 100644 index 00000000..f37780bc --- /dev/null +++ b/dev/generate_flavours.py @@ -0,0 +1,350 @@ +"""Generates the _flavour_sources.py file""" + +from __future__ import annotations + +import inspect +import re +import sys +import warnings +from io import StringIO +from typing import Any +from unittest.mock import Mock + +from fsspec.registry import available_protocols +from fsspec.registry import get_filesystem_class +from fsspec.spec import AbstractFileSystem +from fsspec.utils import get_package_version_without_import + +HEADER = '''\ +""" upath._flavour_sources + + + +Warning +------- + Do not modify this file manually! + It is generated by `dev/generate_flavours.py` + +To be able to parse the different filesystem uri schemes, we need +the string parsing functionality each of the filesystem implementations. 
+In an attempt to support parsing uris without having to import the +specific filesystems, we extract the necessary subset of the +AbstractFileSystem classes and generate a new "flavour" class for +each of the known filesystems. This will allow us to provide a +`PurePath` equivalent `PureUPath` for each protocol in the future +without a direct dependency on the underlying filesystem package. + +""" +''' + +IMPORTS = """\ +from __future__ import annotations + +import logging +import re +from typing import Any +from typing import cast +from urllib.parse import parse_qs +from urllib.parse import urlsplit + +from fsspec.implementations.local import make_path_posix +from fsspec.utils import infer_storage_options +from fsspec.utils import stringify_path + +""" + +INIT_CODE = '''\ +__all__ = [ + "AbstractFileSystemFlavour", + "FileSystemFlavourBase", + "flavour_registry", +] + +logger = logging.getLogger(__name__) +flavour_registry: dict[str, type[FileSystemFlavourBase]] = {} + + +class FileSystemFlavourBase: + """base class for the fsspec flavours""" + + def __init_subclass__(cls: Any, **kwargs): + if isinstance(cls.protocol, str): + protocols = (cls.protocol,) + else: + protocols = tuple(cls.protocol) + for protocol in protocols: + if protocol in flavour_registry: + raise ValueError(f"protocol {protocol!r} already registered") + flavour_registry[protocol] = cls +''' + +BASE_CLASS_NAME_SUFFIX = "Flavour" +BASE_CLASS_NAME = f"{AbstractFileSystem.__name__}{BASE_CLASS_NAME_SUFFIX}" + +SKIP_PROTOCOLS = [ + "dir", + "blockcache", + "cached", + "simplecache", + "filecache", +] + +FIX_PROTOCOLS = { + "MemFS": ("memfs",), + "AsyncLocalFileSystem": (), +} + +FIX_METHODS = { + "GCSFileSystem": ["_strip_protocol", "_get_kwargs_from_urls", "_split_path"], +} + + +def _fix_azure_blob_file_system(x: str) -> str: + return re.sub( + r"host = ops.get\(\"host\", None\)", + 'host: str | None = ops.get("host", None)', + x, + ) + + +def _fix_memfs_file_system(x: str) -> str: + return re.sub( 
+ "_MemFS", + "MemoryFileSystemFlavour", + x, + ) + + +def _fix_xrootd_file_system(x: str) -> str: + x = re.sub( + r"client.URL", + "urlsplit", + x, + ) + return re.sub( + "url.hostid", + "url.netloc", + x, + ) + + +FIX_SOURCE = { + "AzureBlobFileSystem": _fix_azure_blob_file_system, + "MemFS": _fix_memfs_file_system, + "XRootDFileSystem": _fix_xrootd_file_system, +} + + +def before_imports() -> None: + """allow to patch the generated state before importing anything""" + # patch libarchive + sys.modules["libarchive"] = Mock() + sys.modules["libarchive.ffi"] = Mock() + # patch xrootd + sys.modules["XRootD"] = Mock() + sys.modules["XRootD.client"] = Mock() + sys.modules["XRootD.client.flags"] = Mock() + sys.modules["XRootD.client.responses"] = Mock() + + +def get_protos(cls: type, remove: str, add: str) -> tuple[str, ...]: + try: + return FIX_PROTOCOLS[cls.__name__] + except KeyError: + pass + if isinstance(cls.protocol, str): + p = [cls.protocol, add] + else: + p = [*cls.protocol, add] + return tuple([x for x in p if x != remove]) + + +def get_fsspec_filesystems_and_protocol_errors() -> ( + tuple[dict[type[AbstractFileSystem], tuple[str, ...]], dict[str, str]] +): + before_imports() + + classes: dict[type[AbstractFileSystem], tuple[str]] = {} + errors: dict[str, str] = {} + + for protocol in available_protocols(): + if protocol in SKIP_PROTOCOLS: + continue + try: + cls = get_filesystem_class(protocol) + except ImportError as err: + errors[protocol] = str(err) + else: + protos = get_protos(cls, remove="abstract", add=protocol) + cprotos = classes.get(cls, []) + classes[cls] = tuple(dict.fromkeys([*cprotos, *protos])) + return classes, errors + + +def _get_plain_method(cls, name): + for c in cls.__mro__: + try: + return c.__dict__[name] + except KeyError: + pass + else: + raise AttributeError(f"{cls.__name__}.{name} not found") + + +def get_subclass_methods(cls: type) -> list[str]: # noqa: C901 + try: + return FIX_METHODS[cls.__name__] + except KeyError: + pass + 
errors = [] + + # storage options + so = None + base_get_kwargs_from_urls = _get_plain_method( + AbstractFileSystem, "_get_kwargs_from_urls" + ) + try: + cls_get_kwargs_from_urls = _get_plain_method(cls, "_get_kwargs_from_urls") + except AttributeError: + errors.append("missing `_get_kwargs_from_urls()`") + else: + so = cls_get_kwargs_from_urls is base_get_kwargs_from_urls + if not isinstance(cls_get_kwargs_from_urls, staticmethod): + warnings.warn( + f"{cls.__name__}: {cls_get_kwargs_from_urls!r} not a staticmethod", + RuntimeWarning, + stacklevel=2, + ) + + # strip protocol + sp = None + base_strip_protocol = _get_plain_method(AbstractFileSystem, "_strip_protocol") + try: + cls_strip_protocol = _get_plain_method(cls, "_strip_protocol") + except AttributeError: + errors.append("missing `_strip_protocol()`") + else: + if isinstance(cls_strip_protocol, staticmethod): + warnings.warn( + f"{cls.__name__}: {cls_strip_protocol.__name__!r} is not a classmethod", + UserWarning, + stacklevel=2, + ) + sp = False + elif isinstance(cls_strip_protocol, classmethod): + sp = cls_strip_protocol.__func__ is base_strip_protocol.__func__ + else: + errors.append( + f"{cls.__name__}: {cls_strip_protocol.__name__!r} not a classmethod" + ) + + # _parent + pt = None + base_parent = _get_plain_method(AbstractFileSystem, "_parent") + try: + cls_parent = _get_plain_method(cls, "_parent") + except AttributeError: + errors.append("missing `_parent()`") + else: + pt = cls_parent is base_parent + + if errors or sp is None or so is None: + raise AttributeError(" AND ".join(errors)) + + methods = [] + if not sp: + methods.append("_strip_protocol") + if not so: + methods.append("_get_kwargs_from_urls") + if not pt: + methods.append("_parent") + return methods + + +def generate_class_source_code( + cls: type, + methods: list[str], + overrides: dict[str, Any], + attributes: list[str], + cls_suffix: str, + base_cls: str | None, +) -> str: + s = ["\n"] + if base_cls: + s += [f"class 
{cls.__name__}{cls_suffix}({base_cls}):"] + else: + s += [f"class {cls.__name__}{cls_suffix}:"] + mod_ver = get_package_version_without_import(cls.__module__.partition(".")[0]) + s.append(f" __orig_class__ = '{cls.__module__}.{cls.__name__}'") + s.append(f" __orig_version__ = {mod_ver!r}") + for attr, value in overrides.items(): + s.append(f" {attr} = {value!r}") + for attr in attributes: + s.append(f" {attr} = {getattr(cls, attr)!r}") + s.append("") + for method in methods: + s.append(inspect.getsource(getattr(cls, method))) + try: + fix_func = FIX_SOURCE[cls.__name__] + except KeyError: + return "\n".join(s) + else: + return "\n".join(fix_func(line) for line in s) + + +def create_source() -> str: + buf = StringIO() + buf.write(HEADER) + + classes, errors = get_fsspec_filesystems_and_protocol_errors() + + srcs = [ + generate_class_source_code( + AbstractFileSystem, + ["_strip_protocol", "_get_kwargs_from_urls", "_parent"], + {}, + ["protocol", "root_marker"], + cls_suffix=BASE_CLASS_NAME_SUFFIX, + base_cls="FileSystemFlavourBase", + ) + ] + + for cls in sorted(classes, key=lambda cls: cls.__name__): + try: + sub_cls_methods = get_subclass_methods(cls) + except AttributeError as err: + protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol + for proto in protos: + errors[proto] = str(err) + continue + sub_cls = generate_class_source_code( + cls, + sub_cls_methods, + {"protocol": classes[cls]}, + ["root_marker", "sep"], + cls_suffix=BASE_CLASS_NAME_SUFFIX, + base_cls=BASE_CLASS_NAME, + ) + srcs.append(sub_cls) + + if SKIP_PROTOCOLS: + buf.write("#\n# skipping protocols:\n") + for protocol in sorted(SKIP_PROTOCOLS): + buf.write(f"# - {protocol}\n") + + if errors: + buf.write("# protocol import errors:\n") + for protocol, error_msg in sorted(errors.items()): + buf.write(f"# - {protocol} ({error_msg})\n") + buf.write("#\n") + + buf.write(IMPORTS) + buf.write(INIT_CODE) + for cls_src in srcs: + buf.write(cls_src) + + return 
buf.getvalue().removesuffix("\n") + + +if __name__ == "__main__": + print(create_source()) diff --git a/upath/_compat.py b/upath/_compat.py index d80a0f0b..cb6b45b2 100644 --- a/upath/_compat.py +++ b/upath/_compat.py @@ -6,10 +6,13 @@ import sys import warnings from collections.abc import Sequence +from functools import wraps from pathlib import Path from pathlib import PurePath from typing import TYPE_CHECKING from typing import Any +from typing import Callable +from typing import TypeVar from urllib.parse import SplitResult from fsspec import get_filesystem_class @@ -22,6 +25,7 @@ "str_remove_prefix", "str_remove_suffix", "FSSpecAccessorShim", + "deprecated", ] @@ -484,3 +488,42 @@ def mv(self, path, target, recursive=False, maxdepth=None, **kwargs): maxdepth=maxdepth, **kwargs, ) + + +F = TypeVar("F") + + +def deprecated(*, python_version: tuple[int, ...]) -> Callable[[F], F]: + """marks function as deprecated""" + pyver_str = ".".join(map(str, python_version)) + + def deprecated_decorator(func: F) -> F: + if sys.version_info >= python_version: + + @wraps(func) + def wrapper(*args, **kwargs): + warnings.warn( + f"{func.__name__} is deprecated on py>={pyver_str}", + DeprecationWarning, + stacklevel=2, + ) + return func(*args, **kwargs) + + return wrapper + + else: + return func + + return deprecated_decorator + + +class method_and_classmethod: + """Allow a method to be used as both a method and a classmethod""" + + def __init__(self, method): + self.method = method + + def __get__(self, instance, owner): + if instance is None: + return self.method.__get__(owner) + return self.method.__get__(instance) diff --git a/upath/_flavour.py b/upath/_flavour.py index aba592ed..6bbabf0f 100644 --- a/upath/_flavour.py +++ b/upath/_flavour.py @@ -1,15 +1,15 @@ from __future__ import annotations -import ntpath import os.path import posixpath import sys import warnings from functools import lru_cache -from functools import wraps +from typing import TYPE_CHECKING from typing 
import Any -from typing import Callable -from typing import Iterable +from typing import Mapping +from typing import Sequence +from typing import TypedDict from typing import Union from urllib.parse import urlsplit @@ -18,38 +18,69 @@ else: TypeAlias = Any +from fsspec.registry import known_implementations +from fsspec.registry import registry as class_registry +from fsspec.spec import AbstractFileSystem + +from upath._compat import deprecated from upath._compat import str_remove_prefix from upath._compat import str_remove_suffix +from upath._flavour_sources import FileSystemFlavourBase +from upath._flavour_sources import flavour_registry from upath._protocol import get_upath_protocol -from upath._protocol import strip_upath_protocol +from upath._protocol import normalize_empty_netloc -PathOrStr: TypeAlias = Union[str, "os.PathLike[str]"] +if TYPE_CHECKING: + from upath.core import UPath __all__ = [ - "FSSpecFlavour", + "LazyFlavourDescriptor", + "default_flavour", "upath_urijoin", + "upath_get_kwargs_from_url", ] +class_registry: Mapping[str, type[AbstractFileSystem]] +PathOrStr: TypeAlias = Union[str, "os.PathLike[str]"] -def _deprecated(func): - if sys.version_info >= (3, 12): - @wraps(func) - def wrapper(*args, **kwargs): - warnings.warn( - f"{func.__name__} is deprecated on py3.12", - DeprecationWarning, - stacklevel=2, - ) - return func(*args, **kwargs) +class AnyProtocolFileSystemFlavour(FileSystemFlavourBase): + sep: str = "/" + protocol: tuple[str, ...] 
= () + root_marker: str = "/" + + @classmethod + def _strip_protocol(cls, path: str) -> str: + protocol = get_upath_protocol(path) + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.rstrip("/") + return path or cls.root_marker + + @staticmethod + def _get_kwargs_from_urls(path: str) -> dict[str, Any]: + return {} + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if "/" in path: + parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker) + return cls.root_marker + parent + else: + return cls.root_marker - return wrapper - else: - return func +class ProtocolConfig(TypedDict): + netloc_is_anchor: set[str] + supports_empty_parts: set[str] + meaningful_trailing_slash: set[str] -class FSSpecFlavour: - """fsspec flavour for universal_pathlib + +class WrappedFileSystemFlavour: # (pathlib_abc.FlavourBase) + """flavour class for universal_pathlib **INTERNAL AND VERY MUCH EXPERIMENTAL** @@ -57,7 +88,7 @@ class FSSpecFlavour: PurePathBase-like objects. Note: - In case you find yourself in need of subclassing FSSpecFlavour, + In case you find yourself in need of subclassing this class, please open an issue in the universal_pathlib issue tracker: https://github.com/fsspec/universal_pathlib/issues Ideally we can find a way to make your use-case work by adding @@ -65,144 +96,249 @@ class FSSpecFlavour: """ + # Note: + # It would be ideal if there would be a way to avoid the need for + # indicating the following settings via the protocol. This is a + # workaround to be able to implement the flavour correctly. + # TODO: + # These settings should be configured on the UPath class?!? 
+ protocol_config: ProtocolConfig = { + "netloc_is_anchor": { + "http", + "https", + "s3", + "s3a", + "gs", + "gcs", + "az", + "adl", + "abfs", + "webdav+http", + "webdav+https", + }, + "supports_empty_parts": { + "http", + "https", + "s3", + "s3a", + "gs", + "gcs", + "az", + "adl", + "abfs", + }, + "meaningful_trailing_slash": { + "http", + "https", + }, + } + def __init__( self, + spec: type[AbstractFileSystem | FileSystemFlavourBase] | AbstractFileSystem, *, - # URI behavior - join_prepends_protocol: bool = False, - join_like_urljoin: bool = False, + netloc_is_anchor: bool = False, supports_empty_parts: bool = False, - supports_netloc: bool = False, - supports_query_parameters: bool = False, - supports_fragments: bool = False, - posixpath_only: bool = True, - # configurable separators - sep: str = "/", - altsep: str | None = None, - ): - self._owner = None - # separators - self.sep = sep - self.altsep = altsep - # configuration - self.join_prepends_protocol = join_prepends_protocol - self.join_like_urljoin = join_like_urljoin - self.supports_empty_parts = supports_empty_parts - self.supports_netloc = supports_netloc - self.supports_query_parameters = supports_query_parameters - self.supports_fragments = supports_fragments - self.posixpath_only = posixpath_only - - def __set_name__(self, owner, name): - # helper to provide a more informative repr - self._owner = owner.__name__ - - def _asdict(self) -> dict[str, Any]: - """return a dict representation of the flavour's settings""" - dct = vars(self).copy() - dct.pop("_owner") - return dct + meaningful_trailing_slash: bool = False, + ) -> None: + """initialize the flavour with the given fsspec""" + self._spec = spec + + # netloc is considered an anchor, influences: + # - splitdrive + # - join + self.netloc_is_anchor = bool(netloc_is_anchor) + + # supports empty parts, influences: + # - join + # - UPath._parse_path + self.supports_empty_parts = bool(supports_empty_parts) + + # meaningful trailing slash, influences: + 
# - join + # - UPath._parse_path + self.has_meaningful_trailing_slash = bool(meaningful_trailing_slash) + + @classmethod + @lru_cache(maxsize=None) + def from_protocol( + cls, + protocol: str, + ) -> WrappedFileSystemFlavour: + """return the fsspec flavour for the given protocol""" + + config = { + key: True + for key, protocols in cls.protocol_config.items() + if protocol in protocols + } + + # first try to get an already imported fsspec filesystem class + try: + return cls(class_registry[protocol], **config) + except KeyError: + pass + # next try to get the flavour from the generated flavour registry + # to avoid imports + try: + return cls(flavour_registry[protocol], **config) + except KeyError: + pass + # finally fallback to a default flavour for the protocol + if protocol in known_implementations: + warnings.warn( + f"Could not find default for known protocol {protocol!r}." + " Creating a default flavour for it. Please report this" + " to the universal_pathlib issue tracker.", + UserWarning, + stacklevel=2, + ) + return cls(AnyProtocolFileSystemFlavour, **config) def __repr__(self): - return f"<{__name__}.{type(self).__name__} of {self._owner}>" - - def join(self, __path: PathOrStr, *paths: PathOrStr) -> str: - """Join two or more path components, inserting '/' as needed.""" - - # [py38-py312] _flavour.join is Callable[[list[str]], str] - if isinstance(__path, (list, tuple)) and not paths: - if not __path: - return "" - __path, *paths = __path # type: ignore - - _path0: str = strip_upath_protocol(__path) - _paths: Iterable[str] = map(strip_upath_protocol, paths) - - if self.join_like_urljoin: - pth = str_remove_suffix(str(_path0), "/") - sep = self.sep - for b in _paths: - if b.startswith(sep): - pth = b - elif not pth: - pth += b - else: - pth += sep + b - joined = pth - elif self.posixpath_only: - joined = posixpath.join(_path0, *_paths) + if isinstance(self._spec, type): + return f"" else: - joined = os.path.join(_path0, *_paths) - - if 
self.join_prepends_protocol and (protocol := get_upath_protocol(__path)): - joined = f"{protocol}://{joined}" - - return joined - - def splitroot(self, __path: PathOrStr) -> tuple[str, str, str]: - """Split a path in the drive, the root and the rest.""" - if self.supports_fragments or self.supports_query_parameters: - url = urlsplit(str(__path)) - drive = url._replace(path="", query="", fragment="").geturl() - path = url._replace(scheme="", netloc="").geturl() - # root = "/" if path.startswith("/") else "" - root = "/" # emulate upath.core.UPath < 3.12 behaviour - return drive, root, str_remove_prefix(path, "/") - - if self.supports_netloc: - path = strip_upath_protocol(__path, allow_unknown=True) - protocol = get_upath_protocol(__path) - if protocol: - drive, root, tail = path.partition("/") - return drive, root or "/", tail - else: - return "", "", path + return f"" + + # === fsspec.AbstractFileSystem =================================== + + @property + def protocol(self) -> tuple[str, ...]: + if isinstance(self._spec.protocol, str): + return (self._spec.protocol,) + else: + return self._spec.protocol + + @property + def root_marker(self) -> str: + return self._spec.root_marker + + @property + def local_file(self) -> bool: + return bool(getattr(self._spec, "local_file", False)) + + @staticmethod + def stringify_path(pth: PathOrStr) -> str: + if isinstance(pth, str): + out = pth + elif getattr(pth, "__fspath__", None) is not None: + out = pth.__fspath__() + elif isinstance(pth, os.PathLike): + out = str(pth) + elif hasattr(pth, "path"): + out = pth.path + else: + out = str(pth) + return normalize_empty_netloc(out) + + def empty_part_join(self, path: str, *paths: str) -> str: + sep = self.sep + return sep.join([str_remove_suffix(path, sep), *paths]) - elif self.posixpath_only: - path = strip_upath_protocol(__path, allow_unknown=True) - return _get_splitroot(posixpath)(path) + def strip_protocol(self, pth: PathOrStr) -> str: + pth = self.stringify_path(pth) + return 
self._spec._strip_protocol(pth) + + def get_kwargs_from_url(self, url: PathOrStr) -> dict[str, Any]: + # NOTE: the public variant is _from_url not _from_urls + if hasattr(url, "storage_options"): + return dict(url.storage_options) + url = self.stringify_path(url) + return self._spec._get_kwargs_from_urls(url) + + def parent(self, path: PathOrStr) -> str: + path = self.stringify_path(path) + return self._spec._parent(path) + + # === pathlib_abc.FlavourBase ===================================== + + @property + def sep(self) -> str: + return self._spec.sep + + @property + def altsep(self) -> str | None: + return None + + def isabs(self, path: PathOrStr) -> bool: + path = self.strip_protocol(path) + if self.local_file: + return os.path.isabs(path) + else: + return path.startswith(self.root_marker) + def join(self, path: PathOrStr, *paths: PathOrStr) -> str: + if self.supports_empty_parts: + _join = self.empty_part_join else: - path = strip_upath_protocol(__path, allow_unknown=True) - drv, root, path = _get_splitroot(os.path)(path) - if os.name == "nt" and not drv: - drv = "C:" - return drv, root, path - - def splitdrive(self, __path: PathOrStr) -> tuple[str, str]: - """Split a path into drive and path.""" - if self.supports_fragments or self.supports_query_parameters: - path = strip_upath_protocol(__path) - url = urlsplit(path) - path = url._replace(scheme="", netloc="").geturl() - drive = url._replace(path="", query="", fragment="").geturl() - return drive, path - - path = strip_upath_protocol(__path) - if self.supports_netloc: - protocol = get_upath_protocol(__path) - if protocol: - drive, root, tail = path.partition("/") - return drive, f"{root}{tail}" + _join = posixpath.join + if self.netloc_is_anchor: + drv, p0 = self.splitdrive(path) + pN = list(map(self.stringify_path, paths)) + if not drv and not p0: + path, *pN = pN + drv, p0 = self.splitdrive(path) + return drv + _join(p0 or self.sep, *pN) + else: + p0 = str(self.strip_protocol(path)) + pN = 
map(self.stringify_path, paths) + return _join(p0, *pN) + + def split(self, path: PathOrStr): + stripped_path = self.strip_protocol(path) + head = self.parent(stripped_path) or self.root_marker + if head: + return head, stripped_path[len(head) + 1 :] + else: + return "", stripped_path + + def splitdrive(self, path: PathOrStr) -> tuple[str, str]: + path = self.strip_protocol(path) + if self.netloc_is_anchor: + u = urlsplit(path) + if u.scheme: + # cases like: "http://example.com/foo/bar" + drive = u._replace(path="", query="", fragment="").geturl() + rest = u._replace(scheme="", netloc="").geturl() + return drive, rest or self.root_marker or self.sep else: - return "", path - elif self.posixpath_only: - return posixpath.splitdrive(path) + # cases like: "bucket/some/special/key + drive, root, tail = path.partition(self.sep) + return drive, root + tail + elif self.local_file: + return os.path.splitdrive(path) + else: + # all other cases don't have a drive + return "", path + + def normcase(self, path: PathOrStr) -> str: + if self.local_file: + return os.path.normcase(self.stringify_path(path)) else: - drv, path = os.path.splitdrive(path) - if os.name == "nt" and not drv: - drv = "C:" - return drv, path - - def normcase(self, __path: PathOrStr) -> str: - """Normalize case of pathname. 
Has no effect under Posix""" - if self.posixpath_only: - return posixpath.normcase(__path) + return self.stringify_path(path) + + # === Python3.12 pathlib flavour ================================== + + def splitroot(self, path: PathOrStr) -> tuple[str, str, str]: + drive, tail = self.splitdrive(path) + if self.netloc_is_anchor: + root_marker = self.root_marker or self.sep else: - return os.path.normcase(__path) + root_marker = self.root_marker + return drive, root_marker, str_remove_prefix(tail, self.sep) - @_deprecated - def parse_parts(self, parts): + # === deprecated backwards compatibility =========================== + + @deprecated(python_version=(3, 12)) + def casefold(self, s: str) -> str: + if self.local_file: + return s + else: + return s.lower() + + @deprecated(python_version=(3, 12)) + def parse_parts(self, parts: Sequence[str]) -> tuple[str, str, list[str]]: parsed = [] sep = self.sep drv = root = "" @@ -213,18 +349,21 @@ def parse_parts(self, parts): if not root or root and rel: for x in reversed(rel.split(sep)): parsed.append(sys.intern(x)) - if drv or root: parsed.append(drv + root) parsed.reverse() return drv, root, parsed - @_deprecated - def join_parsed_parts(self, drv, root, parts, drv2, root2, parts2): - """ - Join the two paths represented by the respective - (drive, root, parts) tuples. Return a new (drive, root, parts) tuple. 
- """ + @deprecated(python_version=(3, 12)) + def join_parsed_parts( + self, + drv: str, + root: str, + parts: list[str], + drv2: str, + root2: str, + parts2: list[str], + ) -> tuple[str, str, list[str]]: if root2: if not drv2 and drv: return drv, root2, [drv + root2] + parts2[1:] @@ -237,69 +376,50 @@ def join_parsed_parts(self, drv, root, parts, drv2, root2, parts2): return drv, root, parts + parts2 return drv2, root2, parts2 - @_deprecated - def casefold(self, s: str) -> str: - """Casefold the string s.""" - if self.posixpath_only or os.name != "nt": - return s + +default_flavour = WrappedFileSystemFlavour(AnyProtocolFileSystemFlavour) + + +class LazyFlavourDescriptor: + """descriptor to lazily get the flavour for a given protocol""" + + def __init__(self) -> None: + self._owner = None + + def __set_name__(self, owner: type[UPath], name: str) -> None: + # helper to provide a more informative repr + self._owner = owner + try: + self._default_protocol = self._owner.protocols[0] + except (AttributeError, IndexError): + self._default_protocol = None + + def __get__(self, instance: UPath, owner: type[UPath]) -> WrappedFileSystemFlavour: + if instance is not None: + return WrappedFileSystemFlavour.from_protocol(instance.protocol) + elif self._default_protocol: + return WrappedFileSystemFlavour.from_protocol(self._default_protocol) else: - return s.lower() + return default_flavour + def __repr__(self): + cls_name = f"{type(self).__name__}" + if self._owner is None: + return f"" + else: + return f"<{cls_name} of {self._owner.__name__}>" -@lru_cache -def _get_splitroot(mod) -> Callable[[PathOrStr], tuple[str, str, str]]: - """return the splitroot function from the given module""" - if hasattr(mod, "splitroot"): - return mod.splitroot - elif mod is posixpath: +def upath_strip_protocol(pth: PathOrStr) -> str: + if protocol := get_upath_protocol(pth): + return WrappedFileSystemFlavour.from_protocol(protocol).strip_protocol(pth) + return 
WrappedFileSystemFlavour.stringify_path(pth) - def splitroot(p): - p = os.fspath(p) - sep = "/" - empty = "" - if p[:1] != sep: - return empty, empty, p - elif p[1:2] != sep or p[2:3] == sep: - return empty, sep, p[1:] - else: - return empty, p[:2], p[2:] - - return splitroot - - elif mod is ntpath: - - def splitroot(p): - p = os.fspath(p) - sep = "\\" - altsep = "/" - colon = ":" - unc_prefix = "\\\\?\\UNC\\" - empty = "" - normp = p.replace(altsep, sep) - if normp[:1] == sep: - if normp[1:2] == sep: - start = 8 if normp[:8].upper() == unc_prefix else 2 - index = normp.find(sep, start) - if index == -1: - return p, empty, empty - index2 = normp.find(sep, index + 1) - if index2 == -1: - return p, empty, empty - return p[:index2], p[index2 : index2 + 1], p[index2 + 1 :] - else: - return empty, p[:1], p[1:] - elif normp[1:2] == colon: - if normp[2:3] == sep: - return p[:2], p[2:3], p[3:] - else: - return p[:2], empty, p[2:] - else: - return empty, empty, p - return splitroot - else: - raise NotImplementedError(f"unsupported module: {mod!r}") +def upath_get_kwargs_from_url(url: PathOrStr) -> dict[str, Any]: + if protocol := get_upath_protocol(url): + return WrappedFileSystemFlavour.from_protocol(protocol).get_kwargs_from_url(url) + return {} def upath_urijoin(base: str, uri: str) -> str: diff --git a/upath/_flavour_sources.py b/upath/_flavour_sources.py new file mode 100644 index 00000000..ab22e010 --- /dev/null +++ b/upath/_flavour_sources.py @@ -0,0 +1,866 @@ +""" upath._flavour_sources + + + +Warning +------- + Do not modify this file manually! + It is generated by `dev/generate_flavours.py` + +To be able to parse the different filesystem uri schemes, we need +the string parsing functionality each of the filesystem implementations. +In an attempt to support parsing uris without having to import the +specific filesystems, we extract the necessary subset of the +AbstractFileSystem classes and generate a new "flavour" class for +each of the known filesystems. 
This will allow us to provide a +`PurePath` equivalent `PureUPath` for each protocol in the future +without a direct dependency on the underlying filesystem package. + +""" +# +# skipping protocols: +# - blockcache +# - cached +# - dir +# - filecache +# - simplecache +# protocol import errors: +# - gdrive (Please install gdrivefs for access to Google Drive) +# - generic (GenericFileSystem: '_strip_protocol' not a classmethod) +# +from __future__ import annotations + +import logging +import re +from typing import Any +from typing import cast +from urllib.parse import parse_qs +from urllib.parse import urlsplit + +from fsspec.implementations.local import make_path_posix +from fsspec.utils import infer_storage_options +from fsspec.utils import stringify_path + +__all__ = [ + "AbstractFileSystemFlavour", + "FileSystemFlavourBase", + "flavour_registry", +] + +logger = logging.getLogger(__name__) +flavour_registry: dict[str, type[FileSystemFlavourBase]] = {} + + +class FileSystemFlavourBase: + """base class for the fsspec flavours""" + + def __init_subclass__(cls: Any, **kwargs): + if isinstance(cls.protocol, str): + protocols = (cls.protocol,) + else: + protocols = tuple(cls.protocol) + for protocol in protocols: + if protocol in flavour_registry: + raise ValueError(f"protocol {protocol!r} already registered") + flavour_registry[protocol] = cls + + +class AbstractFileSystemFlavour(FileSystemFlavourBase): + __orig_class__ = 'fsspec.spec.AbstractFileSystem' + __orig_version__ = '2024.2.0' + protocol = 'abstract' + root_marker = '' + + @classmethod + def _strip_protocol(cls, path): + """Turn path from fully-qualified to file-system-specific + + May require FS-specific handling, e.g., for relative paths or links. 
+ """ + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path = stringify_path(path) + protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol + for protocol in protos: + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + path = path.rstrip("/") + # use of root_marker to make minimum required path, e.g., "/" + return path or cls.root_marker + + @staticmethod + def _get_kwargs_from_urls(path): + """If kwargs can be encoded in the paths, extract them here + + This should happen before instantiation of the class; incoming paths + then should be amended to strip the options in methods. + + Examples may look like an sftp path "sftp://user@host:/my/path", where + the user and host should become kwargs and later get stripped. + """ + # by default, nothing happens + return {} + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path) + if "/" in path: + parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker) + return cls.root_marker + parent + else: + return cls.root_marker + + +class AsyncLocalFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'morefs.asyn_local.AsyncLocalFileSystem' + __orig_version__ = '0.2.0' + protocol = () + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("file://"): + path = path[7:] + elif path.startswith("file:"): + path = path[5:] + elif path.startswith("local://"): + path = path[8:] + elif path.startswith("local:"): + path = path[6:] + return make_path_posix(path).rstrip("/") or cls.root_marker + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path).rstrip("/") + if "/" in path: + return path.rsplit("/", 1)[0] + else: + return cls.root_marker + + +class AzureBlobFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'adlfs.spec.AzureBlobFileSystem' + 
__orig_version__ = '2024.2.0' + protocol = ('abfs', 'az', 'abfss') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path: str): + """ + Remove the protocol from the input path + + Parameters + ---------- + path: str + Path to remove the protocol from + + Returns + ------- + str + Returns a path without the protocol + """ + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + + STORE_SUFFIX = ".dfs.core.windows.net" + logger.debug(f"_strip_protocol for {path}") + if not path.startswith(("abfs://", "az://", "abfss://")): + path = path.lstrip("/") + path = "abfs://" + path + ops = infer_storage_options(path) + if "username" in ops: + if ops.get("username", None): + ops["path"] = ops["username"] + ops["path"] + # we need to make sure that the path retains + # the format {host}/{path} + # here host is the container_name + elif ops.get("host", None): + if ( + ops["host"].count(STORE_SUFFIX) == 0 + ): # no store-suffix, so this is container-name + ops["path"] = ops["host"] + ops["path"] + url_query = ops.get("url_query") + if url_query is not None: + ops["path"] = f"{ops['path']}?{url_query}" + + logger.debug(f"_strip_protocol({path}) = {ops}") + stripped_path = ops["path"].lstrip("/") + return stripped_path + + @staticmethod + def _get_kwargs_from_urls(urlpath): + """Get the account_name from the urlpath and pass to storage_options""" + ops = infer_storage_options(urlpath) + out = {} + host: str | None = ops.get("host", None) + if host: + match = re.match( + r"(?P<account_name>.+)\.(dfs|blob)\.core\.windows\.net", host + ) + if match: + account_name = match.groupdict()["account_name"] + out["account_name"] = account_name + url_query = ops.get("url_query") + if url_query is not None: + from urllib.parse import parse_qs + + parsed = parse_qs(url_query) + if "versionid" in parsed: + out["version_aware"] = True + return out + + +class AzureDatalakeFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ =
'adlfs.gen1.AzureDatalakeFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('adl',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + ops = infer_storage_options(path) + return ops["path"] + + @staticmethod + def _get_kwargs_from_urls(paths): + """Get the store_name from the urlpath and pass to storage_options""" + ops = infer_storage_options(paths) + out = {} + if ops.get("host", None): + out["store_name"] = ops["host"] + return out + + +class BoxFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'boxfs.boxfs.BoxFileSystem' + __orig_version__ = '0.2.1' + protocol = ('box',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path) -> str: + path = super()._strip_protocol(path) + path = path.replace("\\", "/") + return path + + +class DaskWorkerFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.dask.DaskWorkerFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('dask',) + root_marker = '' + sep = '/' + + @staticmethod + def _get_kwargs_from_urls(path): + so = infer_storage_options(path) + if "host" in so and "port" in so: + return {"client": f"{so['host']}:{so['port']}"} + else: + return {} + + +class DataFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.data.DataFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('data',) + root_marker = '' + sep = '/' + + +class DatabricksFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.dbfs.DatabricksFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('dbfs',) + root_marker = '' + sep = '/' + + +class DictFSFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'morefs.dict.DictFS' + __orig_version__ = '0.2.0' + protocol = ('dictfs',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path: str) -> str: + if path.startswith("dictfs://"): + path = path[len("dictfs://") :] + if "::" in path or "://" 
in path: + return path.rstrip("/") + path = path.lstrip("/").rstrip("/") + return "/" + path if path else cls.root_marker + + +class DropboxDriveFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'dropboxdrivefs.core.DropboxDriveFileSystem' + __orig_version__ = '1.3.1' + protocol = ('dropbox',) + root_marker = '' + sep = '/' + + +class FTPFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.ftp.FTPFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('ftp',) + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/") + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + +class GCSFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'gcsfs.core.GCSFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('gcs', 'gs') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path = stringify_path(path) + protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol + for protocol in protos: + if path.startswith(protocol + "://"): + path = path[len(protocol) + 3 :] + elif path.startswith(protocol + "::"): + path = path[len(protocol) + 2 :] + # use of root_marker to make minimum required path, e.g., "/" + return path or cls.root_marker + + @classmethod + def _get_kwargs_from_urls(cls, path): + _, _, generation = cls._split_path(path, version_aware=True) + if generation is not None: + return {"version_aware": True} + return {} + + @classmethod + def _split_path(cls, path, version_aware=False): + """ + Normalise GCS path string into bucket and key. + + Parameters + ---------- + path : string + Input path, like `gcs://mybucket/path/to/file`. 
+ Path is of the form: '[gs|gcs://]bucket[/key][?querystring][#fragment]' + + GCS allows object generation (object version) to be specified in either + the URL fragment or the `generation` query parameter. When provided, + the fragment will take priority over the `generation` query paramenter. + + Returns + ------- + (bucket, key, generation) tuple + """ + path = cls._strip_protocol(path).lstrip("/") + if "/" not in path: + return path, "", None + bucket, keypart = path.split("/", 1) + key = keypart + generation = None + if version_aware: + parts = urlsplit(keypart) + try: + if parts.fragment: + generation = parts.fragment + elif parts.query: + parsed = parse_qs(parts.query) + if "generation" in parsed: + generation = parsed["generation"][0] + # Sanity check whether this could be a valid generation ID. If + # it is not, assume that # or ? characters are supposed to be + # part of the object name. + if generation is not None: + int(generation) + key = parts.path + except ValueError: + generation = None + return ( + bucket, + key, + generation, + ) + + +class GitFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.git.GitFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('git',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + path = super()._strip_protocol(path).lstrip("/") + if ":" in path: + path = path.split(":", 1)[1] + if "@" in path: + path = path.split("@", 1)[1] + return path.lstrip("/") + + @staticmethod + def _get_kwargs_from_urls(path): + if path.startswith("git://"): + path = path[6:] + out = {} + if ":" in path: + out["path"], path = path.split(":", 1) + if "@" in path: + out["ref"], path = path.split("@", 1) + return out + + +class GithubFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.github.GithubFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('github',) + root_marker = '' + sep = '/' + + @classmethod + def 
_strip_protocol(cls, path): + opts = infer_storage_options(path) + if "username" not in opts: + return super()._strip_protocol(path) + return opts["path"].lstrip("/") + + @staticmethod + def _get_kwargs_from_urls(path): + opts = infer_storage_options(path) + if "username" not in opts: + return {} + out = {"org": opts["username"], "repo": opts["password"]} + if opts["host"]: + out["sha"] = opts["host"] + return out + + +class HTTPFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.http.HTTPFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('http', 'https') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + """For HTTP, we always want to keep the full URL""" + return path + + @classmethod + def _parent(cls, path): + # override, since _strip_protocol is different for URLs + par = super()._parent(path) + if len(par) > 7: # "http://..." + return par + return "" + + +class HadoopFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.arrow.HadoopFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('hdfs', 'arrow_hdfs') + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + ops = infer_storage_options(path) + path = ops["path"] + if path.startswith("//"): + # special case for "hdfs://path" (without the triple slash) + path = path[1:] + return path + + @staticmethod + def _get_kwargs_from_urls(path): + ops = infer_storage_options(path) + out = {} + if ops.get("host", None): + out["host"] = ops["host"] + if ops.get("username", None): + out["user"] = ops["username"] + if ops.get("port", None): + out["port"] = ops["port"] + if ops.get("url_query", None): + queries = parse_qs(ops["url_query"]) + if queries.get("replication", None): + out["replication"] = int(queries["replication"][0]) + return out + + +class HfFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'huggingface_hub.hf_file_system.HfFileSystem' + 
__orig_version__ = '0.20.3' + protocol = ('hf',) + root_marker = '' + sep = '/' + + +class JupyterFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.jupyter.JupyterFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('jupyter', 'jlab') + root_marker = '' + sep = '/' + + +class LakeFSFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'lakefs_spec.spec.LakeFSFileSystem' + __orig_version__ = '0.7.0' + protocol = ('lakefs',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + """Copied verbatim from the base class, save for the slash rstrip.""" + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + spath = super()._strip_protocol(path) + if stringify_path(path).endswith("/"): + return spath + "/" + return spath + + +class LibArchiveFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.libarchive.LibArchiveFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('libarchive',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + # file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + +class LocalFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.local.LocalFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('file', 'local') + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + path = stringify_path(path) + if path.startswith("file://"): + path = path[7:] + elif path.startswith("file:"): + path = path[5:] + elif path.startswith("local://"): + path = path[8:] + elif path.startswith("local:"): + path = path[6:] + return make_path_posix(path).rstrip("/") or cls.root_marker + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path).rstrip("/") + if "/" in path: + return path.rsplit("/", 1)[0] + else: + return cls.root_marker + + +class 
MemFSFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'morefs.memory.MemFS' + __orig_version__ = '0.2.0' + protocol = ('memfs',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + if path.startswith("memfs://"): + path = path[len("memfs://") :] + return MemoryFileSystemFlavour._strip_protocol(path) # pylint: disable=protected-access + + +class MemoryFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.memory.MemoryFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('memory',) + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + if path.startswith("memory://"): + path = path[len("memory://") :] + if "::" in path or "://" in path: + return path.rstrip("/") + path = path.lstrip("/").rstrip("/") + return "/" + path if path else "" + + +class OCIFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'ocifs.core.OCIFileSystem' + __orig_version__ = '1.3.1' + protocol = ('oci', 'ocilake') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path = stringify_path(path) + stripped_path = super()._strip_protocol(path) + if stripped_path == cls.root_marker and "@" in path: + return "@" + path.rstrip("/").split("@", 1)[1] + return stripped_path + + @classmethod + def _parent(cls, path): + path = cls._strip_protocol(path.rstrip("/")) + if "/" in path: + return cls.root_marker + path.rsplit("/", 1)[0] + elif "@" in path: + return cls.root_marker + "@" + path.split("@", 1)[1] + else: + raise ValueError(f"the following path does not specify a namespace: {path}") + + +class OSSFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'ossfs.core.OSSFileSystem' + __orig_version__ = '2023.12.0' + protocol = ('oss',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + """Turn path from fully-qualified to 
file-system-specifi + Parameters + ---------- + path : Union[str, List[str]] + Input path, like + `http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject` + `oss://mybucket/myobject` + Examples + -------- + >>> _strip_protocol( + "http://oss-cn-hangzhou.aliyuncs.com/mybucket/myobject" + ) + ('/mybucket/myobject') + >>> _strip_protocol( + "oss://mybucket/myobject" + ) + ('/mybucket/myobject') + """ + if isinstance(path, list): + return [cls._strip_protocol(p) for p in path] + path_string: str = stringify_path(path) + if path_string.startswith("oss://"): + path_string = path_string[5:] + + parser_re = r"https?://(?P<endpoint>oss.+aliyuncs\.com)(?P<path>/.+)" + matcher = re.compile(parser_re).match(path_string) + if matcher: + path_string = matcher["path"] + return path_string or cls.root_marker + + +class OverlayFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'morefs.overlay.OverlayFileSystem' + __orig_version__ = '0.2.0' + protocol = ('overlayfs',) + root_marker = '' + sep = '/' + + +class ReferenceFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.reference.ReferenceFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('reference',) + root_marker = '' + sep = '/' + + +class S3FileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 's3fs.core.S3FileSystem' + __orig_version__ = '2024.2.0' + protocol = ('s3', 's3a') + root_marker = '' + sep = '/' + + @staticmethod + def _get_kwargs_from_urls(urlpath): + """ + When we have a urlpath that contains a ?versionId= + + Assume that we want to use version_aware mode for + the filesystem.
+ """ + url_storage_opts = infer_storage_options(urlpath) + url_query = url_storage_opts.get("url_query") + out = {} + if url_query is not None: + from urllib.parse import parse_qs + + parsed = parse_qs(url_query) + if "versionId" in parsed: + out["version_aware"] = True + return out + + +class SFTPFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.sftp.SFTPFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('sftp', 'ssh') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + return out + + +class SMBFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.smb.SMBFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('smb',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod + def _get_kwargs_from_urls(path): + # smb://workgroup;user:password@host:port/share/folder/file.csv + out = infer_storage_options(path) + out.pop("path", None) + out.pop("protocol", None) + return out + + +class TarFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.tar.TarFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('tar',) + root_marker = '' + sep = '/' + + +class WandbFSFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'wandbfs._wandbfs.WandbFS' + __orig_version__ = '0.0.2' + protocol = ('wandb',) + root_marker = '' + sep = '/' + + +class WebHDFSFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.webhdfs.WebHDFS' + __orig_version__ = '2024.2.0' + protocol = ('webhdfs', 'webHDFS') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + return infer_storage_options(path)["path"] + + @staticmethod 
+ def _get_kwargs_from_urls(urlpath): + out = infer_storage_options(urlpath) + out.pop("path", None) + out.pop("protocol", None) + if "username" in out: + out["user"] = out.pop("username") + return out + + +class WebdavFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'webdav4.fsspec.WebdavFileSystem' + __orig_version__ = '0.9.8' + protocol = ('webdav', 'dav') + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path: str) -> str: + """Strips protocol from the given path, overriding for type-casting.""" + stripped = super()._strip_protocol(path) + return cast(str, stripped) + + +class XRootDFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec_xrootd.xrootd.XRootDFileSystem' + __orig_version__ = '0.2.4' + protocol = ('root',) + root_marker = '/' + sep = '/' + + @classmethod + def _strip_protocol(cls, path: str | list[str]) -> Any: + if isinstance(path, str): + if path.startswith(cls.protocol): + return urlsplit(path).path.rstrip("/") or cls.root_marker + # assume already stripped + return path.rstrip("/") or cls.root_marker + elif isinstance(path, list): + return [cls._strip_protocol(item) for item in path] + else: + raise ValueError("Strip protocol not given string or list") + + @staticmethod + def _get_kwargs_from_urls(u: str) -> dict[Any, Any]: + url = urlsplit(u) + # The hostid encapsulates user,pass,host,port in one string + return {"hostid": url.netloc} + + +class ZipFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'fsspec.implementations.zip.ZipFileSystem' + __orig_version__ = '2024.2.0' + protocol = ('zip',) + root_marker = '' + sep = '/' + + @classmethod + def _strip_protocol(cls, path): + # zip file paths are always relative to the archive root + return super()._strip_protocol(path).lstrip("/") + + +class _DVCFileSystemFlavour(AbstractFileSystemFlavour): + __orig_class__ = 'dvc.fs.dvc._DVCFileSystem' + __orig_version__ = '3.47.0' + protocol = ('dvc',) + root_marker = '/' + sep = '/' 
diff --git a/upath/_protocol.py b/upath/_protocol.py index 568dae04..a3827bdd 100644 --- a/upath/_protocol.py +++ b/upath/_protocol.py @@ -5,12 +5,9 @@ from pathlib import PurePath from typing import Any -from fsspec.core import strip_protocol as strip_fsspec_protocol -from fsspec.spec import AbstractFileSystem - __all__ = [ "get_upath_protocol", - "strip_upath_protocol", + "normalize_empty_netloc", ] # Regular expression to match fsspec style protocols. @@ -42,6 +39,8 @@ def get_upath_protocol( pth_protocol = _match_protocol(pth) elif isinstance(pth, PurePath): pth_protocol = getattr(pth, "protocol", "") + elif hasattr(pth, "__fspath__"): + pth_protocol = _match_protocol(pth.__fspath__()) else: pth_protocol = _match_protocol(str(pth)) # if storage_options and not protocol and not pth_protocol: @@ -53,27 +52,10 @@ def get_upath_protocol( return protocol or pth_protocol or "" -def strip_upath_protocol( - pth: str | os.PathLike[str], - *, - allow_unknown: bool = False, -) -> str: - """strip protocol from path""" - if isinstance(pth, PurePath): - pth = str(pth) - elif not isinstance(pth, str): - pth = os.fspath(pth) +def normalize_empty_netloc(pth: str) -> str: if m := _PROTOCOL_RE.match(pth): if len(m.group("slashes")) == 1: protocol = m.group("protocol") path = m.group("path") pth = f"{protocol}:///{path}" - try: - return strip_fsspec_protocol(pth) - except ValueError as err: - if allow_unknown and str(err).endswith(m.group("protocol")): - # fsspec raised ValueError because the protocol is not registered - return AbstractFileSystem._strip_protocol(pth) - raise - else: - return pth + return pth diff --git a/upath/_stat.py b/upath/_stat.py index 3a6ec789..e72b420b 100644 --- a/upath/_stat.py +++ b/upath/_stat.py @@ -43,6 +43,8 @@ def _get_stat_result_extra_fields() -> tuple[str, ...]: # named fields of the stat_result class as keys and the internal # index of the field as value. 
sr = os.stat_result(range(os.stat_result.n_fields)) + rd = sr.__reduce__() + assert isinstance(rd, tuple), "unexpected return os.stat_result.__reduce__" _, (_, extra) = sr.__reduce__() extra_fields = sorted(extra, key=extra.__getitem__) return tuple(extra_fields) diff --git a/upath/core.py b/upath/core.py index ec0a575e..6be5343e 100644 --- a/upath/core.py +++ b/upath/core.py @@ -17,9 +17,11 @@ from upath._compat import FSSpecAccessorShim from upath._compat import PathlibPathShim +from upath._compat import method_and_classmethod from upath._compat import str_remove_prefix from upath._compat import str_remove_suffix -from upath._flavour import FSSpecFlavour +from upath._flavour import LazyFlavourDescriptor +from upath._flavour import upath_get_kwargs_from_url from upath._flavour import upath_urijoin from upath._protocol import get_upath_protocol from upath._stat import UPathStatResult @@ -30,6 +32,8 @@ def __getattr__(name): if name == "_UriFlavour": + from upath._flavour import default_flavour + warnings.warn( "upath.core._UriFlavour should not be used anymore." " Please follow the universal_pathlib==0.2.0 migration guide at" @@ -38,7 +42,7 @@ def __getattr__(name): DeprecationWarning, stacklevel=2, ) - return FSSpecFlavour + return default_flavour elif name == "PT": warnings.warn( "upath.core.PT should not be used anymore." 
@@ -92,7 +96,7 @@ class UPath(PathlibPathShim, Path): _fs_cached: AbstractFileSystem _protocol_dispatch: bool | None = None - _flavour = FSSpecFlavour() + _flavour = LazyFlavourDescriptor() # === upath.UPath constructor ===================================== @@ -200,8 +204,12 @@ def __init__( if isinstance(args0, UPath): self._storage_options = {**args0.storage_options, **storage_options} else: + if hasattr(args0, "__fspath__"): + _args0 = args0.__fspath__() + else: + _args0 = str(args0) self._storage_options = type(self)._parse_storage_options( - str(args0), protocol, storage_options + _args0, protocol, storage_options ) else: self._storage_options = storage_options.copy() @@ -287,8 +295,7 @@ def _parse_storage_options( cls, urlpath: str, protocol: str, storage_options: Mapping[str, Any] ) -> dict[str, Any]: """Parse storage_options from the urlpath""" - fs_cls: type[AbstractFileSystem] = get_filesystem_class(protocol) - pth_storage_options = fs_cls._get_kwargs_from_urls(urlpath) + pth_storage_options = upath_get_kwargs_from_url(urlpath) return {**pth_storage_options, **storage_options} @classmethod @@ -307,7 +314,10 @@ def __init_subclass__(cls, **kwargs): """provide a clean migration path for custom user subclasses""" # Check if the user subclass has a custom `__new__` method - has_custom_new_method = cls.__new__ is not UPath.__new__ + has_custom_new_method = ( + cls.__new__ is not UPath.__new__ + and cls.__name__ not in {"PosixUPath", "WindowsUPath"} + ) if has_custom_new_method and cls._protocol_dispatch is None: warnings.warn( @@ -450,27 +460,10 @@ def _parse_args(cls, args): DeprecationWarning, stacklevel=2, ) + # TODO !!! pth = cls._flavour.join(*args) return cls._parse_path(pth) - @classmethod - def _format_parsed_parts(cls, drv, root, tail, **kwargs): - if kwargs: - warnings.warn( - "UPath._format_parsed_parts should not be used with" - " additional kwargs. 
Please follow the" - " universal_pathlib==0.2.0 migration guide at" - " https://github.com/fsspec/universal_pathlib for more" - " information.", - DeprecationWarning, - stacklevel=2, - ) - if "url" in kwargs and tail[:1] == [f"{drv}{root}"]: - # This was called from code that expected py38-py311 behavior - # of _format_parsed_parts, which takes drv, root and parts - tail = tail[1:] - return super()._format_parsed_parts(drv, root, tail) - @property def _drv(self): # direct access to ._drv should emit a warning, @@ -538,19 +531,89 @@ def with_segments(self, *pathsegments): **self._storage_options, ) - @classmethod - def _parse_path(cls, path): - if getattr(cls._flavour, "supports_empty_parts", False): - drv, root, rel = cls._flavour.splitroot(path) + # === upath.UPath non-standard changes ============================ + + # NOTE: + # this is a classmethod on the parent class, but we need to + # override it here to make it possible to provide the _flavour + # with the correct protocol... + # pathlib 3.12 never calls this on the class. Only on the instance. + @method_and_classmethod + def _parse_path(self_or_cls, path): # noqa: B902 + if isinstance(self_or_cls, type): + warnings.warn( + "UPath._parse_path should not be used as a classmethod." 
+ " Please file an issue on the universal_pathlib issue tracker" + " and describe your use case.", + DeprecationWarning, + stacklevel=2, + ) + flavour = self_or_cls._flavour + + if flavour.supports_empty_parts: + drv, root, rel = flavour.splitroot(path) if not root: parsed = [] else: - parsed = list(map(sys.intern, rel.split(cls._flavour.sep))) + parsed = list(map(sys.intern, rel.split(flavour.sep))) if parsed[-1] == ".": parsed[-1] = "" parsed = [x for x in parsed if x != "."] + if not flavour.has_meaningful_trailing_slash and parsed[-1] == "": + parsed.pop() return drv, root, parsed - return super()._parse_path(path) + if not path: + return "", "", [] + sep = flavour.sep + altsep = flavour.altsep + if altsep: + path = path.replace(altsep, sep) + drv, root, rel = flavour.splitroot(path) + if not root and drv.startswith(sep) and not drv.endswith(sep): + drv_parts = drv.split(sep) + if len(drv_parts) == 4 and drv_parts[2] not in "?.": + # e.g. //server/share + root = sep + elif len(drv_parts) == 6: + # e.g. //?/unc/server/share + root = sep + parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != "."] + return drv, root, parsed + + @method_and_classmethod + def _format_parsed_parts(self_or_cls, drv, root, tail, **kwargs): # noqa: B902 + if isinstance(self_or_cls, type): + warnings.warn( + "UPath._format_parsed_path should not be used as a classmethod." + " Please file an issue on the universal_pathlib issue tracker" + " and describe your use case.", + DeprecationWarning, + stacklevel=2, + ) + flavour = self_or_cls._flavour + + if kwargs: + warnings.warn( + "UPath._format_parsed_parts should not be used with" + " additional kwargs. 
Please follow the" + " universal_pathlib==0.2.0 migration guide at" + " https://github.com/fsspec/universal_pathlib for more" + " information.", + DeprecationWarning, + stacklevel=2, + ) + if "url" in kwargs and tail[:1] == [f"{drv}{root}"]: + # This was called from code that expected py38-py311 behavior + # of _format_parsed_parts, which takes drv, root and parts + tail = tail[1:] + + if drv or root: + return drv + root + flavour.sep.join(tail) + elif tail and flavour.splitdrive(tail[0])[0]: + tail = ["."] + tail + return flavour.sep.join(tail) + + # === upath.UPath changes ========================================= def __str__(self): if self._protocol: @@ -682,12 +745,6 @@ def open(self, mode="r", buffering=-1, encoding=None, errors=None, newline=None) return self.fs.open(self.path, mode) # fixme def iterdir(self): - if getattr(self._flavour, "supports_empty_parts", False) and self.parts[ - -1: - ] == ("",): - base = self.with_segments(self.anchor, *self._tail[:-1]) - else: - base = self for name in self.fs.listdir(self.path): # fsspec returns dictionaries if isinstance(name, dict): @@ -697,7 +754,7 @@ def iterdir(self): continue # only want the path name with iterdir _, _, name = str_remove_suffix(name, "/").rpartition(self._flavour.sep) - yield base._make_child_relpath(name) + yield self._make_child_relpath(name) def _scandir(self): raise NotImplementedError # todo @@ -710,8 +767,9 @@ def _make_child_relpath(self, name): def glob(self, pattern: str, *, case_sensitive=None): path_pattern = self.joinpath(pattern).path sep = self._flavour.sep + base = self.fs._strip_protocol(self.path) for name in self.fs.glob(path_pattern): - name = str_remove_prefix(str_remove_prefix(name, self.path), sep) + name = str_remove_prefix(str_remove_prefix(name, base), sep) yield self.joinpath(name) def rglob(self, pattern: str, *, case_sensitive=None): @@ -721,18 +779,20 @@ def rglob(self, pattern: str, *, case_sensitive=None): if _FSSPEC_HAS_WORKING_GLOB: r_path_pattern = 
self.joinpath("**", pattern).path sep = self._flavour.sep + base = self.fs._strip_protocol(self.path) for name in self.fs.glob(r_path_pattern): - name = str_remove_prefix(str_remove_prefix(name, self.path), sep) + name = str_remove_prefix(str_remove_prefix(name, base), sep) yield self.joinpath(name) else: path_pattern = self.joinpath(pattern).path r_path_pattern = self.joinpath("**", pattern).path sep = self._flavour.sep + base = self.fs._strip_protocol(self.path) seen = set() for p in (path_pattern, r_path_pattern): for name in self.fs.glob(p): - name = str_remove_prefix(str_remove_prefix(name, self.path), sep) + name = str_remove_prefix(str_remove_prefix(name, base), sep) if name in seen: continue else: @@ -765,16 +825,10 @@ def resolve(self, strict: bool = False): resolved: list[str] = [] resolvable_parts = _parts[1:] - last_idx = len(resolvable_parts) - 1 - for idx, part in enumerate(resolvable_parts): + for part in resolvable_parts: if part == "..": if resolved: resolved.pop() - if ( - getattr(self._flavour, "supports_empty_parts", False) - and idx == last_idx - ): - resolved.append("") elif part != ".": resolved.append(part) diff --git a/upath/implementations/cloud.py b/upath/implementations/cloud.py index a4f25ede..a7fe60bc 100644 --- a/upath/implementations/cloud.py +++ b/upath/implementations/cloud.py @@ -4,7 +4,7 @@ from typing import Any from upath._compat import FSSpecAccessorShim as _FSSpecAccessorShim -from upath._flavour import FSSpecFlavour as _FSSpecFlavour +from upath._flavour import upath_strip_protocol from upath.core import UPath __all__ = [ @@ -21,10 +21,6 @@ class CloudPath(UPath): __slots__ = () - _flavour = _FSSpecFlavour( - join_prepends_protocol=True, - supports_netloc=True, - ) @classmethod def _transform_init_args( @@ -39,7 +35,8 @@ def _transform_init_args( if str(args[0]).startswith("/"): args = (f"{protocol}://{bucket}{args[0]}", *args[1:]) else: - args = (f"{protocol}://{bucket}/", *args) + args0 = upath_strip_protocol(args[0]) + 
args = (f"{protocol}://{bucket}/", args0, *args[1:]) break return super()._transform_init_args(args, protocol, storage_options) @@ -53,7 +50,10 @@ def mkdir( def iterdir(self): if self.is_file(): raise NotADirectoryError(str(self)) - yield from super().iterdir() + if self.parts[-1:] == ("",): + yield from self.parent.iterdir() + else: + yield from super().iterdir() def relative_to(self, other, /, *_deprecated, walk_up=False): # use the parent implementation for the ValueError logic diff --git a/upath/implementations/http.py b/upath/implementations/http.py index 6f9b73fb..c759fb9a 100644 --- a/upath/implementations/http.py +++ b/upath/implementations/http.py @@ -8,7 +8,6 @@ from fsspec.asyn import sync from upath._compat import FSSpecAccessorShim as _FSSpecAccessorShim -from upath._flavour import FSSpecFlavour as _FSSpecFlavour from upath._stat import UPathStatResult from upath.core import UPath @@ -19,13 +18,6 @@ class HTTPPath(UPath): - _flavour = _FSSpecFlavour( - join_like_urljoin=True, - supports_empty_parts=True, - supports_netloc=True, - supports_query_parameters=True, - supports_fragments=True, - ) @classmethod def _transform_init_args( @@ -80,15 +72,18 @@ def stat(self, follow_symlinks: bool = True): return UPathStatResult.from_info(info) def iterdir(self): - it = iter(super().iterdir()) - try: - item0 = next(it) - except (StopIteration, NotADirectoryError): - raise NotADirectoryError(str(self)) - except FileNotFoundError: - raise FileNotFoundError(str(self)) + if self.parts[-1:] == ("",): + yield from self.parent.iterdir() else: - yield from chain([item0], it) + it = iter(super().iterdir()) + try: + item0 = next(it) + except (StopIteration, NotADirectoryError): + raise NotADirectoryError(str(self)) + except FileNotFoundError: + raise FileNotFoundError(str(self)) + else: + yield from chain([item0], it) def resolve( self: HTTPPath, @@ -98,6 +93,9 @@ def resolve( """Normalize the path and resolve redirects.""" # Normalise the path resolved_path = 
super().resolve(strict=strict) + # if the last part is "..", then it's a directory + if self.parts[-1:] == ("..",): + resolved_path = resolved_path.joinpath("") if follow_redirects: # Get the fsspec fs diff --git a/upath/implementations/local.py b/upath/implementations/local.py index 038872ad..e0fba453 100644 --- a/upath/implementations/local.py +++ b/upath/implementations/local.py @@ -11,7 +11,6 @@ from typing import MutableMapping from urllib.parse import SplitResult -from upath._flavour import FSSpecFlavour as _FSSpecFlavour from upath.core import UPath __all__ = [ @@ -40,9 +39,6 @@ def _check_listdir_works_on_files() -> bool: class LocalPath(UPath): __slots__ = () - _flavour = _FSSpecFlavour( - posixpath_only=False, - ) @property def path(self): @@ -189,6 +185,6 @@ def _from_parsed_parts(cls, drv, root, parts): _upath_init(obj) return obj - @property - def path(self) -> str: - return WindowsPath.__str__(self) + @property + def path(self) -> str: + return WindowsPath.as_posix(self) diff --git a/upath/implementations/webdav.py b/upath/implementations/webdav.py index 4a49143d..e299788c 100644 --- a/upath/implementations/webdav.py +++ b/upath/implementations/webdav.py @@ -2,6 +2,7 @@ import os from typing import Any +from typing import Mapping from urllib.parse import urlsplit from fsspec.registry import known_implementations @@ -52,6 +53,18 @@ def _transform_init_args( ) return super()._transform_init_args(args, "webdav", storage_options) + @classmethod + def _parse_storage_options( + cls, urlpath: str, protocol: str, storage_options: Mapping[str, Any] + ) -> dict[str, Any]: + so = dict(storage_options) + if urlpath.startswith(("webdav+http:", "webdav+https:")): + url = urlsplit(str(urlpath)) + base = url._replace(scheme=url.scheme.split("+")[1], path="").geturl() + urlpath = url._replace(scheme="", netloc="").geturl() or "/" + so.setdefault("base_url", base) + return super()._parse_storage_options(urlpath, "webdav", so) + @property def path(self) -> str: # webdav 
paths don't start at "/" diff --git a/upath/tests/test_core.py b/upath/tests/test_core.py index 71a53aec..f52e6b52 100644 --- a/upath/tests/test_core.py +++ b/upath/tests/test_core.py @@ -255,6 +255,19 @@ def test_compare_to_pathlib_path_ne(): assert pathlib.Path("/bucket/folder") == UPath("/bucket/folder") +def test_handle_fspath_args(tmp_path): + f = tmp_path.joinpath("file.txt").as_posix() + + class X: + def __str__(self): + raise ValueError("should not be called") + + def __fspath__(self): + return f + + assert UPath(X()).path == f + + @pytest.mark.parametrize( "urlpath", [ @@ -376,10 +389,12 @@ def test_uri_parsing(): @pytest.mark.parametrize(*NORMALIZATIONS) def test_normalize(unnormalized, normalized): expected = UPath(normalized) - # Normalise only, do not attempt to follow redirects for http:// paths here - result = UPath.resolve(UPath(unnormalized)) - if expected.protocol == "memory": - pass + pth = UPath(unnormalized) + if pth.protocol in {"http", "https"}: + # Normalise only, do not attempt to follow redirects for http:// paths here + result = pth.resolve(strict=True, follow_redirects=False) + else: + result = pth.resolve(strict=True) assert expected == result assert str(expected) == str(result) diff --git a/upath/tests/test_stat.py b/upath/tests/test_stat.py new file mode 100644 index 00000000..66d9668c --- /dev/null +++ b/upath/tests/test_stat.py @@ -0,0 +1,99 @@ +import os +from datetime import datetime +from datetime import timezone + +import pytest + +import upath + + +@pytest.fixture +def pth_file(tmp_path): + f = tmp_path.joinpath("abc.txt") + f.write_bytes(b"a") + p = upath.UPath(f"file://{f.absolute().as_posix()}") + yield p + + +def test_stat_repr(pth_file): + assert repr(pth_file.stat()).startswith("UPathStatResult") + + +def test_stat_as_info(pth_file): + dct = pth_file.stat().as_info() + assert dct["size"] == pth_file.stat().st_size + + +def test_stat_atime(pth_file): + assert isinstance(pth_file.stat().st_atime, (float, int)) + + +def 
test_stat_mtime(pth_file): + assert isinstance(pth_file.stat().st_mtime, (float, int)) + + +def test_stat_ctime(pth_file): + assert isinstance(pth_file.stat().st_ctime, (float, int)) + + +def test_stat_seq_interface(pth_file): + assert len(tuple(pth_file.stat())) == os.stat_result.n_sequence_fields + assert isinstance(pth_file.stat().index(0), int) + assert isinstance(pth_file.stat().count(0), int) + assert isinstance(pth_file.stat()[0], int) + + +def test_stat_warn_if_dict_interface(pth_file): + with pytest.warns(DeprecationWarning): + pth_file.stat().keys() + + with pytest.warns(DeprecationWarning): + pth_file.stat().items() + + with pytest.warns(DeprecationWarning): + pth_file.stat().values() + + with pytest.warns(DeprecationWarning): + pth_file.stat().get("size") + + with pytest.warns(DeprecationWarning): + pth_file.stat().copy() + + with pytest.warns(DeprecationWarning): + _ = pth_file.stat()["size"] + + +@pytest.mark.parametrize( + "timestamp", + [ + 10, + datetime(1970, 1, 1, 0, 0, 10, tzinfo=timezone.utc), + "1970-01-01T00:00:10Z", + "1970-01-01T00:00:10+00:00", + ], +) +def test_timestamps(timestamp): + from upath._stat import UPathStatResult + + s = UPathStatResult( + [0] * 10, + { + "ctime": timestamp, + "atime": timestamp, + "mtime": timestamp, + "created": timestamp, + }, + ) + assert s.st_atime == 10.0 + assert s.st_ctime == 10.0 + assert s.st_mtime == 10.0 + + +def test_bad_timestamp(): + from upath._stat import UPathStatResult + + with pytest.raises(TypeError), pytest.warns( + RuntimeWarning, "universal_pathlib/issues" + ): + s = UPathStatResult([0] * 10, {"ctime": "bad"}) + _ = s.st_ctime