diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index ba05f23fcc..ec98af029e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -27,7 +27,7 @@ body: attributes: label: Python Version description: Version of Python interpreter - placeholder: 3.8.5, 3.9, 3.10, etc. + placeholder: 3.9, 3.10, 3.11, etc. validations: required: true - type: input diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9cb5ec9a78..9ceaab2ae7 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -3,9 +3,9 @@ contact_links: - name: ✨ Propose a new major feature url: https://github.com/zarr-developers/zarr-specs about: A new major feature should be discussed in the Zarr specifications repository. - - name: ❓ Discuss something on gitter - url: https://gitter.im/zarr-developers/community - about: For questions like "How do I do X with Zarr?", you can move to our Gitter channel. + - name: ❓ Discuss something on Zulip + url: https://ossci.zulipchat.com/ + about: For questions like "How do I do X with Zarr?", you can move to our Zulip Chat. - name: ❓ Discuss something on GitHub Discussions url: https://github.com/zarr-developers/zarr-python/discussions about: For questions like "How do I do X with Zarr?", you can move to GitHub Discussions. diff --git a/.github/dependabot.yml b/.github/dependabot.yml index d8e8d4d57a..5a0befe9b5 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,6 +5,10 @@ updates: directory: "/" schedule: interval: "daily" + groups: + requirements: + patterns: + - "*" - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 7013f1784f..bb3d433629 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -42,7 +42,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -56,7 +56,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -69,4 +69,4 @@ jobs: # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/minimal.yml b/.github/workflows/minimal.yml index 2c0cd45ca9..b5b2f48d62 100644 --- a/.github/workflows/minimal.yml +++ b/.github/workflows/minimal.yml @@ -15,7 +15,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v2.3.0 + uses: conda-incubator/setup-miniconda@v3.0.4 with: channels: conda-forge environment-file: environment.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index aa7158f1cf..f53cb2d9a9 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,13 +15,15 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - numpy_version: ['>=1.22.0', '==1.20.*'] + python-version: ['3.9', '3.10', '3.11', '3.12'] + numpy_version: ['>=1.24.0', '==1.23.*'] exclude: - python-version: '3.10' - numpy_version: '==1.20.*' + numpy_version: '==1.23.*' - python-version: '3.11' - numpy_version: '==1.20.*' + numpy_version: '==1.23.*' + - python-version: '3.12' + numpy_version: '==1.23.*' services: redis: image: redis @@ -42,7 +44,7 @@ jobs: with: fetch-depth: 0 - name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v2.3.0 + uses: conda-incubator/setup-miniconda@v3.0.4 with: channels: conda-forge python-version: ${{ matrix.python-version }} @@ -62,7 +64,7 @@ jobs: python -m pip install --upgrade pip python -m pip install -U pip setuptools wheel line_profiler python -m pip install -rrequirements_dev_minimal.txt numpy${{matrix.numpy_version}} -rrequirements_dev_optional.txt pymongo redis - python -m pip install . + python -m pip install -e . 
python -m pip freeze - name: Tests shell: "bash -l {0}" @@ -78,11 +80,8 @@ jobs: mkdir ~/blob_emulator azurite -l ~/blob_emulator --debug debug.log 2>&1 > stdouterr.log & pytest --cov=zarr --cov-config=pyproject.toml --doctest-plus --cov-report xml --cov=./ --timeout=300 - - uses: codecov/codecov-action@v3 + - uses: codecov/codecov-action@v4 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} with: - token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos - #files: ./coverage1.xml,./coverage2.xml # optional - #flags: unittests # optional - #name: codecov-umbrella # optional - #fail_ci_if_error: true # optional (default = false) verbose: true # optional (default = false) diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index c08bfc6677..8ac76c899b 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -16,10 +16,10 @@ jobs: submodules: true fetch-depth: 0 - - uses: actions/setup-python@v4.7.1 + - uses: actions/setup-python@v5.1.0 name: Install Python with: - python-version: '3.8' + python-version: '3.9' - name: Install PyBuild run: | @@ -36,7 +36,7 @@ jobs: else echo "All seem good" fi - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: releases path: dist @@ -45,7 +45,7 @@ jobs: needs: [build_artifacts] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: releases path: dist @@ -60,11 +60,11 @@ jobs: runs-on: ubuntu-latest if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: releases path: dist - - uses: pypa/gh-action-pypi-publish@v1.8.10 + - uses: pypa/gh-action-pypi-publish@v1.8.14 with: user: __token__ password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/windows-testing.yml b/.github/workflows/windows-testing.yml index 78945e97aa..ab86831aae 100644 --- a/.github/workflows/windows-testing.yml +++ b/.github/workflows/windows-testing.yml @@ -16,12 +16,12 @@ jobs: strategy: fail-fast: True matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.10', '3.11'] steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: conda-incubator/setup-miniconda@v2.3.0 + - uses: conda-incubator/setup-miniconda@v3.0.4 with: auto-update-conda: true python-version: ${{ matrix.python-version }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f22dc39832..be57770200 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,28 +6,25 @@ default_stages: [commit, push] default_language_version: python: python3 repos: - - repo: https://github.com/charliermarsh/ruff-pre-commit + - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: 'v0.0.224' + rev: 'v0.4.4' hooks: - id: ruff - # Respect `exclude` and `extend-exclude` settings. 
- args: ["--force-exclude"] - repo: https://github.com/psf/black - rev: 22.12.0 + rev: 24.4.2 hooks: - id: black - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell - args: ["-L", "ba,ihs,kake,nd,noe,nwo,te,fo,zar", "-S", "fixture"] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.6.0 hooks: - id: check-yaml - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.3.0 + rev: v1.10.0 hooks: - id: mypy files: zarr diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 08cac8d78d..e45cae1b45 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -16,4 +16,6 @@ python: extra_requirements: - docs -formats: all +formats: + - htmlzip + - pdf diff --git a/LICENSE.txt b/LICENSE.txt index 850a0d8772..a4de1c39d3 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2015-2023 Zarr Developers +Copyright (c) 2015-2024 Zarr Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b035ffa597..e379c9719f 100644 --- a/README.md +++ b/README.md @@ -70,10 +70,10 @@ - Gitter + Zulip - - + + diff --git a/TEAM.md b/TEAM.md new file mode 100644 index 0000000000..6a22d83d1f --- /dev/null +++ b/TEAM.md @@ -0,0 +1,26 @@ +## Active core-developers +- @joshmoore (Josh Moore) +- @jni (Juan Nunez-Iglesias) +- @rabernat (Ryan Abernathey) +- @jhamman (Joe Hamman) +- @d-v-b (Davis Bennett) +- @jakirkham (jakirkham) +- @martindurant (Martin Durant) +- @normanrz (Norman Rzepka) + +## Emeritus core-developers +- @alimanfoo (Alistair Miles) +- @shoyer (Stephan Hoyer) +- @ryan-williams (Ryan Williams) +- @jrbourbeau (James Bourbeau) +- @mzjp2 (Zain Patel) +- @grlee77 (Gregory Lee) + +## Former core-developers +- @jeromekelleher (Jerome Kelleher) +- @tjcrone (Tim Crone) +- @funkey (Jan Funke) +- @shikharsg +- @Carreau (Matthias Bussonnier) +- @dazzag24 +- @WardF (Ward Fisher) diff --git a/bench/compress_normal.py b/bench/compress_normal.py index 9f1655541c..803d54b76b 100644 --- a/bench/compress_normal.py +++ b/bench/compress_normal.py @@ -8,7 +8,6 @@ from zarr import blosc if __name__ == "__main__": - sys.path.insert(0, "..") # setup diff --git a/docs/api/v3.rst b/docs/api/v3.rst index 7665b2ddd1..3503e3fe81 100644 --- a/docs/api/v3.rst +++ b/docs/api/v3.rst @@ -1,13 +1,12 @@ V3 Specification Implementation(``zarr._storage.v3``) ===================================================== -This module contains the implementation of the `Zarr V3 Specification `_. +This module contains an experimental implementation of the `Zarr V3 Specification `_. .. warning:: - Since Zarr Python 2.12 release, this module provides experimental infrastructure for reading and - writing the upcoming V3 spec of the Zarr format. Users wishing to prepare for the migration can set - the environment variable ``ZARR_V3_EXPERIMENTAL_API=1`` to begin experimenting, however data - written with this API should be expected to become stale, as the implementation will still change. + The experimental v3 implementation included in Zarr Python >2.12,<3 is not aligned with the final + V3 specification. This version is deprecated and will be removed in Zarr Python 3.0 in favor of a + spec compliant version. The new ``zarr._store.v3`` package has the necessary classes and functions for evaluating Zarr V3. 
Since the design is not finalised, the classes and functions are not automatically imported into
diff --git a/docs/conf.py b/docs/conf.py
index 318843a9fb..048e77f51d 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -72,7 +72,7 @@
 # General information about the project.
 project = "zarr"
-copyright = "2023, Zarr Developers"
+copyright = "2024, Zarr Developers"
 author = "Zarr Developers"
 version = zarr.__version__
diff --git a/docs/index.rst b/docs/index.rst
index 06f79b7e7c..cf54e261af 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,13 +19,13 @@ Zarr-Python
 **Version**: |version|
-**Download documentation**: `PDF/Zipped HTML/EPUB `_
+**Download documentation**: `PDF/Zipped HTML `_
 **Useful links**: `Installation `_ | `Source Repository `_ | `Issue Tracker `_ |
-`Gitter `_
+`Zulip Chat `_
 Zarr is a file storage format for chunked, compressed, N-dimensional arrays based on an open-source specification.
diff --git a/docs/installation.rst b/docs/installation.rst
index 8553d451cb..35865c764d 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -6,6 +6,11 @@
 Zarr depends on NumPy. It is generally best to `install NumPy appropriate for your operating system and Python distribution. Other dependencies should be installed automatically if using one of the installation methods below.
+Note: Zarr has endorsed `Scientific-Python SPEC 0 `_ and now follows the version support window as outlined below:
+
+- Python: 36 months after initial release
+- Core package dependencies (e.g. NumPy): 24 months after initial release
+
 Install Zarr from PyPI::
   $ pip install zarr
diff --git a/docs/release.rst b/docs/release.rst
index 9873d62896..59051bbf97 100644
--- a/docs/release.rst
+++ b/docs/release.rst
@@ -13,11 +13,171 @@ Release notes
 # to document your changes. On releases it will be
 # re-indented so that it does not show up in the notes.
+.. note::
+    Zarr-Python 2.18.* is expected to be the final release in the 2.* series. Work on Zarr-Python 3.0 is underway.
+    See `GH1777 `_ for more details on the upcoming
+    3.0 release.
+
 .. _unreleased:
 Unreleased
 ----------
+Enhancements
+~~~~~~~~~~~~
+
+Docs
+~~~~
+
+Maintenance
+~~~~~~~~~~~
+
+Deprecations
+~~~~~~~~~~~~
+
+.. _release_2.18.1:
+
+2.18.1
+------
+
+Maintenance
+~~~~~~~~~~~
+* Fix a regression when getting or setting a single value from arrays with size-1 chunks.
+  By :user:`Deepak Cherian ` :issue:`1874`
+
+.. _release_2.18.0:
+
+2.18.0
+------
+
+Enhancements
+~~~~~~~~~~~~
+* Performance improvement for reading and writing chunks if any of the dimensions is size 1.
+  By :user:`Deepak Cherian ` :issue:`1730`.
+
+Maintenance
+~~~~~~~~~~~
+* Enable ruff/bugbear rules (B) and fix issues.
+  By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1702`.
+
+* Minor updates to use `np.inf` instead of `np.PINF` / `np.NINF` in preparation for NumPy 2.0.0 release.
+  By :user:`Joe Hamman ` :issue:`1842`.
+
+Deprecations
+~~~~~~~~~~~~
+
+* Deprecate experimental v3 support by issuing a `FutureWarning`.
+  Also updated docs to warn about using the experimental v3 version.
+  By :user:`Joe Hamman ` :issue:`1802` and :issue:`1807`.
+
+* Deprecate the following stores: :class:`zarr.storage.DBMStore`, :class:`zarr.storage.LMDBStore`,
+  :class:`zarr.storage.SQLiteStore`, :class:`zarr.storage.MongoDBStore`, :class:`zarr.storage.RedisStore`,
+  and :class:`zarr.storage.ABSStore`. These stores are slated to be removed from Zarr-Python in version 3.0.
+  By :user:`Joe Hamman ` :issue:`1801`.
+
+..
_release_2.17.2: + +2.17.2 +------ + +Enhancements +~~~~~~~~~~~~ + +* [v3] Dramatically reduce number of ``__contains__`` requests in favor of optimistically calling `__getitem__` + and handling any error that may arise. + By :user:`Deepak Cherian ` :issue:`1741`. + +* [v3] Reuse the downloaded array metadata when creating an ``Array``. + By :user:`Deepak Cherian ` :issue:`1734`. + +* Optimize ``Array.info`` so that it calls `getsize` only once. + By :user:`Deepak Cherian ` :issue:`1733`. + +* Override IPython ``_repr_*_`` methods to avoid expensive lookups against object stores. + By :user:`Deepak Cherian ` :issue:`1716`. + +* FSStore now raises rather than return bad data. + By :user:`Martin Durant ` and :user:`Ian Carroll ` :issue:`1604`. + +* Avoid redundant ``__contains__``. + By :user:`Deepak Cherian ` :issue:`1739`. + +Docs +~~~~ + +* Fix link to GCSMap in ``tutorial.rst``. + By :user:`Daniel Jahn ` :issue:`1689`. + +* Endorse `SPEC0000 `_ and state version support policy in ``installation.rst``. + By :user:`Sanket Verma ` :issue:`1665`. + +* Migrate v1 and v2 specification to `Zarr-Specs `_. + By :user:`Sanket Verma ` :issue:`1582`. + +Maintenance +~~~~~~~~~~~ + +* Add CI test environment for Python 3.12 + By :user:`Joe Hamman ` :issue:`1719`. + +* Bump minimum supported NumPy version to 1.23 (per spec 0000) + By :user:`Joe Hamman ` :issue:`1719`. + +* Minor fixes: Using ``is`` instead of ``type`` and removing unnecessary ``None``. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1737`. + +* Fix tests failure related to Pytest 8. + By :user:`David Stansby ` :issue:`1714`. + +.. _release_2.17.1: + +2.17.1 +------ + +Enhancements +~~~~~~~~~~~~ + +* Change occurrences of % and format() to f-strings. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1423`. + +* Proper argument for numpy.reshape. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1425`. + +* Add typing to dimension separator arguments. + By :user:`David Stansby ` :issue:`1620`. + +Docs +~~~~ + +* ZIP related tweaks. + By :user:`Davis Bennett ` :issue:`1641`. + +Maintenance +~~~~~~~~~~~ + +* Update config.yml with Zulip. + By :user:`Josh Moore `. + +* Replace Gitter with the new Zulip Chat link. + By :user:`Sanket Verma ` :issue:`1685`. + +* Fix RTD build. + By :user:`Sanket Verma ` :issue:`1694`. + +.. _release_2.17.0: + +2.17.0 +------ + +Enhancements +~~~~~~~~~~~~ + +* Added type hints to ``zarr.creation.create()``. + By :user:`David Stansby ` :issue:`1536`. + +* Pyodide support: Don't require fasteners on Emscripten. + By :user:`Hood Chatham ` :issue:`1663`. + Docs ~~~~ @@ -39,10 +199,30 @@ Docs * Minor tweak to advanced indexing tutorial examples. By :user:`Ross Barnowski ` :issue:`1550`. +* Automatically document array members using sphinx-automodapi. + By :user:`David Stansby ` :issue:`1547`. + +* Add a markdown file documenting the current and former core-developer team. + By :user:`Joe Hamman ` :issue:`1628`. + +* Add Norman Rzepka to core-dev team. + By :user:`Joe Hamman ` :issue:`1630`. + +* Added section about accessing ZIP archives on s3. + By :user:`Jeff Peck ` :issue:`1613`, :issue:`1615`, and :user:`Davis Bennett ` :issue:`1641`. + +* Add V3 roadmap and design document. + By :user:`Joe Hamman ` :issue:`1583`. Maintenance ~~~~~~~~~~~ +* Drop Python 3.8 and NumPy 1.20 + By :user:`Josh Moore `; :issue:`1557`. + +* Cache result of ``FSStore._fsspec_installed()``. + By :user:`Janick Martinez Esturo ` :issue:`1581`. + * Extend copyright notice to 2023. By :user:`Jack Kelly ` :issue:`1528`. 
@@ -61,6 +241,26 @@ Maintenance * Remove ``sphinx-rtd-theme`` dependency from ``pyproject.toml``. By :user:`Sanket Verma ` :issue:`1563`. +* Remove ``CODE_OF_CONDUCT.md`` file from the Zarr-Python repository. + By :user:`Sanket Verma ` :issue:`1572`. + +* Bump version of black in pre-commit. + By :user:`David Stansby ` :issue:`1559`. + +* Use list comprehension where applicable. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1555`. + +* Use format specification mini-language to format string. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1558`. + +* Single startswith() call instead of multiple ones. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1556`. + +* Move codespell options around. + By :user:`Dimitri Papadopoulos Orfanos ` :issue:`1196`. + +* Remove unused mypy ignore comments. + By :user:`David Stansby ` :issue:`1602`. .. _release_2.16.1: @@ -103,10 +303,10 @@ Maintenance By :user:`Davis Bennett ` :issue:`1462`. * Style the codebase with ``ruff`` and ``black``. - By :user:`Davis Bennett` :issue:`1459` + By :user:`Davis Bennett ` :issue:`1459` * Ensure that chunks is tuple of ints upon array creation. - By :user:`Philipp Hanslovsky` :issue:`1461` + By :user:`Philipp Hanslovsky ` :issue:`1461` .. _release_2.15.0: @@ -494,7 +694,7 @@ Maintenance By :user:`Saransh Chopra ` :issue:`1079`. * Remove option to return None from _ensure_store. - By :user:`Greggory Lee ` :issue:`1068`. + By :user:`Gregory Lee ` :issue:`1068`. * Fix a typo of "integers". By :user:`Richard Scott ` :issue:`1056`. @@ -512,7 +712,7 @@ Enhancements Since the format is not yet finalized, the classes and functions are not automatically imported into the regular `zarr` name space. Setting the `ZARR_V3_EXPERIMENTAL_API` environment variable will activate them. - By :user:`Greggory Lee `; :issue:`898`, :issue:`1006`, and :issue:`1007` + By :user:`Gregory Lee `; :issue:`898`, :issue:`1006`, and :issue:`1007` as well as by :user:`Josh Moore ` :issue:`1032`. * **Create FSStore from an existing fsspec filesystem**. If you have created @@ -634,7 +834,7 @@ Enhancements higher-level array creation and convenience functions still accept plain Python dicts or other mutable mappings for the ``store`` argument, but will internally convert these to a ``KVStore``. - By :user:`Greggory Lee `; :issue:`839`, :issue:`789`, and :issue:`950`. + By :user:`Gregory Lee `; :issue:`839`, :issue:`789`, and :issue:`950`. * Allow to assign array ``fill_values`` and update metadata accordingly. By :user:`Ryan Abernathey `, :issue:`662`. @@ -781,7 +981,7 @@ Bug fixes ~~~~~~~~~ * Fix FSStore.listdir behavior for nested directories. - By :user:`Greggory Lee `; :issue:`802`. + By :user:`Gregory Lee `; :issue:`802`. .. _release_2.9.4: @@ -865,7 +1065,7 @@ Bug fixes By :user:`Josh Moore `; :issue:`781`. * avoid NumPy 1.21.0 due to https://github.com/numpy/numpy/issues/19325 - By :user:`Greggory Lee `; :issue:`791`. + By :user:`Gregory Lee `; :issue:`791`. Maintenance ~~~~~~~~~~~ @@ -877,7 +1077,7 @@ Maintenance By :user:`Elliott Sales de Andrade `; :issue:`799`. * TST: add missing assert in test_hexdigest. - By :user:`Greggory Lee `; :issue:`801`. + By :user:`Gregory Lee `; :issue:`801`. .. _release_2.8.3: @@ -1521,11 +1721,11 @@ Bug fixes Documentation ~~~~~~~~~~~~~ -* Some changes have been made to the :ref:`spec_v2` document to clarify +* Some changes have been made to the Zarr Specification v2 document to clarify ambiguities and add some missing information. 
These changes do not break compatibility with any of the material as previously implemented, and so the changes have been made in-place in the document without incrementing the document version number. See the - section on :ref:`spec_v2_changes` in the specification document for more information. + section on changes in the specification document for more information. * A new :ref:`tutorial_indexing` section has been added to the tutorial. * A new :ref:`tutorial_strings` section has been added to the tutorial (:issue:`135`, :issue:`175`). diff --git a/docs/spec/v1.rst b/docs/spec/v1.rst index 13f68ef36e..27a0490e0a 100644 --- a/docs/spec/v1.rst +++ b/docs/spec/v1.rst @@ -3,268 +3,5 @@ Zarr Storage Specification Version 1 ==================================== -This document provides a technical specification of the protocol and -format used for storing a Zarr array. The key words "MUST", "MUST -NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", -"RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be -interpreted as described in `RFC 2119 -`_. - -Status ------- - -This specification is deprecated. See :ref:`spec` for the latest version. - -Storage -------- - -A Zarr array can be stored in any storage system that provides a -key/value interface, where a key is an ASCII string and a value is an -arbitrary sequence of bytes, and the supported operations are read -(get the sequence of bytes associated with a given key), write (set -the sequence of bytes associated with a given key) and delete (remove -a key/value pair). - -For example, a directory in a file system can provide this interface, -where keys are file names, values are file contents, and files can be -read, written or deleted via the operating system. Equally, an S3 -bucket can provide this interface, where keys are resource names, -values are resource contents, and resources can be read, written or -deleted via HTTP. - -Below an "array store" refers to any system implementing this -interface. - -Metadata --------- - -Each array requires essential configuration metadata to be stored, -enabling correct interpretation of the stored data. This metadata is -encoded using JSON and stored as the value of the 'meta' key within an -array store. - -The metadata resource is a JSON object. The following keys MUST be -present within the object: - -zarr_format - An integer defining the version of the storage specification to which the - array store adheres. -shape - A list of integers defining the length of each dimension of the array. -chunks - A list of integers defining the length of each dimension of a chunk of the - array. Note that all chunks within a Zarr array have the same shape. -dtype - A string or list defining a valid data type for the array. See also - the subsection below on data type encoding. -compression - A string identifying the primary compression library used to compress - each chunk of the array. -compression_opts - An integer, string or dictionary providing options to the primary - compression library. -fill_value - A scalar value providing the default value to use for uninitialized - portions of the array. -order - Either 'C' or 'F', defining the layout of bytes within each chunk of the - array. 'C' means row-major order, i.e., the last dimension varies fastest; - 'F' means column-major order, i.e., the first dimension varies fastest. - -Other keys MAY be present within the metadata object however they MUST -NOT alter the interpretation of the required fields defined above. 
-
-For example, the JSON object below defines a 2-dimensional array of
-64-bit little-endian floating point numbers with 10000 rows and 10000
-columns, divided into chunks of 1000 rows and 1000 columns (so there
-will be 100 chunks in total arranged in a 10 by 10 grid). Within each
-chunk the data are laid out in C contiguous order, and each chunk is
-compressed using the Blosc compression library::
-
-    {
-        "chunks": [
-            1000,
-            1000
-        ],
-        "compression": "blosc",
-        "compression_opts": {
-            "clevel": 5,
-            "cname": "lz4",
-            "shuffle": 1
-        },
-        "dtype": "<f8",
-        "fill_value": null,
-        "order": "C",
-        "shape": [
-            10000,
-            10000
-        ],
-        "zarr_format": 1
-    }
-
-Data type encoding
-------------------
-
-The data type of a Zarr array is encoded as a string following the
-`NumPy array protocol type string (typestr) format
-<https://numpy.org/doc/stable/reference/arrays.interface.html>`_. The
-format consists of 3 parts: a character describing the byteorder of
-the data (``<``: little-endian, ``>``: big-endian, ``|``:
-not-relevant), a character code giving the basic type of the array,
-and an integer providing the number of bytes the type uses. The byte
-order MUST be specified. E.g., ``"<f8"``, ``">i4"``, ``"|b1"`` and
-``"|S12"`` are valid data types.
-
-Structure data types (i.e., with multiple named fields) are encoded as
-a list of two-element lists, following `NumPy array protocol type
-descriptions (descr)
-<https://numpy.org/doc/stable/reference/arrays.interface.html>`_.
-For example, the JSON list ``[["r", "|u1"], ["g", "|u1"], ["b",
-"|u1"]]`` defines a data type composed of three single-byte unsigned
-integers labelled 'r', 'g' and 'b'.
-
-Chunks
-------
-
-Each chunk of the array is compressed by passing the raw bytes for the
-chunk through the primary compression library to obtain a new sequence
-of bytes comprising the compressed chunk data. No header is added to
-the compressed bytes or any other modification made. The internal
-structure of the compressed bytes will depend on which primary
-compressor was used. For example, the `Blosc compressor
-<https://github.com/Blosc/c-blosc>`_
-produces a sequence of bytes that begins with a 16-byte header
-followed by compressed data.
-
-The compressed sequence of bytes for each chunk is stored under a key
-formed from the index of the chunk within the grid of chunks
-representing the array. To form a string key for a chunk, the indices
-are converted to strings and concatenated with the period character
-('.') separating each index. For example, given an array with shape
-(10000, 10000) and chunk shape (1000, 1000) there will be 100 chunks
-laid out in a 10 by 10 grid. The chunk with indices (0, 0) provides
-data for rows 0-999 and columns 0-999 and is stored under the key
-'0.0'; the chunk with indices (2, 4) provides data for rows 2000-2999
-and columns 4000-4999 and is stored under the key '2.4'; etc.
-
-There is no need for all chunks to be present within an array
-store. If a chunk is not present then it is considered to be in an
-uninitialized state. An uninitialized chunk MUST be treated as if it
-was uniformly filled with the value of the 'fill_value' field in the
-array metadata. If the 'fill_value' field is ``null`` then the
-contents of the chunk are undefined.
-
-Note that all chunks in an array have the same shape. If the length of
-any array dimension is not exactly divisible by the length of the
-corresponding chunk dimension then some chunks will overhang the edge
-of the array. The contents of any chunk region falling outside the
-array are undefined.
-
-Attributes
-----------
-
-Each array can also be associated with custom attributes, which are
-simple key/value items with application-specific meaning. Custom
-attributes are encoded as a JSON object and stored under the 'attrs'
-key within an array store. Even if the attributes are empty, the
-'attrs' key MUST be present within an array store.
-
-For example, the JSON object below encodes three attributes named
-'foo', 'bar' and 'baz'::
-
-    {
-        "foo": 42,
-        "bar": "apples",
-        "baz": [1, 2, 3, 4]
-    }
-
-Example
--------
-
-Below is an example of storing a Zarr array, using a directory on the
-local file system as storage.
-
-Initialize the store::
-
-    >>> import zarr
-    >>> store = zarr.DirectoryStore('example.zarr')
-    >>> zarr.init_store(store, shape=(20, 20), chunks=(10, 10),
-    ...                 dtype='i4', fill_value=42, compression='zlib',
-    ...                 compression_opts=1, overwrite=True)
-
-No chunks are initialized yet, so only the 'meta' and 'attrs' keys
-have been set::
-
-    >>> import os
-    >>> sorted(os.listdir('example.zarr'))
-    ['attrs', 'meta']
-
-Inspect the array metadata::
-
-    >>> print(open('example.zarr/meta').read())
-    {
-        "chunks": [
-            10,
-            10
-        ],
-        "compression": "zlib",
-        "compression_opts": 1,
-        "dtype": "<i4",
-        "fill_value": 42,
-        "order": "C",
-        "shape": [
-            20,
-            20
-        ],
-        "zarr_format": 1
-    }
-
-Inspect the custom attributes::
-
-    >>> print(open('example.zarr/attrs').read())
-    {}
-
-Set some data::
-
-    >>> z = zarr.Array(store)
-    >>> z[0:10, 0:10] = 1
-    >>> sorted(os.listdir('example.zarr'))
-    ['0.0', 'attrs', 'meta']
-
-Set some more data::
-
-    >>> z[0:10, 10:20] = 2
-    >>> z[10:20, :] = 3
-    >>> sorted(os.listdir('example.zarr'))
-    ['0.0', '0.1', '1.0', '1.1', 'attrs', 'meta']
-
-Manually decompress a single chunk for illustration::
-
-    >>> import zlib
-    >>> b = zlib.decompress(open('example.zarr/0.0', 'rb').read())
-    >>> import numpy as np
-    >>> a = np.frombuffer(b, dtype='<i4')
-    >>> a
-    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
-
-Modify the array attributes::
-
-    >>> z.attrs['foo'] = 42
-    >>> z.attrs['bar'] = 'apples'
-    >>> z.attrs['baz'] = [1, 2, 3, 4]
-    >>> print(open('example.zarr/attrs').read())
-    {
-        "bar": "apples",
-        "baz": [
-            1,
-            2,
-            3,
-            4
-        ],
-        "foo": 42
-    }
+The V1 Specification has been migrated to its website →
+https://zarr-specs.readthedocs.io/.
diff --git a/docs/spec/v2.rst b/docs/spec/v2.rst
index c1e12e1218..deb6d46ce6 100644
--- a/docs/spec/v2.rst
+++ b/docs/spec/v2.rst
@@ -3,563 +3,5 @@
 Zarr Storage Specification Version 2
 ====================================
-This document provides a technical specification of the protocol and format
-used for storing Zarr arrays. The key words "MUST", "MUST NOT", "REQUIRED",
-"SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
-"OPTIONAL" in this document are to be interpreted as described in `RFC 2119
-`_.
-
-Status
-------
-
-This specification is the latest version. See :ref:`spec` for previous
-versions.
-
-.. _spec_v2_storage:
-
-Storage
--------
-
-A Zarr array can be stored in any storage system that provides a key/value
-interface, where a key is an ASCII string and a value is an arbitrary sequence
-of bytes, and the supported operations are read (get the sequence of bytes
-associated with a given key), write (set the sequence of bytes associated with
-a given key) and delete (remove a key/value pair).
-
-For example, a directory in a file system can provide this interface, where
-keys are file names, values are file contents, and files can be read, written
-or deleted via the operating system. Equally, an S3 bucket can provide this
-interface, where keys are resource names, values are resource contents, and
-resources can be read, written or deleted via HTTP.
- -Below an "array store" refers to any system implementing this interface. - -.. _spec_v2_array: - -Arrays ------- - -.. _spec_v2_array_metadata: - -Metadata -~~~~~~~~ - -Each array requires essential configuration metadata to be stored, enabling -correct interpretation of the stored data. This metadata is encoded using JSON -and stored as the value of the ".zarray" key within an array store. - -The metadata resource is a JSON object. The following keys MUST be present -within the object: - -zarr_format - An integer defining the version of the storage specification to which the - array store adheres. -shape - A list of integers defining the length of each dimension of the array. -chunks - A list of integers defining the length of each dimension of a chunk of the - array. Note that all chunks within a Zarr array have the same shape. -dtype - A string or list defining a valid data type for the array. See also - the subsection below on data type encoding. -compressor - A JSON object identifying the primary compression codec and providing - configuration parameters, or ``null`` if no compressor is to be used. - The object MUST contain an ``"id"`` key identifying the codec to be used. -fill_value - A scalar value providing the default value to use for uninitialized - portions of the array, or ``null`` if no fill_value is to be used. -order - Either "C" or "F", defining the layout of bytes within each chunk of the - array. "C" means row-major order, i.e., the last dimension varies fastest; - "F" means column-major order, i.e., the first dimension varies fastest. -filters - A list of JSON objects providing codec configurations, or ``null`` if no - filters are to be applied. Each codec configuration object MUST contain a - ``"id"`` key identifying the codec to be used. - -The following keys MAY be present within the object: - -dimension_separator - If present, either the string ``"."`` or ``"/"`` defining the separator placed - between the dimensions of a chunk. If the value is not set, then the - default MUST be assumed to be ``"."``, leading to chunk keys of the form "0.0". - Arrays defined with ``"/"`` as the dimension separator can be considered to have - nested, or hierarchical, keys of the form "0/0" that SHOULD where possible - produce a directory-like structure. - -Other keys SHOULD NOT be present within the metadata object and SHOULD be -ignored by implementations. - -For example, the JSON object below defines a 2-dimensional array of 64-bit -little-endian floating point numbers with 10000 rows and 10000 columns, divided -into chunks of 1000 rows and 1000 columns (so there will be 100 chunks in total -arranged in a 10 by 10 grid). Within each chunk the data are laid out in C -contiguous order. Each chunk is encoded using a delta filter and compressed -using the Blosc compression library prior to storage:: - - { - "chunks": [ - 1000, - 1000 - ], - "compressor": { - "id": "blosc", - "cname": "lz4", - "clevel": 5, - "shuffle": 1 - }, - "dtype": "`. 
The format
-consists of 3 parts:
-
-* One character describing the byteorder of the data (``"<"``: little-endian;
-  ``">"``: big-endian; ``"|"``: not-relevant)
-* One character code giving the basic type of the array (``"b"``: Boolean (integer
-  type where all values are only True or False); ``"i"``: integer; ``"u"``: unsigned
-  integer; ``"f"``: floating point; ``"c"``: complex floating point; ``"m"``: timedelta;
-  ``"M"``: datetime; ``"S"``: string (fixed-length sequence of char); ``"U"``: unicode
-  (fixed-length sequence of Py_UNICODE); ``"V"``: other (void * – each item is a
-  fixed-size chunk of memory))
-* An integer specifying the number of bytes the type uses.
-
-The byte order MUST be specified. E.g., ``"<f8"``, ``">i4"``, ``"|b1"`` and
-``"|S12"`` are valid data type encodings.
-
-For datetime64 ("M") and timedelta64 ("m") data types, these MUST also include the
-units within square brackets. A list of valid units and their definitions are given in
-the :ref:`NumPy documentation on Datetimes and Timedeltas
-<arrays.datetime>`.
-For example, ``"<M8[ns]"`` and ``">m8[us]"`` are valid data type encodings.
-
-Structured data types (i.e., with multiple named fields) are encoded as a list
-of lists, following the :ref:`NumPy array protocol type descriptions (descr)
-<arrays.interface>`. Each
-sub-list has the form ``[fieldname, datatype, shape]`` where ``shape``
-is optional. ``fieldname`` is a string, ``datatype`` is a string
-specifying a simple data type (see above), and ``shape`` is a list of
-integers specifying subarray shape. For example, the JSON list below
-defines a data type composed of three single-byte unsigned integer
-fields named "r", "g" and "b"::
-
-    [["r", "|u1"], ["g", "|u1"], ["b", "|u1"]]
-
-For example, the JSON list below defines a data type composed of three
-fields named "x", "y" and "z", where "x" and "y" each contain 32-bit
-floats, and each item in "z" is a 2 by 2 array of floats::
-
-    [["x", "<f4"], ["y", "<f4"], ["z", "<f4", [2, 2]]]
-
-.. _spec_v2_array_fill_value:
-
-Fill value encoding
-~~~~~~~~~~~~~~~~~~~
-
-For simple floating point data types, the following table MUST be used to
-encode values of the "fill_value" field:
-
-=================  ===============
-Value              JSON encoding
-=================  ===============
-Not a Number       ``"NaN"``
-Positive Infinity  ``"Infinity"``
-Negative Infinity  ``"-Infinity"``
-=================  ===============
-
-If an array has a fixed length byte string data type (e.g., ``"|S12"``), or a
-structured data type, and if the value of the "fill_value" field is a string,
-then the value MUST be encoded as an ASCII string using the standard Base64
-alphabet.
-
-.. _spec_v2_array_chunks:
-
-Chunks
-~~~~~~
-
-Each chunk of the array is compressed by passing the raw data for the chunk
-through the primary compression library to obtain a new sequence of bytes
-comprising the compressed chunk data. No header is added to the compressed
-bytes or any other modification made. The internal structure of the
-compressed bytes will depend on which primary compressor was used. For
-example, the `Blosc compressor
-<https://github.com/Blosc/c-blosc>`_
-produces a sequence of bytes that begins with a 16-byte header followed by
-compressed data.
-
-The compressed sequence of bytes for each chunk is stored under a key formed
-from the index of the chunk within the grid of chunks representing the array.
-To form a string key for a chunk, the indices are converted to strings and
-concatenated with the period character (".") separating each index. For
-example, given an array with shape (10000, 10000) and chunk shape (1000, 1000)
-there will be 100 chunks laid out in a 10 by 10 grid. The chunk with indices
-(0, 0) provides data for rows 0-999 and columns 0-999 and is stored under the
-key "0.0"; the chunk with indices (2, 4) provides data for rows 2000-2999 and
-columns 4000-4999 and is stored under the key "2.4"; etc.
-
-There is no need for all chunks to be present within an array store. If a chunk
-is not present then it is considered to be in an uninitialized state. An
-uninitialized chunk MUST be treated as if it was uniformly filled with the value
-of the "fill_value" field in the array metadata. If the "fill_value" field is
-``null`` then the contents of the chunk are undefined.
-
-Note that all chunks in an array have the same shape. If the length of any
-array dimension is not exactly divisible by the length of the corresponding
-chunk dimension then some chunks will overhang the edge of the array. The
-contents of any chunk region falling outside the array are undefined.
-
-.. _spec_v2_array_filters:
-
-Filters
-~~~~~~~
-
-Optionally a sequence of one or more filters can be used to transform chunk
-data prior to compression. When storing data, filters are applied in the order
-specified in array metadata to encode data, then the encoded data are passed to
-the primary compressor.
When retrieving data, stored chunk data are -decompressed by the primary compressor then decoded using filters in the -reverse order. - -.. _spec_v2_hierarchy: - -Hierarchies ------------ - -.. _spec_v2_hierarchy_paths: - -Logical storage paths -~~~~~~~~~~~~~~~~~~~~~ - -Multiple arrays can be stored in the same array store by associating each array -with a different logical path. A logical path is simply an ASCII string. The -logical path is used to form a prefix for keys used by the array. For example, -if an array is stored at logical path "foo/bar" then the array metadata will be -stored under the key "foo/bar/.zarray", the user-defined attributes will be -stored under the key "foo/bar/.zattrs", and the chunks will be stored under -keys like "foo/bar/0.0", "foo/bar/0.1", etc. - -To ensure consistent behaviour across different storage systems, logical paths -MUST be normalized as follows: - -* Replace all backward slash characters ("\\\\") with forward slash characters - ("/") -* Strip any leading "/" characters -* Strip any trailing "/" characters -* Collapse any sequence of more than one "/" character into a single "/" - character - -The key prefix is then obtained by appending a single "/" character to the -normalized logical path. - -After normalization, if splitting a logical path by the "/" character results -in any path segment equal to the string "." or the string ".." then an error -MUST be raised. - -N.B., how the underlying array store processes requests to store values under -keys containing the "/" character is entirely up to the store implementation -and is not constrained by this specification. E.g., an array store could simply -treat all keys as opaque ASCII strings; equally, an array store could map -logical paths onto some kind of hierarchical storage (e.g., directories on a -file system). - -.. _spec_v2_hierarchy_groups: - -Groups -~~~~~~ - -Arrays can be organized into groups which can also contain other groups. A -group is created by storing group metadata under the ".zgroup" key under some -logical path. E.g., a group exists at the root of an array store if the -".zgroup" key exists in the store, and a group exists at logical path "foo/bar" -if the "foo/bar/.zgroup" key exists in the store. - -If the user requests a group to be created under some logical path, then groups -MUST also be created at all ancestor paths. E.g., if the user requests group -creation at path "foo/bar" then groups MUST be created at path "foo" and the -root of the store, if they don't already exist. - -If the user requests an array to be created under some logical path, then -groups MUST also be created at all ancestor paths. E.g., if the user requests -array creation at path "foo/bar/baz" then groups must be created at path -"foo/bar", path "foo", and the root of the store, if they don't already exist. - -The group metadata resource is a JSON object. The following keys MUST be present -within the object: - -zarr_format - An integer defining the version of the storage specification to which the - array store adheres. - -Other keys MUST NOT be present within the metadata object. - -The members of a group are arrays and groups stored under logical paths that -are direct children of the parent group's logical path. E.g., if groups exist -under the logical paths "foo" and "foo/bar" and an array exists at logical path -"foo/baz" then the members of the group at path "foo" are the group at path -"foo/bar" and the array at path "foo/baz". - -.. 
_spec_v2_attrs:
-
-Attributes
-----------
-
-An array or group can be associated with custom attributes, which are arbitrary
-key/value pairs with application-specific meaning. Custom attributes are encoded
-as a JSON object and stored under the ".zattrs" key within an array store. The
-".zattrs" key does not have to be present, and if it is absent the attributes
-should be treated as empty.
-
-For example, the JSON object below encodes three attributes named
-"foo", "bar" and "baz"::
-
-    {
-        "foo": 42,
-        "bar": "apples",
-        "baz": [1, 2, 3, 4]
-    }
-
-.. _spec_v2_examples:
-
-Examples
---------
-
-Storing a single array
-~~~~~~~~~~~~~~~~~~~~~~
-
-Below is an example of storing a Zarr array, using a directory on the
-local file system as storage.
-
-Create an array::
-
-    >>> import zarr
-    >>> store = zarr.DirectoryStore('data/example.zarr')
-    >>> a = zarr.create(shape=(20, 20), chunks=(10, 10), dtype='i4',
-    ...                 fill_value=42, compressor=zarr.Zlib(level=1),
-    ...                 store=store, overwrite=True)
-
-No chunks are initialized yet, so only the ".zarray" and ".zattrs" keys
-have been set in the store::
-
-    >>> import os
-    >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray']
-
-Inspect the array metadata::
-
-    >>> print(open('data/example.zarr/.zarray').read())
-    {
-        "chunks": [
-            10,
-            10
-        ],
-        "compressor": {
-            "id": "zlib",
-            "level": 1
-        },
-        "dtype": "<i4",
-        "fill_value": 42,
-        "filters": null,
-        "order": "C",
-        "shape": [
-            20,
-            20
-        ],
-        "zarr_format": 2
-    }
-
-Chunks are initialized on demand. E.g., set some data::
-
-    >>> a[0:10, 0:10] = 1
-    >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '0.0']
-
-Set some more data::
-
-    >>> a[0:10, 10:20] = 2
-    >>> a[10:20, :] = 3
-    >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '0.0', '0.1', '1.0', '1.1']
-
-Manually decompress a single chunk for illustration::
-
-    >>> import zlib
-    >>> buf = zlib.decompress(open('data/example.zarr/0.0', 'rb').read())
-    >>> import numpy as np
-    >>> chunk = np.frombuffer(buf, dtype='<i4')
-    >>> chunk
-    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-           1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
-
-Modify the array attributes::
-
-    >>> a.attrs['foo'] = 42
-    >>> a.attrs['bar'] = 'apples'
-    >>> a.attrs['baz'] = [1, 2, 3, 4]
-    >>> sorted(os.listdir('data/example.zarr'))
-    ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1']
-    >>> print(open('data/example.zarr/.zattrs').read())
-    {
-        "bar": "apples",
-        "baz": [
-            1,
-            2,
-            3,
-            4
-        ],
-        "foo": 42
-    }
-
-Storing multiple arrays in a hierarchy
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Below is an example of storing multiple Zarr arrays organized into a group
-hierarchy, using a directory on the local file system as storage. This storage
-implementation maps logical paths onto directory paths on the file system,
-however this is an implementation choice and is not required.
- -Setup the store:: - - >>> import zarr - >>> store = zarr.DirectoryStore('data/group.zarr') - -Create the root group:: - - >>> root_grp = zarr.group(store, overwrite=True) - -The metadata resource for the root group has been created:: - - >>> import os - >>> sorted(os.listdir('data/group.zarr')) - ['.zgroup'] - -Inspect the group metadata:: - - >>> print(open('data/group.zarr/.zgroup').read()) - { - "zarr_format": 2 - } - -Create a sub-group:: - - >>> sub_grp = root_grp.create_group('foo') - -What has been stored:: - - >>> sorted(os.listdir('data/group.zarr')) - ['.zgroup', 'foo'] - >>> sorted(os.listdir('data/group.zarr/foo')) - ['.zgroup'] - -Create an array within the sub-group:: - - >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10)) - >>> a[:] = 42 - -Set a custom attributes:: - - >>> a.attrs['comment'] = 'answer to life, the universe and everything' - -What has been stored:: - - >>> sorted(os.listdir('data/group.zarr')) - ['.zgroup', 'foo'] - >>> sorted(os.listdir('data/group.zarr/foo')) - ['.zgroup', 'bar'] - >>> sorted(os.listdir('data/group.zarr/foo/bar')) - ['.zarray', '.zattrs', '0.0', '0.1', '1.0', '1.1'] - -Here is the same example using a Zip file as storage:: - - >>> store = zarr.ZipStore('data/group.zip', mode='w') - >>> root_grp = zarr.group(store) - >>> sub_grp = root_grp.create_group('foo') - >>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10)) - >>> a[:] = 42 - >>> a.attrs['comment'] = 'answer to life, the universe and everything' - >>> store.close() - -What has been stored:: - - >>> import zipfile - >>> zf = zipfile.ZipFile('data/group.zip', mode='r') - >>> for name in sorted(zf.namelist()): - ... print(name) - .zgroup - foo/.zgroup - foo/bar/.zarray - foo/bar/.zattrs - foo/bar/0.0 - foo/bar/0.1 - foo/bar/1.0 - foo/bar/1.1 - -.. _spec_v2_changes: - -Changes -------- - -Version 2 clarifications -~~~~~~~~~~~~~~~~~~~~~~~~ - -The following changes have been made to the version 2 specification since it was -initially published to clarify ambiguities and add some missing information. - -* The specification now describes how bytes fill values should be encoded and - decoded for arrays with a fixed-length byte string data type (:issue:`165`, - :issue:`176`). - -* The specification now clarifies that units must be specified for datetime64 and - timedelta64 data types (:issue:`85`, :issue:`215`). - -* The specification now clarifies that the '.zattrs' key does not have to be present for - either arrays or groups, and if absent then custom attributes should be treated as - empty. - -* The specification now describes how structured datatypes with - subarray shapes and/or with nested structured data types are encoded - in array metadata (:issue:`111`, :issue:`296`). - -* Clarified the key/value pairs of custom attributes as "arbitrary" rather than - "simple". - -Changes from version 1 to version 2 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The following changes were made between version 1 and version 2 of this specification: - -* Added support for storing multiple arrays in the same store and organising - arrays into hierarchies using groups. -* Array metadata is now stored under the ".zarray" key instead of the "meta" - key. -* Custom attributes are now stored under the ".zattrs" key instead of the - "attrs" key. -* Added support for filters. -* Changed encoding of "fill_value" field within array metadata. -* Changed encoding of compressor information within array metadata to be - consistent with representation of filter information. 
+The V2 Specification has been migrated to its website → +https://zarr-specs.readthedocs.io/. diff --git a/docs/spec/v3.rst b/docs/spec/v3.rst index bd8852707b..3d39f35ba6 100644 --- a/docs/spec/v3.rst +++ b/docs/spec/v3.rst @@ -1,7 +1,7 @@ .. _spec_v3: Zarr Storage Specification Version 3 -======================================================= +==================================== The V3 Specification has been migrated to its website → https://zarr-specs.readthedocs.io/. diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 4099bac1c8..214dd4f63f 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -774,7 +774,7 @@ the following code:: Any other compatible storage class could be used in place of :class:`zarr.storage.DirectoryStore` in the code examples above. For example, -here is an array stored directly into a Zip file, via the +here is an array stored directly into a ZIP archive, via the :class:`zarr.storage.ZipStore` class:: >>> store = zarr.ZipStore('data/example.zip', mode='w') @@ -798,12 +798,12 @@ Re-open and check that data have been written:: [42, 42, 42, ..., 42, 42, 42]], dtype=int32) >>> store.close() -Note that there are some limitations on how Zip files can be used, because items -within a Zip file cannot be updated in place. This means that data in the array +Note that there are some limitations on how ZIP archives can be used, because items +within a ZIP archive cannot be updated in place. This means that data in the array should only be written once and write operations should be aligned with chunk boundaries. Note also that the ``close()`` method must be called after writing any data to the store, otherwise essential records will not be written to the -underlying zip file. +underlying ZIP archive. Another storage alternative is the :class:`zarr.storage.DBMStore` class, added in Zarr version 2.2. This class allows any DBM-style database to be used for @@ -846,7 +846,7 @@ respectively require the `redis-py `_ and `pymongo `_ packages to be installed. For compatibility with the `N5 `_ data format, Zarr also provides -an N5 backend (this is currently an experimental feature). Similar to the zip storage class, an +an N5 backend (this is currently an experimental feature). Similar to the ZIP storage class, an :class:`zarr.n5.N5Store` can be instantiated directly:: >>> store = zarr.N5Store('data/example.n5') @@ -868,7 +868,7 @@ implementations of the ``MutableMapping`` interface for Amazon S3 (`S3Map Distributed File System (`HDFSMap `_) and Google Cloud Storage (`GCSMap -`_), which +`_), which can be used with Zarr. Here is an example using S3Map to read an array created previously:: @@ -1000,6 +1000,32 @@ separately from Zarr. .. _tutorial_copy: +Accessing ZIP archives on S3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The built-in :class:`zarr.storage.ZipStore` will only work with paths on the local file-system; however +it is possible to access ZIP-archived Zarr data on the cloud via the `ZipFileSystem `_ +class from ``fsspec``. 
The following example demonstrates how to access
+a ZIP-archived Zarr group on s3 using `s3fs `_ and ``ZipFileSystem``:
+
+    >>> import s3fs
+    >>> import zarr
+    >>> from fsspec.implementations.zip import ZipFileSystem
+    >>> from fsspec.mapping import FSMap
+    >>>
+    >>> s3_path = "s3://path/to/my.zarr.zip"
+    >>>
+    >>> s3 = s3fs.S3FileSystem()
+    >>> f = s3.open(s3_path)
+    >>> fs = ZipFileSystem(f, mode="r")
+    >>> store = FSMap("", fs, check=False)
+    >>>
+    >>> # caching may improve performance when repeatedly reading the same data
+    >>> cache = zarr.storage.LRUStoreCache(store, max_size=2**28)
+    >>> z = zarr.group(store=cache)
+
+This store can also be generated with ``fsspec``'s handler chaining, like so:
+
+    >>> store = zarr.storage.FSStore(url=f"zip::{s3_path}", mode="r")
+
+This can be especially useful if you have a very large ZIP-archived Zarr array or group on s3
+and only need to access a small portion of it.
+
 Consolidating metadata
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -1136,7 +1162,7 @@
 re-compression, and so should be faster. E.g.::
     └── spam (100,) int64
     >>> new_root['foo/bar/baz'][:]
     array([ 0,  1,  2, ..., 97, 98, 99])
-    >>> store2.close()  # zip stores need to be closed
+    >>> store2.close()  # ZIP stores need to be closed
 .. _tutorial_strings:
diff --git a/environment.yml b/environment.yml
index dc99507427..ff2f9eedef 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,7 +4,7 @@ channels:
 dependencies:
   - wheel
   - numcodecs >= 0.6.4
-  - numpy >= 1.20
+  - numpy >= 1.21
   - pip
   - pip:
     - asciitree
diff --git a/pyproject.toml b/pyproject.toml
index 22ea19f28f..dacd45ec2c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,11 +10,11 @@
 readme = { file = "README.md", content-type = "text/markdown" }
 maintainers = [
     { name = "Alistair Miles", email = "alimanfoo@googlemail.com" }
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 dependencies = [
     'asciitree',
-    'numpy>=1.20,!=1.21.0',
-    'fasteners',
+    'numpy>=1.23',
+    'fasteners; sys_platform != "emscripten"',
     'numcodecs>=0.10.0',
 ]
 dynamic = [
@@ -30,7 +30,6 @@
 classifiers = [
     'Topic :: Software Development :: Libraries :: Python Modules',
     'Operating System :: Unix',
     'Programming Language :: Python :: 3',
-    'Programming Language :: Python :: 3.8',
     'Programming Language :: Python :: 3.9',
     'Programming Language :: Python :: 3.10',
     'Programming Language :: Python :: 3.11',
@@ -65,6 +64,7 @@
 Homepage = "https://github.com/zarr-developers/zarr-python"
 exclude_lines = [
     "pragma: no cover",
     "pragma: ${PY_MAJOR_VERSION} no cover",
+    '.*\.\.\.'  # Ignore "..." lines
]
[tool.coverage.run]
@@ -103,6 +103,11 @@
 exclude = [
     "docs"
 ]
+[tool.ruff.lint]
+extend-select = [
+    "B"
+]
+
 [tool.black]
 line-length = 100
 exclude = '''
@@ -119,9 +124,10 @@
 exclude = '''
 '''
 [tool.mypy]
-python_version = "3.8"
 ignore_missing_imports = true
-follow_imports = "silent"
+warn_unused_configs = true
+warn_redundant_casts = true
+warn_unused_ignores = true
 [tool.pytest.ini_options]
 doctest_optionflags = [
@@ -136,4 +142,11 @@
 filterwarnings = [
     "error:::zarr.*",
     "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning",
     "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning",
+    "ignore:The .* is deprecated and will be removed in a Zarr-Python version 3*:FutureWarning",
+    "ignore:The experimental Zarr V3 implementation in this version .*:FutureWarning",
 ]
+
+
+[tool.codespell]
+ignore-words-list = "ba,ihs,kake,nd,noe,nwo,te,fo,zar"
+skip = 'fixture,.git'
diff --git a/requirements_dev_minimal.txt b/requirements_dev_minimal.txt
index e2be6eb825..5d156db655 100644
--- a/requirements_dev_minimal.txt
+++ b/requirements_dev_minimal.txt
@@ -1,8 +1,8 @@
 # library requirements
 asciitree==0.3.3
 fasteners==0.19
-numcodecs==0.11.0
+numcodecs==0.12.1
 msgpack-python==0.5.6
 setuptools-scm==8.0.4
 # test requirements
-pytest==7.4.3
+pytest==8.1.1
diff --git a/requirements_dev_numpy.txt b/requirements_dev_numpy.txt
index a6135bd831..d8d6c3d097 100644
--- a/requirements_dev_numpy.txt
+++ b/requirements_dev_numpy.txt
@@ -1,4 +1,4 @@
 # Break this out into a separate file to allow testing against
 # different versions of numpy. This file should pin to the latest
 # numpy version.
-numpy==1.24.3
+numpy==1.26.4
diff --git a/requirements_dev_optional.txt b/requirements_dev_optional.txt
index f3ea80a546..3456cca21a 100644
--- a/requirements_dev_optional.txt
+++ b/requirements_dev_optional.txt
@@ -3,21 +3,21 @@
 lmdb==1.4.1; sys_platform != 'win32'
 # optional library requirements for Jupyter
 ipytree==0.2.2
-ipywidgets==8.1.0
+ipywidgets==8.1.2
 # optional library requirements for services
 # don't let pyup change pinning for azure-storage-blob, need to pin to older
 # version to get compatibility with azure storage emulator on appveyor (FIXME)
 azure-storage-blob==12.16.0  # pyup: ignore
-redis==5.0.1
+redis==5.0.4
 types-redis
 types-setuptools
-pymongo==4.5.0
+pymongo==4.6.3
 # optional test requirements
 coverage
-pytest-cov==4.1.0
-pytest-doctestplus==1.0.0
-pytest-timeout==2.2.0
-h5py==3.10.0
-fsspec==2023.10.0
-s3fs==2023.10.0
-moto[server]>=4.0.8
+pytest-cov==5.0.0
+pytest-doctestplus==1.2.1
+pytest-timeout==2.3.1
+h5py==3.11.0
+fsspec==2023.12.2
+s3fs==2023.12.2
+moto[server]>=5.0.1
diff --git a/v3-roadmap-and-design.md b/v3-roadmap-and-design.md
new file mode 100644
index 0000000000..696799e56f
--- /dev/null
+++ b/v3-roadmap-and-design.md
@@ -0,0 +1,429 @@
+# Zarr Python Roadmap
+
+- Status: draft
+- Author: Joe Hamman
+- Created On: October 31, 2023
+- Input from:
+  - Davis Bennett / @d-v-b
+  - Norman Rzepka / @normanrz
+  - Deepak Cherian @dcherian
+  - Brian Davis / @monodeldiablo
+  - Oliver McCormack / @olimcc
+  - Ryan Abernathey / @rabernat
+  - Jack Kelly / @JackKelly
+  - Martin Durant / @martindurant
+
+## Introduction
+
+This document lays out a design proposal for version 3.0 of the [Zarr-Python](https://zarr.readthedocs.io/en/stable/) package.
+A specific focus of the design is to bring Zarr-Python's API up to date with the [Zarr V3 specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html), with the hope of enabling the development of the many features and extensions that motivated the V3 Spec. The ideas presented here are expected to result in a major release of Zarr-Python (version 3.0) including a significant number of breaking API changes.
+For clarity, “V3” will be used to describe the version of the Zarr specification and “3.0” will be used to describe the release tag of the Zarr-Python project.
+
+### Current status of V3 in Zarr-Python
+
+During the development of the V3 Specification, a [prototype implementation](https://github.com/zarr-developers/zarr-python/pull/898) was added to the Zarr-Python library. Since that implementation, the V3 spec has evolved in significant ways, and as a result the Zarr-Python library is now out of sync with the approved spec. Downstream libraries (e.g. [Xarray](https://github.com/pydata/xarray)) have added support for this implementation and will need to migrate to the accepted spec when it's available in Zarr-Python.
+
+## Goals
+
+- Provide a complete implementation of Zarr V3 through the Zarr-Python API
+- Clear the way for exciting extensions / ZEPs (e.g. [sharding](https://zarr-specs.readthedocs.io/en/latest/v3/codecs/sharding-indexed/v1.0.html), [variable chunking](https://zarr.dev/zeps/draft/ZEP0003.html), etc.)
+- Provide a developer API that can be used to implement and register V3 extensions
+- Improve the performance of Zarr-Python by streamlining the interface between the Store layer and higher level APIs (e.g. Groups and Arrays)
+- Clean up the internal and user facing APIs
+- Improve code quality and robustness (e.g. achieve 100% type hint coverage)
+- Align the Zarr-Python array API with the [array API Standard](https://data-apis.org/array-api/latest/)
+
+## Examples of what 3.0 will enable
+1. Reading and writing V3 spec-compliant groups and arrays
+2. V3 extensions including sharding and variable chunking
+3. Improved performance by leveraging concurrency when creating/reading/writing to stores (imagine a `create_hierarchy(zarr_objects)` function)
+4. User-developed extensions (e.g. storage-transformers) that can be registered with Zarr-Python at runtime
+
+## Non-goals (of this document)
+
+- Implementation of any unaccepted Zarr V3 extensions
+- Major revisions to the Zarr V3 spec
+
+## Requirements
+
+1. Read and write spec compliant V2 and V3 data
+2. Limit unnecessary traffic to/from the store
+3. Cleanly define the Array/Group/Store abstractions
+4. Cleanly define how V2 will be supported going forward
+5. Provide a clear roadmap to help users upgrade to 3.0
+6. Developer tools / hooks for registering extensions
+
+## Design
+
+### Async API
+
+Zarr-Python is an IO library. As such, supporting concurrent action against the storage layer is critical to achieving acceptable performance. Zarr-Python 2 was not designed with asynchronous computation in mind and as a result has struggled to effectively leverage the benefits of concurrency. At one point, `getitems` and `setitems` support was added to the Zarr store model, but that is only used for operating on a set of chunks in a single variable.
+
+With Zarr-Python 3.0, we have the opportunity to revisit this design. The proposal here is as follows:
+
+1. The `Store` interface will be entirely async.
+2. On top of the async `Store` interface, we will provide an `AsyncArray` and `AsyncGroup` interface.
+3. Finally, the primary user facing API will be synchronous `Array` and `Group` classes that wrap the async equivalents.
+
+**Examples**
+
+- **Store**
+
+  ```python
+  class Store:
+      ...
+      async def get(self, key: str) -> bytes:
+          ...
+      async def get_partial_values(self, key_ranges: List[Tuple[str, Tuple[int, Optional[int]]]]) -> bytes:
+          ...
+      # (no sync interface here)
+  ```
+- **Array**
+
+  ```python
+  class AsyncArray:
+      ...
+
+      async def getitem(self, selection: Selection) -> np.ndarray:
+          # the core logic for getitem goes here
+          ...
+
+  class Array:
+      _async_array: AsyncArray
+
+      def __getitem__(self, selection: Selection) -> np.ndarray:
+          return sync(self._async_array.getitem(selection))
+  ```
+- **Group**
+
+  ```python
+  class AsyncGroup:
+      ...
+
+      async def create_group(self, path: str, **kwargs) -> AsyncGroup:
+          # the core logic for create_group goes here
+          ...
+
+  class Group:
+      _async_group: AsyncGroup
+
+      def create_group(self, path: str, **kwargs) -> Group:
+          return sync(self._async_group.create_group(path, **kwargs))
+  ```
+
+**Internal Synchronization API**
+
+With the `Store` and core `AsyncArray`/`AsyncGroup` classes being predominantly async, Zarr-Python will need an internal API to provide a synchronous API. The proposal here is to use the approach in [fsspec](https://github.com/fsspec/filesystem_spec/blob/master/fsspec/asyn.py) to provide a high-level `sync` function that takes an `awaitable` and runs it in its managed IO Loop / thread; a sketch of this helper follows the FAQ below.
+
+**FAQ**
+1. Why two levels of Arrays/groups?
+   a. First, this is an intentional decision and departure from the current Zarrita implementation.
+   b. The idea is that users rarely want to mix interfaces. Either they are working within an async context (currently quite rare) or they are in a typical synchronous context.
+   c. Splitting the two will allow us to clearly define behavior on the `AsyncObj` and simply wrap it in the `SyncObj`.
+2. What if a store only has a synchronous backend?
+   a. First off, this is expected to be a fairly rare occurrence. Most storage backends have async interfaces.
+   b. But in the event a storage backend doesn't have an async interface, there is nothing wrong with putting synchronous code in `async` methods. There are approaches to enabling concurrent action through wrappers like AsyncIO's `loop.run_in_executor` ([ref 1](https://stackoverflow.com/questions/38865050/is-await-in-python3-cooperative-multitasking), [ref 2](https://stackoverflow.com/a/43263397/732596), [ref 3](https://bbc.github.io/cloudfit-public-docs/asyncio/asyncio-part-5.html), [ref 4](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor)).
+3. Will Zarr help manage the async contexts encouraged by some libraries (e.g. [AioBotoCore](https://aiobotocore.readthedocs.io/en/latest/tutorial.html#using-botocore))?
+   a. Many async IO libraries require entering an async context before interacting with the API. We expect some experimentation to be needed here, but the initial design will follow something close to what fsspec does ([example in s3fs](https://github.com/fsspec/s3fs/blob/949442693ec940b35cda3420c17a864fbe426567/s3fs/core.py#L527)).
+4. Why not provide a synchronous Store interface?
+   a. We could, but this design is simpler. It would mean supporting it in the `AsyncGroup` and `AsyncArray` classes, which may be more trouble than it's worth. Storage backends that do not have an async API will be encouraged to wrap blocking calls in an async wrapper (e.g. `loop.run_in_executor`).
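+
+As a rough illustration of the fsspec-style approach described above, a minimal sketch of the `sync` helper might look like the following. The helper name `sync` matches the examples above; the dedicated daemon-thread event loop shown here is an assumption for illustration, not the final implementation:
+
+```python
+import asyncio
+import threading
+from typing import Any, Coroutine
+
+# One IO loop running forever on a dedicated daemon thread, shared by all wrappers
+# (assumed setup; the real implementation may manage loop lifecycle differently).
+_io_loop = asyncio.new_event_loop()
+threading.Thread(target=_io_loop.run_forever, name="zarr-io", daemon=True).start()
+
+
+def sync(coro: Coroutine[Any, Any, Any]) -> Any:
+    # Hand the coroutine to the IO loop's thread and block the caller until it completes.
+    future = asyncio.run_coroutine_threadsafe(coro, _io_loop)
+    return future.result()
+```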
+
+### Store API
+
+The `Store` API is specified directly in the V3 specification. All V3 stores should implement this abstract API, omitting Write and List support as needed. As described above, all stores will be expected to expose the required methods as async methods.
+
+**Example**
+
+```python
+class ReadWriteStore:
+    ...
+    async def get(self, key: str) -> bytes:
+        ...
+
+    async def get_partial_values(self, key_ranges: List[Tuple[str, int, int]]) -> bytes:
+        ...
+
+    async def set(self, key: str, value: Union[bytes, bytearray, memoryview]) -> None:
+        ...  # required for writable stores
+
+    async def set_partial_values(self, key_start_values: List[Tuple[str, int, Union[bytes, bytearray, memoryview]]]) -> None:
+        ...  # required for writable stores
+
+    async def list(self) -> List[str]:
+        ...  # required for listable stores
+
+    async def list_prefix(self, prefix: str) -> List[str]:
+        ...  # required for listable stores
+
+    async def list_dir(self, prefix: str) -> List[str]:
+        ...  # required for listable stores
+
+    # additional (optional) methods
+    async def getsize(self, prefix: str) -> int:
+        ...
+
+    async def rename(self, src: str, dest: str) -> None:
+        ...
+```
+
+Recognizing that there are many Zarr applications today that rely on the `MutableMapping` interface supported by Zarr-Python 2, a wrapper store will be developed to allow existing stores to plug directly into this API.
+
+### Array API
+
+The user facing array interface will implement a subset of the [Array API Standard](https://data-apis.org/array-api/latest/). Most of the computational parts of the Array API Standard don’t fit into Zarr right now. That’s okay. What matters most is that we ensure we can give downstream applications a compliant API.
+
+*Note: Zarr already does most of this, so this is more about formalizing the relationship than a substantial change in API.*
+
+| | Included | Not Included | Unknown / Maybe possible? |
+| --- | --- | --- | --- |
+| Attributes | `dtype` | `mT` | `device` |
+| | `ndim` | `T` | |
+| | `shape` | | |
+| | `size` | | |
+| Methods | `__getitem__` | `__array_namespace__` | `to_device` |
+| | `__setitem__` | `__abs__` | `__bool__` |
+| | `__eq__` | `__add__` | `__complex__` |
+| | `__bool__` | `__and__` | `__dlpack__` |
+| | | `__floordiv__` | `__dlpack_device__` |
+| | | `__ge__` | `__float__` |
+| | | `__gt__` | `__index__` |
+| | | `__invert__` | `__int__` |
+| | | `__le__` | |
+| | | `__lshift__` | |
+| | | `__lt__` | |
+| | | `__matmul__` | |
+| | | `__mod__` | |
+| | | `__mul__` | |
+| | | `__ne__` | |
+| | | `__neg__` | |
+| | | `__or__` | |
+| | | `__pos__` | |
+| | | `__pow__` | |
+| | | `__rshift__` | |
+| | | `__sub__` | |
+| | | `__truediv__` | |
+| | | `__xor__` | |
+| Creation functions (`zarr.creation`) | `zeros` | | `arange` |
+| | `zeros_like` | | `asarray` |
+| | `ones` | | `eye` |
+| | `ones_like` | | `from_dlpack` |
+| | `full` | | `linspace` |
+| | `full_like` | | `meshgrid` |
+| | `empty` | | `tril` |
+| | `empty_like` | | `triu` |
+
+In addition to the core array API defined above, the Array class should have the following Zarr specific properties:
+
+- `.metadata` (see Metadata Interface below)
+- `.attrs` - (pull from metadata object)
+- `.info` - (pull from existing property †)
+
+*† In Zarr-Python 2, the info property lists the store to identify initialized chunks. By default this will be turned off in 3.0 but will be configurable.*
+
+**Indexing**
+
+Zarr-Python currently supports `__getitem__` style indexing and the special `oindex` and `vindex` indexers. These are not part of the current Array API standard (see [data-apis/array-api\#669](https://github.com/data-apis/array-api/issues/669)) but they have been [proposed as a NEP](https://numpy.org/neps/nep-0021-advanced-indexing.html). Zarr-Python will maintain these in 3.0.
+
+We are also exploring a new high-level indexing API that will enable optimized batch/concurrent loading of many chunks. We expect this to be important for performant loading of data in the context of sharding. See [this discussion](https://github.com/zarr-developers/zarr-python/discussions/1569) for more detail.
+
+Concurrent indexing across multiple arrays will be possible using the AsyncArray API.
+
+**Async and Sync Array APIs**
+
+Most of the logic to support Zarr Arrays will live in the `AsyncArray` class. There are a few notable differences that should be called out.
+
+| Sync Method | Async Method |
+| --- | --- |
+| `__getitem__` | `getitem` |
+| `__setitem__` | `setitem` |
+| `__eq__` | `equals` |
+
+**Metadata interface**
+
+Zarr-Python 2.* closely mirrors the V2 spec metadata schema in the Array and Group classes. In 3.0, we plan to move the underlying metadata representation to a separate interface (e.g. `Array.metadata`). This interface will return either a `V2ArrayMetadata` or `V3ArrayMetadata` object (both will inherit from a parent `ArrayMetadataABC` class). The `V2ArrayMetadata` and `V3ArrayMetadata` classes will be responsible for producing valid JSON representations of their metadata, and yielding a consistent view to the `Array` or `Group` class.
+
+### Group API
+
+The main question is how closely we should follow the existing Zarr-Python implementation / `MutableMapping` interface. The table below shows the primary `Group` methods in Zarr-Python 2 and attempts to identify if and how they would be implemented in 3.0.
+
+| V2 Group Methods | `AsyncGroup` | `Group` | `h5py_compat.Group` |
+| --- | --- | --- | --- |
+| `__len__` | `length` | `__len__` | `__len__` |
+| `__iter__` | `__aiter__` | `__iter__` | `__iter__` |
+| `__contains__` | `contains` | `__contains__` | `__contains__` |
+| `__getitem__` | `getitem` | `__getitem__` | `__getitem__` |
+| `__enter__` | N/A | N/A | `__enter__` |
+| `__exit__` | N/A | N/A | `__exit__` |
+| `group_keys` | `group_keys` | `group_keys` | N/A |
+| `groups` | `groups` | `groups` | N/A |
+| `array_keys` | `array_keys` | `array_keys` | N/A |
+| `arrays` | `arrays`* | `arrays` | N/A |
+| `visit` | ? | ? | `visit` |
+| `visitkeys` | ? | ? | ? |
+| `visitvalues` | ? | ? | ? |
+| `visititems` | ? | ? | `visititems` |
+| `tree` | `tree` | `tree` | `Both` |
+| `create_group` | `create_group` | `create_group` | `create_group` |
+| `require_group` | N/A | N/A | `require_group` |
+| `create_groups` | ? | ? | N/A |
+| `require_groups` | ? | ? | ? |
+| `create_dataset` | N/A | N/A | `create_dataset` |
+| `require_dataset` | N/A | N/A | `require_dataset` |
+| `create` | `create_array` | `create_array` | N/A |
+| `empty` | `empty` | `empty` | N/A |
+| `zeros` | `zeros` | `zeros` | N/A |
+| `ones` | `ones` | `ones` | N/A |
+| `full` | `full` | `full` | N/A |
+| `array` | `create_array` | `create_array` | N/A |
+| `empty_like` | `empty_like` | `empty_like` | N/A |
+| `zeros_like` | `zeros_like` | `zeros_like` | N/A |
+| `ones_like` | `ones_like` | `ones_like` | N/A |
+| `full_like` | `full_like` | `full_like` | N/A |
+| `move` | `move` | `move` | `move` |
+
+**`zarr.h5py_compat.Group`**
+
+Zarr-Python 2.* made an attempt to align its API with that of [h5py](https://docs.h5py.org/en/stable/index.html). With 3.0, we will relax this alignment in favor of providing an explicit compatibility module (`zarr.h5py_compat`). This module will expose the `Group` and `Dataset` APIs that map to Zarr-Python’s `Group` and `Array` objects.
+
+### Creation API
+
+Zarr-Python 2.* bundles together the creation and serialization of Zarr objects. Zarr-Python 3.* will make it possible to create objects in memory separately from serializing them. This will specifically enable writing hierarchies of Zarr objects in a single batch step. For example:
+
+```python
+arr1 = Array(shape=(10, 10), path="foo/bar", dtype="i4", store=store)
+arr2 = Array(shape=(10, 10), path="foo/spam", dtype="f8", store=store)
+
+arr1.save()
+arr2.save()
+
+# or equivalently
+
+zarr.save_many([arr1, arr2])
+```
+
+*Note: this batch creation API likely needs additional design effort prior to implementation.*
+
+### Plugin API
+
+Zarr V3 was designed to be extensible at multiple layers. Zarr-Python will support these extensions through a combination of [Abstract Base Classes](https://docs.python.org/3/library/abc.html) (ABCs) and [Entrypoints](https://packaging.python.org/en/latest/specifications/entry-points/).
+
+**ABCs**
+
+Zarr V3 will expose abstract base classes for the following objects:
+
+- `Store`, `ReadStore`, `ReadWriteStore`, `ReadListStore`, and `ReadWriteListStore`
+- `BaseArray`, `SynchronousArray`, and `AsynchronousArray`
+- `BaseGroup`, `SynchronousGroup`, and `AsynchronousGroup`
+- `Codec`, `ArrayArrayCodec`, `ArrayBytesCodec`, `BytesBytesCodec`
+
+**Entrypoints**
+
+There is lots more thinking to do here, but the idea is to provide entrypoints for `data type`, `chunk grid`, `chunk key encoding`, `codecs`, `storage_transformers` and `stores`. These might look something like:
+
+```
+entry_points="""
+    [zarr.codecs]
+    blosc_codec=codec_plugin:make_blosc_codec
+    zlib_codec=codec_plugin:make_zlib_codec
+"""
+```
+
+### Python type hints and static analysis
+
+Target 100% Mypy coverage in 3.0 source.
+
+### Observability
+
+A persistent problem in Zarr-Python is diagnosing problems that span many parts of the stack. To address this in 3.0, we will add a basic logging framework that can be used to debug behavior at various levels of the stack. We propose to add separate loggers for the following namespaces:
+
+- `array`
+- `group`
+- `store`
+- `codec`
+
+These should be documented such that users know how to activate them and developers know how to use them when developing extensions.
+
+### Dependencies
+
+Today, Zarr-Python has the following required dependencies:
+
+```python
+dependencies = [
+    'asciitree',
+    'numpy>=1.20,!=1.21.0',
+    'fasteners',
+    'numcodecs>=0.10.0',
+]
+```
+
+What other dependencies should be considered?
+
+1. Attrs - Zarrita makes extensive use of the Attrs library
+2. Fsspec - Zarrita has a hard dependency on Fsspec. This could be easily relaxed though.
+
+## Breaking changes relative to Zarr-Python 2.*
+
+1. H5py compat moved to a standalone module?
+2. `Group.__getitem__` support moved to `Group.members.__getitem__`?
+3. Others?
+
+## Open questions
+
+1. How to treat V2?
+   a. Note: Zarrita currently implements separate `V2Array` and `V3Array` classes. This feels less than ideal.
+   b. We could easily convert metadata from V2 to the V3 Array, but what about writing?
+   c. Ideally, we don’t have completely separate code paths. But if it's too complicated to support both within one interface, separate paths are probably better.
+2. How and when to remove the current implementation of V3?
+   a. It's hidden behind a hard-to-use feature flag, so we probably don't need to do anything.
+3. How to model runtime configuration?
+4. Which extensions belong in Zarr-Python and which belong in separate packages?
+   a. We don't need to take a strong position on this here. It's likely that someone will want to put sharding in. That will be useful to develop in parallel because it will give us a good test case for the plugin interface.
+
+## Testing
+
+Zarr-Python 3.0 adds a major new dimension to Zarr: async support. This also comes with a compatibility risk; we will need to thoroughly test support in key execution environments. Testing plan:
+- Reuse the existing test suite for testing the `v3` API.
+  - `xfail` tests that expose breaking changes with a `3.0 - breaking change` description. This will help identify additional and/or unintentional breaking changes.
+  - Rework tests that were only testing internal APIs.
+- Add a set of functional / integration tests targeting real-world workflows in various contexts (e.g. w/ Dask)
+
+## Development process
+
+Zarr-Python 3.0 will introduce a number of new APIs and breaking changes to existing APIs. In order to facilitate ongoing support for Zarr-Python 2.*, we will take on the following development process:
+
+- Create a `v3` branch that can be used for developing the core functionality apart from the `main` branch. This will allow us to support ongoing work and bug fixes on the `main` branch.
+- Put the `3.0` APIs inside a `zarr.v3` module. Imports from this namespace will all be new APIs that users can develop and test against once the `v3` branch is merged to `main`.
+- Kickstart the process by pulling in the current state of `zarrita`, which has many of the features described in this design.
+- Release a series of 2.* releases with the `v3` namespace
+- When `v3` is complete, move contents of `v3` to the package root
+
+**Milestones**
+
+Below are a set of specific milestones leading toward the completion of this process. As work begins, we expect this list to grow in specificity.
+
+1. Port the current version of Zarrita to Zarr-Python
+2. Formalize the async interface by splitting `Array` and `Group` objects into sync and async versions
+3. Implement "fancy" indexing operations on the `AsyncArray`
+4. Implement an abstract base class for the `Store` interface and a wrapper `Store` to make use of existing `MutableMapping` stores
+5. Rework the existing unit test suite to use the `v3` namespace
+6. Develop a plugin interface for extensions
+7. Develop a set of functional and integration tests
+8. Work with downstream libraries (Xarray, Dask, etc.) to test new APIs
+
+## TODOs
+
+The following subjects are not covered in detail above but perhaps should be.
Including them here so they are not forgotten. + +1. [Store] Should Zarr provide an API for caching objects after first read/list/etc. Read only stores? +2. [Array] buffer protocol support +3. [Array] `meta_array` support +4. [Extensions] Define how Zarr-Python will consume the various plugin types +5. [Misc] H5py compatibility requires a bit more work and a champion to drive it forward. +6. [Misc] Define `chunk_store` API in 3.0 +7. [Misc] Define `synchronizer` API in 3.0 + +## References + +1. [Zarr-Python repository](https://github.com/zarr-developers/zarr-python) +2. [Zarr core specification (version 3.0) — Zarr specs documentation](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#) +3. [Zarrita repository](https://github.com/scalableminds/zarrita) +4. [Async-Zarr](https://github.com/martindurant/async-zarr) +5. [Zarr-Python Discussion Topic](https://github.com/zarr-developers/zarr-python/discussions/1569) diff --git a/zarr/_storage/absstore.py b/zarr/_storage/absstore.py index f62529f096..1e49754f38 100644 --- a/zarr/_storage/absstore.py +++ b/zarr/_storage/absstore.py @@ -1,9 +1,19 @@ """This module contains storage classes related to Azure Blob Storage (ABS)""" +from typing import Optional import warnings + from numcodecs.compat import ensure_bytes from zarr.util import normalize_storage_path -from zarr._storage.store import _get_metadata_suffix, data_root, meta_root, Store, StoreV3 +from zarr._storage.store import ( + _get_metadata_suffix, + data_root, + meta_root, + Store, + StoreV3, + V3_DEPRECATION_MESSAGE, +) +from zarr.types import DIMENSION_SEPARATOR __doctest_requires__ = { ("ABSStore", "ABSStore.*"): ["azure.storage.blob"], @@ -67,9 +77,15 @@ def __init__( account_name=None, account_key=None, blob_service_kwargs=None, - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, client=None, ): + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=3, + ) + self._dimension_separator = dimension_separator self.prefix = normalize_storage_path(prefix) if client is None: @@ -84,10 +100,10 @@ def __init__( blob_service_kwargs = blob_service_kwargs or {} client = ContainerClient( - "https://{}.blob.core.windows.net/".format(account_name), + f"https://{account_name}.blob.core.windows.net/", container, credential=account_key, - **blob_service_kwargs + **blob_service_kwargs, ) self.client = client @@ -140,8 +156,8 @@ def __getitem__(self, key): blob_name = self._append_path_to_prefix(key) try: return self.client.download_blob(blob_name).readall() - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % blob_name) + except ResourceNotFoundError as e: + raise KeyError(f"Blob {blob_name} not found") from e def __setitem__(self, key, value): value = ensure_bytes(value) @@ -153,8 +169,8 @@ def __delitem__(self, key): try: self.client.delete_blob(self._append_path_to_prefix(key)) - except ResourceNotFoundError: - raise KeyError("Blob %s not found" % key) + except ResourceNotFoundError as e: + raise KeyError(f"Blob {key} not found") from e def __eq__(self, other): return ( @@ -240,7 +256,6 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def rmdir(self, path=None): - if not path: # Currently allowing clear to delete everything as in v2 diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 8daedae48f..dba29d13c0 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,5 +1,6 @@ import abc import os +import warnings from collections 
import defaultdict from collections.abc import MutableMapping from copy import copy @@ -9,6 +10,7 @@ from zarr.meta import Metadata2, Metadata3 from zarr.util import normalize_storage_path from zarr.context import Context +from zarr.types import ZARR_VERSION # v2 store keys array_meta_key = ".zarray" @@ -19,12 +21,29 @@ meta_root = "meta/root/" data_root = "data/root/" -DEFAULT_ZARR_VERSION = 2 +DEFAULT_ZARR_VERSION: ZARR_VERSION = 2 v3_api_available = os.environ.get("ZARR_V3_EXPERIMENTAL_API", "0").lower() not in ["0", "false"] +_has_warned_about_v3 = False # to avoid printing the warning multiple times + +V3_DEPRECATION_MESSAGE = ( + "The {store} is deprecated and will be removed in a Zarr-Python version 3, see " + "https://github.com/zarr-developers/zarr-python/issues/1274 for more information." +) def assert_zarr_v3_api_available(): + # we issue a warning about the experimental v3 implementation when it is first used + global _has_warned_about_v3 + if v3_api_available and not _has_warned_about_v3: + warnings.warn( + "The experimental Zarr V3 implementation in this version of Zarr-Python is not " + "in alignment with the final V3 specification. This version will be removed in " + "Zarr-Python 3 in favor of a spec compliant version.", + FutureWarning, + stacklevel=1, + ) + _has_warned_about_v3 = True if not v3_api_available: raise NotImplementedError( "# V3 reading and writing is experimental! To enable support, set:\n" @@ -221,13 +240,12 @@ def _validate_key(self, key: str): ) if ( - not key.startswith("data/") - and (not key.startswith("meta/")) - and (not key == "zarr.json") + not key.startswith(("data/", "meta/")) + and key != "zarr.json" # TODO: Possibly allow key == ".zmetadata" too if we write a # consolidated metadata spec corresponding to this? 
): - raise ValueError("keys starts with unexpected value: `{}`".format(key)) + raise ValueError(f"key starts with unexpected value: `{key}`") if key.endswith("/"): raise ValueError("keys may not end in /") @@ -462,7 +480,7 @@ def inner_store(self) -> Union["StorageTransformer", StoreV3]: def __eq__(self, other): return ( - type(self) == type(other) + type(self) is type(other) and self._inner_store == other._inner_store and self.get_config() == other.get_config() ) @@ -629,7 +647,6 @@ def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None: def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: - meta_dir = meta_root + path meta_dir = meta_dir.rstrip("/") _rmdir_from_keys(store, meta_dir) @@ -643,10 +660,10 @@ def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None: sfx = _get_metadata_suffix(store) array_meta_file = meta_dir + ".array" + sfx if array_meta_file in store: - store.erase(array_meta_file) # type: ignore + store.erase(array_meta_file) group_meta_file = meta_dir + ".group" + sfx if group_meta_file in store: - store.erase(group_meta_file) # type: ignore + store.erase(group_meta_file) def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str]: diff --git a/zarr/_storage/v3.py b/zarr/_storage/v3.py index 00dc085dac..4987f820cf 100644 --- a/zarr/_storage/v3.py +++ b/zarr/_storage/v3.py @@ -3,13 +3,14 @@ from collections import OrderedDict from collections.abc import MutableMapping from threading import Lock -from typing import Union, Dict, Any +from typing import Union, Dict, Any, Optional from zarr.errors import ( MetadataError, ReadOnlyError, ) from zarr.util import buffer_size, json_loads, normalize_storage_path +from zarr.types import DIMENSION_SEPARATOR from zarr._storage.absstore import ABSStoreV3 # noqa: F401 from zarr._storage.store import ( # noqa: F401 @@ -118,7 +119,6 @@ def _get_files_and_dirs_from_path(store, path): class FSStoreV3(FSStore, StoreV3): - # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) _META_KEYS = () @@ -225,7 +225,9 @@ def get_partial_values(self, key_ranges): class MemoryStoreV3(MemoryStore, StoreV3): - def __init__(self, root=None, cls=dict, dimension_separator=None): + def __init__( + self, root=None, cls=dict, dimension_separator: Optional[DIMENSION_SEPARATOR] = None + ): if root is None: self.root = cls() else: @@ -570,7 +572,7 @@ def __init__(self, store: StoreLike, metadata_key=meta_root + "consolidated/.zme consolidated_format = meta.get("zarr_consolidated_format", None) if consolidated_format != 1: raise MetadataError( - "unsupported zarr consolidated metadata format: %s" % consolidated_format + f"unsupported zarr consolidated metadata format: {consolidated_format}" ) # decode metadata diff --git a/zarr/_storage/v3_storage_transformers.py b/zarr/_storage/v3_storage_transformers.py index ff31a7281c..00467d44f9 100644 --- a/zarr/_storage/v3_storage_transformers.py +++ b/zarr/_storage/v3_storage_transformers.py @@ -8,6 +8,7 @@ from zarr._storage.store import StorageTransformer, StoreV3, _rmdir_from_keys_v3 from zarr.util import normalize_storage_path +from zarr.types import DIMENSION_SEPARATOR MAX_UINT_64 = 2**64 - 1 @@ -118,7 +119,7 @@ def _copy_for_array(self, array, inner_store): return transformer_copy @property - def dimension_separator(self) -> str: + def dimension_separator(self) -> DIMENSION_SEPARATOR: assert ( self._dimension_separator is not None ), "dimension_separator is not initialized, first get a copy via _copy_for_array." 
@@ -182,8 +183,8 @@ def __getitem__(self, key): shard_key, chunk_subkey = self._key_to_shard(key) try: full_shard_value = self.inner_store[shard_key] - except KeyError: - raise KeyError(key) + except KeyError as e: + raise KeyError(key) from e index = self._get_index_from_buffer(full_shard_value) chunk_slice = index.get_chunk_slice(chunk_subkey) if chunk_slice is not None: @@ -264,8 +265,8 @@ def __delitem__(self, key): shard_key, chunk_subkey = self._key_to_shard(key) try: index = self._get_index_from_store(shard_key) - except KeyError: - raise KeyError(key) + except KeyError as e: + raise KeyError(key) from e index.set_chunk_slice(chunk_subkey, None) @@ -351,7 +352,7 @@ def erase_prefix(self, prefix): def rmdir(self, path=None): path = normalize_storage_path(path) - _rmdir_from_keys_v3(self, path) # type: ignore + _rmdir_from_keys_v3(self, path) def __contains__(self, key): if self._is_data_key(key): diff --git a/zarr/attrs.py b/zarr/attrs.py index 01fc617b3c..af9a5f1d30 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -25,15 +25,16 @@ class Attributes(MutableMapping): """ - def __init__(self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None): - + def __init__( + self, store, key=".zattrs", read_only=False, cache=True, synchronizer=None, cached_dict=None + ): self._version = getattr(store, "_store_version", 2) _Store = Store if self._version == 2 else StoreV3 self.store = _Store._ensure_store(store) self.key = key self.read_only = read_only self.cache = cache - self._cached_asdict = None + self._cached_asdict = cached_dict if cache else None self.synchronizer = synchronizer def _get_nosync(self): @@ -73,7 +74,6 @@ def __getitem__(self, item): return self.asdict()[item] def _write_op(self, f, *args, **kwargs): - # guard condition if self.read_only: raise PermissionError("attributes are read-only") @@ -89,7 +89,6 @@ def __setitem__(self, item, value): self._write_op(self._setitem_nosync, item, value) def _setitem_nosync(self, item, value): - # load existing data d = self._get_nosync() @@ -106,7 +105,6 @@ def __delitem__(self, item): self._write_op(self._delitem_nosync, item) def _delitem_nosync(self, key): - # load existing data d = self._get_nosync() @@ -128,7 +126,6 @@ def put(self, d): self._write_op(self._put_nosync, dict(attributes=d)) def _put_nosync(self, d): - d_to_check = d if self._version == 2 else d["attributes"] if not all(isinstance(item, str) for item in d_to_check): # TODO: Raise an error for non-string keys @@ -154,19 +151,20 @@ def _put_nosync(self, d): if self.cache: self._cached_asdict = d else: - if self.key in self.store: + try: + meta_unparsed = self.store[self.key] # Cannot write the attributes directly to JSON, but have to # store it within the pre-existing attributes key of the v3 # metadata. # Note: this changes the store.counter result in test_caching_on! 
- meta = self.store._metadata_class.parse_metadata(self.store[self.key]) + meta = self.store._metadata_class.parse_metadata(meta_unparsed) if "attributes" in meta and "filters" in meta["attributes"]: # need to preserve any existing "filters" attribute d["attributes"]["filters"] = meta["attributes"]["filters"] meta["attributes"] = d["attributes"] - else: + except KeyError: meta = d self.store[self.key] = json_dumps(meta) if self.cache: @@ -178,7 +176,6 @@ def update(self, *args, **kwargs): self._write_op(self._update_nosync, *args, **kwargs) def _update_nosync(self, *args, **kwargs): - # load existing data d = self._get_nosync() diff --git a/zarr/convenience.py b/zarr/convenience.py index 0ee8a8d323..bd284e0844 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1,4 +1,5 @@ """Convenience functions for storing and loading data.""" + import itertools import os import re @@ -54,6 +55,11 @@ def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=No The zarr protocol version to use. The default value of None will attempt to infer the version from `store` if possible, otherwise it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + path : str or None, optional The path within the store to open. **kwargs @@ -149,6 +155,11 @@ def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs) The zarr protocol version to use when saving. The default value of None will attempt to infer the version from `store` if possible, otherwise it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + path : str or None, optional The path within the store where the array will be saved. kwargs @@ -199,6 +210,11 @@ def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): The zarr protocol version to use when saving. The default value of None will attempt to infer the version from `store` if possible, otherwise it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + path : str or None, optional Path within the store where the group will be saved. kwargs @@ -258,7 +274,7 @@ def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): try: grp = _create_group(_store, path=path, overwrite=True, zarr_version=zarr_version) for i, arr in enumerate(args): - k = "arr_{}".format(i) + k = f"arr_{i}" grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) for k, arr in kwargs.items(): grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) @@ -281,6 +297,11 @@ def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): The zarr protocol version to use when saving. The default value of None will attempt to infer the version from `store` if possible, otherwise it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. 
This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + path : str or None, optional The path within the group where the arrays will be saved. kwargs @@ -394,6 +415,11 @@ def load(store: StoreLike, zarr_version=None, path=None): The zarr protocol version to use when loading. The default value of None will attempt to infer the version from `store` if possible, otherwise it will fall back to 2. + + .. warning:: `zarr_version=3` is currently using the experimental Zarr V3 + implementation. This implementation is not in sync with the final specification + and will be replaced with a spec compliant version in the version 3.0. + path : str or None, optional The path within the store from which to load. @@ -498,7 +524,7 @@ def __init__(self, log): self.log_file = log else: raise TypeError( - "log must be a callable function, file path or " "file-like object, found %r" % log + f"log must be a callable function, file path or file-like object, found {log!r}" ) def __enter__(self): @@ -525,9 +551,9 @@ def _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied): message = "dry run: " else: message = "all done: " - message += "{:,} copied, {:,} skipped".format(n_copied, n_skipped) + message += f"{n_copied:,} copied, {n_skipped:,} skipped" if not dry_run: - message += ", {:,} bytes copied".format(n_bytes_copied) + message += f", {n_bytes_copied:,} bytes copied" log(message) @@ -656,9 +682,7 @@ def copy_store( # check if_exists parameter valid_if_exists = ["raise", "replace", "skip"] if if_exists not in valid_if_exists: - raise ValueError( - "if_exists must be one of {!r}; found {!r}".format(valid_if_exists, if_exists) - ) + raise ValueError(f"if_exists must be one of {valid_if_exists!r}; found {if_exists!r}") # setup counting variables n_copied = n_skipped = n_bytes_copied = 0 @@ -675,10 +699,8 @@ def copy_store( # setup logging with _LogWriter(log) as log: - # iterate over source keys for source_key in sorted(source.keys()): - # filter to keys under source path if source_store_version == 2: if not source_key.startswith(source_path): @@ -721,20 +743,20 @@ def copy_store( if if_exists != "replace": if dest_key in dest: if if_exists == "raise": - raise CopyError("key {!r} exists in destination".format(dest_key)) + raise CopyError(f"key {dest_key!r} exists in destination") elif if_exists == "skip": do_copy = False # take action if do_copy: - log("copy {}".format(descr)) + log(f"copy {descr}") if not dry_run: data = source[source_key] n_bytes_copied += buffer_size(data) dest[dest_key] = data n_copied += 1 else: - log("skip {}".format(descr)) + log(f"skip {descr}") n_skipped += 1 # log a final message with a summary of what happened @@ -745,7 +767,7 @@ def copy_store( def _check_dest_is_group(dest): if not hasattr(dest, "create_dataset"): - raise ValueError("dest must be a group, got {!r}".format(dest)) + raise ValueError(f"dest must be a group, got {dest!r}") def copy( @@ -757,7 +779,7 @@ def copy( log=None, if_exists="raise", dry_run=False, - **create_kws + **create_kws, ): """Copy the `source` array or group into the `dest` group. 
@@ -878,7 +900,6 @@ def copy( # setup logging with _LogWriter(log) as log: - # do the copying n_copied, n_skipped, n_bytes_copied = _copy( log, @@ -890,7 +911,7 @@ def copy( without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) # log a final message with a summary of what happened @@ -912,11 +933,9 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ # check if_exists parameter valid_if_exists = ["raise", "replace", "skip", "skip_initialized"] if if_exists not in valid_if_exists: - raise ValueError( - "if_exists must be one of {!r}; found {!r}".format(valid_if_exists, if_exists) - ) + raise ValueError(f"if_exists must be one of {valid_if_exists!r}; found {if_exists!r}") if dest_h5py and if_exists == "skip_initialized": - raise ValueError("{!r} can only be used when copying to zarr".format(if_exists)) + raise ValueError(f"{if_exists!r} can only be used when copying to zarr") # determine name to copy to if name is None: @@ -936,9 +955,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ exists = dest is not None and name in dest if exists: if if_exists == "raise": - raise CopyError( - "an object {!r} already exists in destination " "{!r}".format(name, dest.name) - ) + raise CopyError(f"an object {name!r} already exists in destination {dest.name!r}") elif if_exists == "skip": do_copy = False elif if_exists == "skip_initialized": @@ -948,12 +965,10 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ # take action if do_copy: - # log a message about what we're going to do - log("copy {} {} {}".format(source.name, source.shape, source.dtype)) + log(f"copy {source.name} {source.shape} {source.dtype}") if not dry_run: - # clear the way if exists: del dest[name] @@ -1019,7 +1034,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ n_copied += 1 else: - log("skip {} {} {}".format(source.name, source.shape, source.dtype)) + log(f"skip {source.name} {source.shape} {source.dtype}") n_skipped += 1 elif root or not shallow: @@ -1030,20 +1045,16 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ exists_array = dest is not None and name in dest and hasattr(dest[name], "shape") if exists_array: if if_exists == "raise": - raise CopyError( - "an array {!r} already exists in destination " "{!r}".format(name, dest.name) - ) + raise CopyError(f"an array {name!r} already exists in destination {dest.name!r}") elif if_exists == "skip": do_copy = False # take action if do_copy: - # log action - log("copy {}".format(source.name)) + log(f"copy {source.name}") if not dry_run: - # clear the way if exists_array: del dest[name] @@ -1056,7 +1067,6 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ grp.attrs.update(source.attrs) else: - # setup for dry run without creating any groups in the # destination if dest is not None: @@ -1076,7 +1086,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) n_copied += c n_skipped += s @@ -1085,7 +1095,7 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, dry_ n_copied += 1 else: - log("skip {}".format(source.name)) + log(f"skip {source.name}") n_skipped += 1 return n_copied, n_skipped, n_bytes_copied @@ -1099,7 +1109,7 @@ def copy_all( log=None, if_exists="raise", dry_run=False, - **create_kws 
+ **create_kws, ): """Copy all children of the `source` group into the `dest` group. @@ -1189,7 +1199,6 @@ def copy_all( # setup logging with _LogWriter(log) as log: - for k in source.keys(): c, s, b = _copy( log, @@ -1201,7 +1210,7 @@ def copy_all( without_attrs=without_attrs, if_exists=if_exists, dry_run=dry_run, - **create_kws + **create_kws, ) n_copied += c n_skipped += s @@ -1262,7 +1271,6 @@ def is_zarr_key(key): return key.endswith(".zarray") or key.endswith(".zgroup") or key.endswith(".zattrs") else: - assert_zarr_v3_api_available() sfx = _get_metadata_suffix(store) # type: ignore @@ -1336,7 +1344,7 @@ def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", ** store, storage_options=kwargs.get("storage_options"), mode=mode, zarr_version=zarr_version ) if mode not in {"r", "r+"}: - raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}".format(mode)) + raise ValueError(f"invalid mode, expected either 'r' or 'r+'; found {mode!r}") path = kwargs.pop("path", None) if store._store_version == 2: diff --git a/zarr/core.py b/zarr/core.py index c07a31e95f..b1ccd203db 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -172,11 +172,15 @@ def __init__( # initialize attributes akey = _prefix_to_attrs_key(self._store, self._key_prefix) self._attrs = Attributes( - store, key=akey, read_only=read_only, synchronizer=synchronizer, cache=cache_attrs + store, + key=akey, + read_only=read_only, + synchronizer=synchronizer, + cache=cache_attrs, + cached_dict=self._meta["attributes"] if self._version == 3 else None, ) # initialize info reporter - self._info_reporter = InfoReporter(self) # initialize indexing helpers self._oindex = OIndex(self) @@ -196,8 +200,8 @@ def _load_metadata_nosync(self): try: mkey = _prefix_to_array_key(self._store, self._key_prefix) meta_bytes = self._store[mkey] - except KeyError: - raise ArrayNotFoundError(self._path) + except KeyError as e: + raise ArrayNotFoundError(self._path) from e else: # decode and store metadata as instance members meta = self._store._metadata_class.decode_array_metadata(meta_bytes) @@ -2026,7 +2030,9 @@ def _process_chunk( and not self._filters and self._dtype != object ): - dest = out[out_selection] + # For 0D arrays out_selection = () and out[out_selection] is a scalar + # Avoid that + dest = out[out_selection] if out_selection else out # Assume that array-like objects that doesn't have a # `writeable` flag is writable. 
dest_is_writable = getattr(dest, "writeable", True) @@ -2060,9 +2066,11 @@ def _process_chunk( index_selection = PartialChunkIterator(chunk_selection, self.chunks) for start, nitems, partial_out_selection in index_selection: expected_shape = [ - len(range(*partial_out_selection[i].indices(self.chunks[0] + 1))) - if i < len(partial_out_selection) - else dim + ( + len(range(*partial_out_selection[i].indices(self.chunks[0] + 1))) + if i < len(partial_out_selection) + else dim + ) for i, dim in enumerate(self.chunks) ] if isinstance(cdata, UncompressedPartialReadBufferV3): @@ -2394,11 +2402,11 @@ def _encode_chunk(self, chunk): def __repr__(self): t = type(self) - r = "<{}.{}".format(t.__module__, t.__name__) + r = f"<{t.__module__}.{t.__name__}" if self.name: - r += " %r" % self.name - r += " %s" % str(self.shape) - r += " %s" % self.dtype + r += f" {self.name!r}" + r += f" {str(self.shape)}" + r += f" {self.dtype}" if self._read_only: r += " read-only" r += ">" @@ -2427,18 +2435,18 @@ def info(self): Chunks initialized : 0/10 """ - return self._info_reporter + return InfoReporter(self) def info_items(self): return self._synchronized_op(self._info_items_nosync) def _info_items_nosync(self): def typestr(o): - return "{}.{}".format(type(o).__module__, type(o).__name__) + return f"{type(o).__module__}.{type(o).__name__}" def bytestr(n): if n > 2**10: - return "{} ({})".format(n, human_readable_size(n)) + return f"{n} ({human_readable_size(n)})" else: return str(n) @@ -2449,7 +2457,7 @@ def bytestr(n): items += [("Name", self.name)] items += [ ("Type", typestr(self)), - ("Data type", "%s" % self.dtype), + ("Data type", str(self.dtype)), ("Shape", str(self.shape)), ("Chunk shape", str(self.chunks)), ("Order", self.order), @@ -2459,7 +2467,7 @@ def bytestr(n): # filters if self.filters: for i, f in enumerate(self.filters): - items += [("Filter [%s]" % i, repr(f))] + items += [(f"Filter [{i}]", repr(f))] # compressor items += [("Compressor", repr(self.compressor))] @@ -2469,16 +2477,18 @@ def bytestr(n): items += [("Synchronizer type", typestr(self._synchronizer))] # storage info + nbytes = self.nbytes + nbytes_stored = self.nbytes_stored items += [("Store type", typestr(self._store))] if self._chunk_store is not None: items += [("Chunk store type", typestr(self._chunk_store))] - items += [("No. bytes", bytestr(self.nbytes))] - if self.nbytes_stored > 0: + items += [("No. bytes", bytestr(nbytes))] + if nbytes_stored > 0: items += [ - ("No. bytes stored", bytestr(self.nbytes_stored)), - ("Storage ratio", "%.1f" % (self.nbytes / self.nbytes_stored)), + ("No. bytes stored", bytestr(nbytes_stored)), + ("Storage ratio", f"{nbytes / nbytes_stored:.1f}"), ] - items += [("Chunks initialized", "{}/{}".format(self.nchunks_initialized, self.nchunks))] + items += [("Chunks initialized", f"{self.nchunks_initialized}/{self.nchunks}")] return items @@ -2536,7 +2546,7 @@ def hexdigest(self, hashname="sha1"): checksum = binascii.hexlify(self.digest(hashname=hashname)) # This is a bytes object on Python 3 and we want a str. 
- if type(checksum) is not str: + if not isinstance(checksum, str): checksum = checksum.decode("utf8") return checksum diff --git a/zarr/creation.py b/zarr/creation.py index 726d0b5932..9b2b1d6d4c 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -1,7 +1,10 @@ -from typing import Optional +from collections.abc import MutableMapping +from typing import Optional, Tuple, Union, Sequence from warnings import warn import numpy as np +import numpy.typing as npt +from numcodecs.abc import Codec from numcodecs.registry import codec_registry from zarr._storage.store import DEFAULT_ZARR_VERSION @@ -19,32 +22,35 @@ normalize_storage_path, normalize_store_arg, ) +from zarr._storage.store import StorageTransformer +from zarr.sync import Synchronizer +from zarr.types import ZARR_VERSION, DIMENSION_SEPARATOR, MEMORY_ORDER, MetaArray, PathLike from zarr.util import normalize_dimension_separator def create( - shape, - chunks=True, - dtype=None, + shape: Union[int, Tuple[int, ...]], + chunks: Union[int, Tuple[int, ...], bool] = True, + dtype: Optional[npt.DTypeLike] = None, compressor="default", fill_value: Optional[int] = 0, - order="C", - store=None, - synchronizer=None, - overwrite=False, - path=None, - chunk_store=None, - filters=None, - cache_metadata=True, - cache_attrs=True, - read_only=False, - object_codec=None, - dimension_separator=None, - write_empty_chunks=True, + order: MEMORY_ORDER = "C", + store: Optional[Union[str, MutableMapping]] = None, + synchronizer: Optional[Synchronizer] = None, + overwrite: bool = False, + path: Optional[PathLike] = None, + chunk_store: Optional[MutableMapping] = None, + filters: Optional[Sequence[Codec]] = None, + cache_metadata: bool = True, + cache_attrs: bool = True, + read_only: bool = False, + object_codec: Optional[Codec] = None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + write_empty_chunks: bool = True, *, - zarr_version=None, - meta_array=None, - storage_transformers=(), + zarr_version: Optional[ZARR_VERSION] = None, + meta_array: Optional[MetaArray] = None, + storage_transformers: Sequence[StorageTransformer] = (), **kwargs, ): """Create an array. 
@@ -234,7 +240,6 @@ def create( def _kwargs_compat(compressor, fill_value, kwargs): - # to be compatible with h5py, as well as backwards-compatible with Zarr # 1.x, accept 'compression' and 'compression_opts' keyword arguments @@ -282,7 +287,7 @@ def _kwargs_compat(compressor, fill_value, kwargs): compressor = compression else: - raise ValueError("bad value for compression: %r" % compression) + raise ValueError(f"bad value for compression: {compression!r}") # handle 'fillvalue' if "fillvalue" in kwargs: @@ -292,7 +297,7 @@ def _kwargs_compat(compressor, fill_value, kwargs): # ignore other keyword arguments for k in kwargs: - warn("ignoring keyword argument %r" % k) + warn(f"ignoring keyword argument {k!r}", stacklevel=2) return compressor, fill_value @@ -465,7 +470,7 @@ def open_array( write_empty_chunks=True, *, zarr_version=None, - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, meta_array=None, **kwargs, ): @@ -697,7 +702,6 @@ def open_array( def _like_args(a, kwargs): - shape, chunks = _get_shape_chunks(a) if shape is not None: kwargs.setdefault("shape", shape) diff --git a/zarr/errors.py b/zarr/errors.py index 30c9b13d39..85789fbcbf 100644 --- a/zarr/errors.py +++ b/zarr/errors.py @@ -67,9 +67,7 @@ def __init__(self): def err_too_many_indices(selection, shape): - raise IndexError( - "too many indices for array; expected {}, got {}".format(len(shape), len(selection)) - ) + raise IndexError(f"too many indices for array; expected {len(shape)}, got {len(selection)}") class VindexInvalidSelectionError(_BaseZarrIndexError): diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 3361969f08..8894a5ed57 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -27,6 +27,7 @@ from zarr.errors import ( ContainsArrayError, ContainsGroupError, + ArrayNotFoundError, GroupNotFoundError, ReadOnlyError, ) @@ -145,7 +146,7 @@ def __init__( synchronizer=None, zarr_version=None, *, - meta_array=None + meta_array=None, ): store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) if zarr_version is None: @@ -186,16 +187,16 @@ def __init__( mkey = _prefix_to_group_key(self._store, self._key_prefix) assert not mkey.endswith("root/.group") meta_bytes = store[mkey] - except KeyError: + except KeyError as e: if self._version == 2: - raise GroupNotFoundError(path) + raise GroupNotFoundError(path) from e else: implicit_prefix = meta_root + self._key_prefix if self._store.list_prefix(implicit_prefix): # implicit group does not have any metadata self._meta = None else: - raise GroupNotFoundError(path) + raise GroupNotFoundError(path) from e else: self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) @@ -207,11 +208,15 @@ def __init__( # object can still be created. 
akey = mkey self._attrs = Attributes( - store, key=akey, read_only=read_only, cache=cache_attrs, synchronizer=synchronizer + store, + key=akey, + read_only=read_only, + cache=cache_attrs, + synchronizer=synchronizer, + cached_dict=self._meta["attributes"] if self._version == 3 and self._meta else None, ) # setup info - self._info = InfoReporter(self) @property def store(self): @@ -266,7 +271,7 @@ def attrs(self): @property def info(self): """Return diagnostic information about the group.""" - return self._info + return InfoReporter(self) @property def meta_array(self): @@ -340,9 +345,9 @@ def __len__(self): def __repr__(self): t = type(self) - r = "<{}.{}".format(t.__module__, t.__name__) + r = f"<{t.__module__}.{t.__name__}" if self.name: - r += " %r" % self.name + r += f" {self.name!r}" if self._read_only: r += " read-only" r += ">" @@ -358,7 +363,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def info_items(self): def typestr(o): - return "{}.{}".format(type(o).__module__, type(o).__name__) + return f"{type(o).__module__}.{type(o).__name__}" items = [] @@ -458,7 +463,7 @@ def __getitem__(self, item): """ path = self._item_path(item) - if contains_array(self._store, path): + try: return Array( self._store, read_only=self._read_only, @@ -469,7 +474,10 @@ def __getitem__(self, item): zarr_version=self._version, meta_array=self._meta_array, ) - elif contains_group(self._store, path, explicit_only=True): + except ArrayNotFoundError: + pass + + try: return Group( self._store, read_only=self._read_only, @@ -480,7 +488,10 @@ def __getitem__(self, item): zarr_version=self._version, meta_array=self._meta_array, ) - elif self._version == 3: + except GroupNotFoundError: + pass + + if self._version == 3: implicit_group = meta_root + path + "/" # non-empty folder in the metadata path implies an implicit group if self._store.list_prefix(implicit_group): @@ -515,11 +526,18 @@ def _delitem_nosync(self, item): raise KeyError(item) def __getattr__(self, item): + # https://github.com/jupyter/notebook/issues/2014 + # Save a possibly expensive lookup (for e.g. against cloud stores) + # Note: The _ipython_display_ method is required to display the right info as a side-effect. + # It is simpler to pretend it doesn't exist. 
+ if item in ["_ipython_canary_method_should_not_exist_", "_ipython_display_"]: + raise AttributeError + # allow access to group members via dot notation try: return self.__getitem__(item) - except KeyError: - raise AttributeError + except KeyError as e: + raise AttributeError from e def __dir__(self): # noinspection PyUnresolvedReferences @@ -919,7 +937,6 @@ def tree(self, expand=False, level=None): return TreeViewer(self, expand=expand, level=level) def _write_op(self, f, *args, **kwargs): - # guard condition if self._read_only: raise ReadOnlyError() @@ -1094,7 +1111,6 @@ def create_dataset(self, name, **kwargs): return self._write_op(self._create_dataset_nosync, name, **kwargs) def _create_dataset_nosync(self, name, data=None, **kwargs): - assert "mode" not in kwargs path = self._item_path(name) @@ -1138,11 +1154,9 @@ def require_dataset(self, name, shape, dtype=None, exact=False, **kwargs): ) def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, **kwargs): - path = self._item_path(name) if contains_array(self._store, path): - # array already exists at path, validate that it is the right shape and type synchronizer = kwargs.get("synchronizer", self._synchronizer) @@ -1161,17 +1175,15 @@ def _require_dataset_nosync(self, name, shape, dtype=None, exact=False, **kwargs shape = normalize_shape(shape) if shape != a.shape: raise TypeError( - "shape do not match existing array; expected {}, got {}".format(a.shape, shape) + f"shape do not match existing array; expected {a.shape}, got {shape}" ) dtype = np.dtype(dtype) if exact: if dtype != a.dtype: - raise TypeError( - "dtypes do not match exactly; expected {}, got {}".format(a.dtype, dtype) - ) + raise TypeError(f"dtypes do not match exactly; expected {a.dtype}, got {dtype}") else: if not np.can_cast(dtype, a.dtype): - raise TypeError("dtypes ({}, {}) cannot be safely cast".format(dtype, a.dtype)) + raise TypeError(f"dtypes ({dtype}, {a.dtype}) cannot be safely cast") return a else: @@ -1235,7 +1247,7 @@ def _full_nosync(self, name, fill_value, **kwargs): path=path, chunk_store=self._chunk_store, fill_value=fill_value, - **kwargs + **kwargs, ) def array(self, name, data, **kwargs): @@ -1337,6 +1349,40 @@ def move(self, source, dest): self._write_op(self._move_nosync, source, dest) + # Override ipython repr methods, GH1716 + # https://ipython.readthedocs.io/en/stable/config/integrating.html#custom-methods + # " If the methods don’t exist, the standard repr() is used. If a method exists and + # returns None, it is treated the same as if it does not exist." + def _repr_html_(self): + return None + + def _repr_latex_(self): + return None + + def _repr_mimebundle_(self, **kwargs): + return None + + def _repr_svg_(self): + return None + + def _repr_png_(self): + return None + + def _repr_jpeg_(self): + return None + + def _repr_markdown_(self): + return None + + def _repr_javascript_(self): + return None + + def _repr_pdf_(self): + return None + + def _repr_json_(self): + return None + def _normalize_store_arg(store, *, storage_options=None, mode="r", zarr_version=None): if zarr_version is None: @@ -1361,7 +1407,7 @@ def group( path=None, *, zarr_version=None, - meta_array=None + meta_array=None, ): """Create a group. @@ -1452,7 +1498,7 @@ def open_group( storage_options=None, *, zarr_version=None, - meta_array=None + meta_array=None, ): """Open a group using file-mode-like semantics. 
diff --git a/zarr/indexing.py b/zarr/indexing.py
index 487cc8b9d9..35c1e813b1 100644
--- a/zarr/indexing.py
+++ b/zarr/indexing.py
@@ -52,6 +52,8 @@ def is_scalar(value, dtype):
         return True
     if isinstance(value, tuple) and dtype.names and len(value) == len(dtype.names):
         return True
+    if dtype.kind == "O" and not isinstance(value, np.ndarray):
+        return True
     return False
@@ -111,7 +113,6 @@ def is_pure_orthogonal_indexing(selection, ndim):

 def normalize_integer_selection(dim_sel, dim_len):
-
     # normalize type to int
     dim_sel = int(dim_sel)
@@ -145,7 +146,6 @@ def normalize_integer_selection(dim_sel, dim_len):

 class IntDimIndexer:
     def __init__(self, dim_sel, dim_len, dim_chunk_len):
-
         # normalize
         dim_sel = normalize_integer_selection(dim_sel, dim_len)
@@ -169,7 +169,6 @@ def ceildiv(a, b):

 class SliceDimIndexer:
     def __init__(self, dim_sel, dim_len, dim_chunk_len):
-
         # normalize
         self.start, self.stop, self.step = dim_sel.indices(dim_len)
         if self.step < 1:
@@ -182,14 +181,12 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len):
         self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len)

     def __iter__(self):
-
         # figure out the range of chunks we need to visit
         dim_chunk_ix_from = self.start // self.dim_chunk_len
         dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len)

         # iterate over chunks in range
         for dim_chunk_ix in range(dim_chunk_ix_from, dim_chunk_ix_to):
-
             # compute offsets for chunk within overall array
             dim_offset = dim_chunk_ix * self.dim_chunk_len
             dim_limit = min(self.dim_len, (dim_chunk_ix + 1) * self.dim_chunk_len)
@@ -237,7 +234,6 @@ def check_selection_length(selection, shape):

 def replace_ellipsis(selection, shape):
-
     selection = ensure_tuple(selection)

     # count number of ellipsis present
@@ -330,14 +326,12 @@ def is_basic_selection(selection):
 # noinspection PyProtectedMember
 class BasicIndexer:
     def __init__(self, selection, array):
-
         # handle ellipsis
         selection = replace_ellipsis(selection, array._shape)

         # setup per-dimension indexers
         dim_indexers = []
         for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks):
-
             if is_integer(dim_sel):
                 dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len)
@@ -346,8 +340,8 @@ def __init__(self, selection, array):
             else:
                 raise IndexError(
-                    "unsupported selection item for basic indexing; "
-                    "expected integer or slice, got {!r}".format(type(dim_sel))
+                    f"unsupported selection item for basic indexing; "
+                    f"expected integer or slice, got {type(dim_sel)!r}"
                 )

             dim_indexers.append(dim_indexer)
@@ -358,7 +352,6 @@ def __init__(self, selection, array):
     def __iter__(self):
         for dim_projections in itertools.product(*self.dim_indexers):
-
             chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections)
             chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections)
             out_selection = tuple(
@@ -370,7 +363,6 @@ def __iter__(self):

 class BoolArrayDimIndexer:
     def __init__(self, dim_sel, dim_len, dim_chunk_len):
-
         # check number of dimensions
         if not is_bool_array(dim_sel, 1):
             raise IndexError(
@@ -380,8 +372,8 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len):
         # check shape
         if dim_sel.shape[0] != dim_len:
             raise IndexError(
-                "Boolean array has the wrong length for dimension; "
-                "expected {}, got {}".format(dim_len, dim_sel.shape[0])
+                f"Boolean array has the wrong length for dimension; "
+                f"expected {dim_len}, got {dim_sel.shape[0]}"
             )

         # store attributes
@@ -402,10 +394,8 @@ def __init__(self, dim_sel, dim_len, dim_chunk_len):
         self.dim_chunk_ixs = np.nonzero(self.chunk_nitems)[0]

     def __iter__(self):
-
         # iterate over chunks with at
least one item for dim_chunk_ix in self.dim_chunk_ixs: - # find region in chunk dim_offset = dim_chunk_ix * self.dim_chunk_len dim_chunk_sel = self.dim_sel[dim_offset : dim_offset + self.dim_chunk_len] @@ -472,7 +462,6 @@ def __init__( boundscheck=True, order=Order.UNKNOWN, ): - # ensure 1d array dim_sel = np.asanyarray(dim_sel) if not is_integer_array(dim_sel, 1): @@ -526,9 +515,7 @@ def __init__( self.chunk_nitems_cumsum = np.cumsum(self.chunk_nitems) def __iter__(self): - for dim_chunk_ix in self.dim_chunk_ixs: - # find region in output if dim_chunk_ix == 0: start = 0 @@ -560,11 +547,11 @@ def ix_(selection, shape): # replace slice and int as these are not supported by numpy.ix_ selection = [ - slice_to_range(dim_sel, dim_len) - if isinstance(dim_sel, slice) - else [dim_sel] - if is_integer(dim_sel) - else dim_sel + ( + slice_to_range(dim_sel, dim_len) + if isinstance(dim_sel, slice) + else [dim_sel] if is_integer(dim_sel) else dim_sel + ) for dim_sel, dim_len in zip(selection, shape) ] @@ -602,7 +589,6 @@ def oindex_set(a, selection, value): # noinspection PyProtectedMember class OrthogonalIndexer: def __init__(self, selection, array): - # handle ellipsis selection = replace_ellipsis(selection, array._shape) @@ -612,7 +598,6 @@ def __init__(self, selection, array): # setup per-dimension indexers dim_indexers = [] for dim_sel, dim_len, dim_chunk_len in zip(selection, array._shape, array._chunks): - if is_integer(dim_sel): dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len) @@ -627,9 +612,9 @@ def __init__(self, selection, array): else: raise IndexError( - "unsupported selection item for orthogonal indexing; " - "expected integer, slice, integer array or Boolean " - "array, got {!r}".format(type(dim_sel)) + f"unsupported selection item for orthogonal indexing; " + f"expected integer, slice, integer array or Boolean " + f"array, got {type(dim_sel)!r}" ) dim_indexers.append(dim_indexer) @@ -649,7 +634,6 @@ def __init__(self, selection, array): def __iter__(self): for dim_projections in itertools.product(*self.dim_indexers): - chunk_coords = tuple(p.dim_chunk_ix for p in dim_projections) chunk_selection = tuple(p.dim_chunk_sel for p in dim_projections) out_selection = tuple( @@ -658,7 +642,6 @@ def __iter__(self): # handle advanced indexing arrays orthogonally if self.is_advanced: - # N.B., numpy doesn't support orthogonal indexing directly as yet, # so need to work around via np.ix_. 
Also np.ix_ does not support a # mixture of arrays and slices or integers, so need to convert slices @@ -692,7 +675,6 @@ def __setitem__(self, selection, value): # noinspection PyProtectedMember class BlockIndexer: def __init__(self, selection, array): - # handle ellipsis selection = replace_ellipsis(selection, array._shape) @@ -718,8 +700,8 @@ def __init__(self, selection, array): if dim_sel.step not in {1, None}: raise IndexError( - "unsupported selection item for block indexing; " - "expected integer or slice with step=1, got {!r}".format(type(dim_sel)) + f"unsupported selection item for block indexing; " + f"expected integer or slice with step=1, got {type(dim_sel)!r}" ) # Can't reuse wraparound_indices because it expects a numpy array @@ -735,8 +717,8 @@ def __init__(self, selection, array): else: raise IndexError( - "unsupported selection item for block indexing; " - "expected integer or slice, got {!r}".format(type(dim_sel)) + f"unsupported selection item for block indexing; " + f"expected integer or slice, got {type(dim_sel)!r}" ) dim_indexer = SliceDimIndexer(slice_, dim_len, dim_chunk_size) @@ -794,7 +776,6 @@ def is_mask_selection(selection, array): # noinspection PyProtectedMember class CoordinateIndexer: def __init__(self, selection, array): - # some initial normalization selection = ensure_tuple(selection) selection = tuple([i] if is_integer(i) else i for i in selection) @@ -803,14 +784,13 @@ def __init__(self, selection, array): # validation if not is_coordinate_selection(selection, array): raise IndexError( - "invalid coordinate selection; expected one integer " - "(coordinate) array per dimension of the target array, " - "got {!r}".format(selection) + f"invalid coordinate selection; expected one integer " + f"(coordinate) array per dimension of the target array, " + f"got {selection!r}" ) # handle wraparound, boundscheck for dim_sel, dim_len in zip(selection, array.shape): - # handle wraparound wraparound_indices(dim_sel, dim_len) @@ -861,10 +841,8 @@ def __init__(self, selection, array): self.chunk_mixs = np.unravel_index(self.chunk_rixs, array._cdata_shape) def __iter__(self): - # iterate over chunks for i, chunk_rix in enumerate(self.chunk_rixs): - chunk_coords = tuple(m[i] for m in self.chunk_mixs) if chunk_rix == 0: start = 0 @@ -891,7 +869,6 @@ def __iter__(self): # noinspection PyProtectedMember class MaskIndexer(CoordinateIndexer): def __init__(self, selection, array): - # some initial normalization selection = ensure_tuple(selection) selection = replace_lists(selection) @@ -899,8 +876,8 @@ def __init__(self, selection, array): # validation if not is_mask_selection(selection, array): raise IndexError( - "invalid mask selection; expected one Boolean (mask)" - "array with the same shape as the target array, got {!r}".format(selection) + f"invalid mask selection; expected one Boolean (mask)" + f"array with the same shape as the target array, got {selection!r}" ) # convert to indices @@ -944,8 +921,7 @@ def check_fields(fields, dtype): # check type if not isinstance(fields, (str, list, tuple)): raise IndexError( - "'fields' argument must be a string or list of strings; found " - "{!r}".format(type(fields)) + f"'fields' argument must be a string or list of strings; found " f"{type(fields)!r}" ) if fields: if dtype.names is None: @@ -958,7 +934,7 @@ def check_fields(fields, dtype): # multiple field selection out_dtype = np.dtype([(f, dtype[f]) for f in fields]) except KeyError as e: - raise IndexError("invalid 'fields' argument, field not found: {!r}".format(e)) + raise 
IndexError(f"invalid 'fields' argument, field not found: {e!r}") from e else: return out_dtype else: diff --git a/zarr/meta.py b/zarr/meta.py index 48791ddf17..5430ab305d 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -89,7 +89,6 @@ class Metadata2: @classmethod def parse_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType[str, Any]: - # Here we allow that a store may return an already-parsed metadata object, # or a string of JSON that we will parse here. We allow for an already-parsed # object to accommodate a consolidated metadata store, where all the metadata for @@ -112,7 +111,7 @@ def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType # check metadata format zarr_format = meta.get("zarr_format", None) if zarr_format != cls.ZARR_FORMAT: - raise MetadataError("unsupported zarr format: %s" % zarr_format) + raise MetadataError(f"unsupported zarr format: {zarr_format}") # extract array metadata fields try: @@ -200,7 +199,7 @@ def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType # check metadata format version zarr_format = meta.get("zarr_format", None) if zarr_format != cls.ZARR_FORMAT: - raise MetadataError("unsupported zarr format: %s" % zarr_format) + raise MetadataError(f"unsupported zarr format: {zarr_format}") meta = dict(zarr_format=zarr_format) return meta @@ -228,15 +227,15 @@ def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> if v == "NaN": return np.nan elif v == "Infinity": - return np.PINF + return np.inf elif v == "-Infinity": - return np.NINF + return -np.inf else: return np.array(v, dtype=dtype)[()] elif dtype.kind in "c": v = ( - cls.decode_fill_value(v[0], dtype.type().real.dtype), # type: ignore - cls.decode_fill_value(v[1], dtype.type().imag.dtype), # type: ignore + cls.decode_fill_value(v[0], dtype.type().real.dtype), + cls.decode_fill_value(v[1], dtype.type().imag.dtype), ) v = v[0] + 1j * v[1] return np.array(v, dtype=dtype)[()] @@ -311,8 +310,8 @@ def decode_dtype(cls, d, validate=True): # extract the type from the extension info try: d = d["type"] - except KeyError: - raise KeyError("Extended dtype info must provide a key named 'type'.") + except KeyError as e: + raise KeyError("Extended dtype info must provide a key named 'type'.") from e d = cls._decode_dtype_descr(d) dtype = np.dtype(d) if validate: @@ -347,7 +346,7 @@ def decode_group_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType # # check metadata format version # zarr_format = meta.get("zarr_format", None) # if zarr_format != cls.ZARR_FORMAT: - # raise MetadataError("unsupported zarr format: %s" % zarr_format) + # raise MetadataError(f"unsupported zarr format: {zarr_format}") assert "attributes" in meta # meta = dict(attributes=meta['attributes']) @@ -384,7 +383,7 @@ def decode_hierarchy_metadata(cls, s: Union[MappingType, bytes, str]) -> Mapping # check metadata format # zarr_format = meta.get("zarr_format", None) # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": - # raise MetadataError("unsupported zarr format: %s" % zarr_format) + # raise MetadataError(f"unsupported zarr format: {zarr_format}") if set(meta.keys()) != { "zarr_format", "metadata_encoding", @@ -519,7 +518,7 @@ def decode_array_metadata(cls, s: Union[MappingType, bytes, str]) -> MappingType meta["storage_transformers"] = storage_transformers except Exception as e: - raise MetadataError("error decoding metadata: %s" % e) + raise MetadataError(f"error decoding metadata: {e}") from e else: return meta diff --git 
a/zarr/meta_v1.py b/zarr/meta_v1.py index 4ac381f2ca..714f55f477 100644 --- a/zarr/meta_v1.py +++ b/zarr/meta_v1.py @@ -10,7 +10,7 @@ def decode_metadata(b): meta = json.loads(s) zarr_format = meta.get("zarr_format", None) if zarr_format != 1: - raise MetadataError("unsupported zarr format: %s" % zarr_format) + raise MetadataError(f"unsupported zarr format: {zarr_format}") try: meta = dict( zarr_format=meta["zarr_format"], @@ -23,7 +23,7 @@ def decode_metadata(b): order=meta["order"], ) except Exception as e: - raise MetadataError("error decoding metadata: %s" % e) + raise MetadataError(f"error decoding metadata: {e}") from e else: return meta diff --git a/zarr/n5.py b/zarr/n5.py index 7e73905527..3d3e9afa26 100644 --- a/zarr/n5.py +++ b/zarr/n5.py @@ -1,5 +1,6 @@ """This module contains a storage class and codec to support the N5 format. """ + import os import struct import sys @@ -72,21 +73,18 @@ class N5Store(NestedDirectoryStore): def __getitem__(self, key: str) -> bytes: if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) return json_dumps(value) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) top_level = key == zarr_array_meta_key value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) return json_dumps(value) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) value = attrs_to_zarr(self._load_n5_attrs(key_new)) @@ -104,9 +102,7 @@ def __getitem__(self, key: str) -> bytes: return super().__getitem__(key_new) def __setitem__(self, key: str, value: Any): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -115,7 +111,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) top_level = key == zarr_array_meta_key n5_attrs = self._load_n5_attrs(key_new) @@ -123,7 +118,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -131,7 +125,11 @@ def __setitem__(self, key: str, value: Any): for k in n5_keywords: if k in zarr_attrs: - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) + warnings.warn( + f"Attribute {k} is a reserved N5 keyword", + UserWarning, + stacklevel=2, + ) # remove previous user attributes for k in list(n5_attrs.keys()): @@ -166,9 +164,7 @@ def __delitem__(self, key: str): super().__delitem__(key_new) def __contains__(self, key): - if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, n5_attrs_key) if key_new not in self: return False @@ -176,18 +172,15 @@ def __contains__(self, key): return "dimensions" not in self._load_n5_attrs(key_new) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, n5_attrs_key) # array if attributes contain 'dimensions' return "dimensions" in self._load_n5_attrs(key_new) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, n5_attrs_key) return self._contains_attrs(key_new) elif is_chunk_key(key): - key_new = invert_chunk_coords(key) else: key_new = key @@ -198,7 +191,6 @@ def __eq__(self, other): return isinstance(other, N5Store) and self.path == other.path def 
listdir(self, path: Optional[str] = None): - if path is not None: path = invert_chunk_coords(path) path = cast(str, path) @@ -208,7 +200,6 @@ def listdir(self, path: Optional[str] = None): children = super().listdir(path=path) if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files children.remove(n5_attrs_key) children.append(zarr_array_meta_key) @@ -234,7 +225,6 @@ def listdir(self, path: Optional[str] = None): return sorted(new_children) elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files children.remove(n5_attrs_key) children.append(zarr_group_meta_key) @@ -244,7 +234,6 @@ def listdir(self, path: Optional[str] = None): return sorted(children) else: - return children def _load_n5_attrs(self, path: str) -> Dict[str, Any]: @@ -255,7 +244,6 @@ def _load_n5_attrs(self, path: str) -> Dict[str, Any]: return {} def _is_group(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -265,7 +253,6 @@ def _is_group(self, path: str): return len(n5_attrs) > 0 and "dimensions" not in n5_attrs def _is_array(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -274,7 +261,6 @@ def _is_array(self, path: str): return "dimensions" in self._load_n5_attrs(attrs_key) def _contains_attrs(self, path: str): - if path is None: attrs_key = n5_attrs_key else: @@ -345,7 +331,10 @@ class N5FSStore(FSStore): def __init__(self, *args, **kwargs): if "dimension_separator" in kwargs: kwargs.pop("dimension_separator") - warnings.warn("Keyword argument `dimension_separator` will be ignored") + warnings.warn( + "Keyword argument `dimension_separator` will be ignored", + stacklevel=2, + ) dimension_separator = "." super().__init__(*args, dimension_separator=dimension_separator, **kwargs) @@ -376,21 +365,18 @@ def _normalize_key(self, key: str): def __getitem__(self, key: str) -> bytes: if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) value = group_metadata_to_zarr(self._load_n5_attrs(key_new)) return json_dumps(value) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) top_level = key == zarr_array_meta_key value = array_metadata_to_zarr(self._load_n5_attrs(key_new), top_level=top_level) return json_dumps(value) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) value = attrs_to_zarr(self._load_n5_attrs(key_new)) @@ -409,7 +395,6 @@ def __getitem__(self, key: str) -> bytes: def __setitem__(self, key: str, value: Any): if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) n5_attrs = self._load_n5_attrs(key_new) @@ -418,7 +403,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) top_level = key == zarr_array_meta_key n5_attrs = self._load_n5_attrs(key_new) @@ -427,7 +411,6 @@ def __setitem__(self, key: str, value: Any): value = json_dumps(n5_attrs) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) n5_attrs = self._load_n5_attrs(key_new) @@ -435,7 +418,11 @@ def __setitem__(self, key: str, value: Any): for k in n5_keywords: if k in zarr_attrs.keys(): - warnings.warn(f"Attribute {k} is a reserved N5 keyword", UserWarning) + warnings.warn( + f"Attribute {k} is a reserved N5 keyword", + UserWarning, + stacklevel=2, + ) # replace previous user 
attributes for k in list(n5_attrs.keys()): @@ -456,7 +443,6 @@ def __setitem__(self, key: str, value: Any): super().__setitem__(key_new, value) def __delitem__(self, key: str): - if key.endswith(zarr_group_meta_key): key_new = key.replace(zarr_group_meta_key, self._group_meta_key) elif key.endswith(zarr_array_meta_key): @@ -471,7 +457,6 @@ def __delitem__(self, key: str): def __contains__(self, key: Any): if key.endswith(zarr_group_meta_key): - key_new = key.replace(zarr_group_meta_key, self._group_meta_key) if key_new not in self: return False @@ -479,13 +464,11 @@ def __contains__(self, key: Any): return "dimensions" not in self._load_n5_attrs(key_new) elif key.endswith(zarr_array_meta_key): - key_new = key.replace(zarr_array_meta_key, self._array_meta_key) # array if attributes contain 'dimensions' return "dimensions" in self._load_n5_attrs(key_new) elif key.endswith(zarr_attrs_key): - key_new = key.replace(zarr_attrs_key, self._attrs_key) return self._contains_attrs(key_new) @@ -508,7 +491,6 @@ def listdir(self, path: Optional[str] = None): # doesn't provide. children = super().listdir(path=path) if self._is_array(path): - # replace n5 attribute file with respective zarr attribute files children.remove(self._array_meta_key) children.append(zarr_array_meta_key) @@ -532,7 +514,6 @@ def listdir(self, path: Optional[str] = None): return sorted(new_children) elif self._is_group(path): - # replace n5 attribute file with respective zarr attribute files children.remove(self._group_meta_key) children.append(zarr_group_meta_key) @@ -550,7 +531,6 @@ def _load_n5_attrs(self, path: str): return {} def _is_group(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -560,7 +540,6 @@ def _is_group(self, path: Optional[str]): return len(n5_attrs) > 0 and "dimensions" not in n5_attrs def _is_array(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -569,7 +548,6 @@ def _is_array(self, path: Optional[str]): return "dimensions" in self._load_n5_attrs(attrs_key) def _contains_attrs(self, path: Optional[str]): - if path is None: attrs_key = self._attrs_key else: @@ -630,8 +608,8 @@ def array_metadata_to_n5(array_metadata: Dict[str, Any], top_level=False) -> Dic array_metadata["n5"] = N5_FORMAT try: dtype = np.dtype(array_metadata["dataType"]) - except TypeError: - raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") + except TypeError as e: + raise TypeError(f"Data type {array_metadata['dataType']} is not supported by N5") from e array_metadata["dataType"] = dtype.name array_metadata["dimensions"] = array_metadata["dimensions"][::-1] @@ -712,7 +690,6 @@ def attrs_to_zarr(attrs: Dict[str, Any]) -> Dict[str, Any]: def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: - if compressor_config is None: return {"type": "raw"} else: @@ -726,19 +703,16 @@ def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict n5_config = {"type": codec_id} if codec_id == "bz2": - n5_config["type"] = "bzip2" n5_config["blockSize"] = _compressor_config["level"] elif codec_id == "blosc": - n5_config["cname"] = _compressor_config["cname"] n5_config["clevel"] = _compressor_config["clevel"] n5_config["shuffle"] = _compressor_config["shuffle"] n5_config["blocksize"] = _compressor_config["blocksize"] elif codec_id == "lzma": - # Switch to XZ for N5 if we are using the default XZ format. # Note: 4 is the default, which is lzma.CHECK_CRC64. 
if _compressor_config["format"] == 1 and _compressor_config["check"] in [-1, 4]: @@ -748,6 +722,7 @@ def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict "Not all N5 implementations support lzma compression (yet). You " "might not be able to open the dataset with another N5 library.", RuntimeWarning, + stacklevel=2, ) n5_config["format"] = _compressor_config["format"] n5_config["check"] = _compressor_config["check"] @@ -760,50 +735,42 @@ def compressor_config_to_n5(compressor_config: Optional[Dict[str, Any]]) -> Dict n5_config["preset"] = 6 elif codec_id == "zlib": - n5_config["type"] = "gzip" n5_config["level"] = _compressor_config["level"] n5_config["useZlib"] = True elif codec_id == "gzip": - n5_config["type"] = "gzip" n5_config["level"] = _compressor_config["level"] n5_config["useZlib"] = False else: - n5_config.update({k: v for k, v in _compressor_config.items() if k != "type"}) return n5_config def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dict[str, Any]]: - codec_id = compressor_config["type"] zarr_config = {"id": codec_id} if codec_id == "bzip2": - zarr_config["id"] = "bz2" zarr_config["level"] = compressor_config["blockSize"] elif codec_id == "blosc": - zarr_config["cname"] = compressor_config["cname"] zarr_config["clevel"] = compressor_config["clevel"] zarr_config["shuffle"] = compressor_config["shuffle"] zarr_config["blocksize"] = compressor_config["blocksize"] elif codec_id == "lzma": - zarr_config["format"] = compressor_config["format"] zarr_config["check"] = compressor_config["check"] zarr_config["preset"] = compressor_config["preset"] zarr_config["filters"] = compressor_config["filters"] elif codec_id == "xz": - zarr_config["id"] = "lzma" zarr_config["format"] = 1 # lzma.FORMAT_XZ zarr_config["check"] = -1 @@ -811,7 +778,6 @@ def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dic zarr_config["filters"] = None elif codec_id == "gzip": - if "useZlib" in compressor_config and compressor_config["useZlib"]: zarr_config["id"] = "zlib" zarr_config["level"] = compressor_config["level"] @@ -820,22 +786,18 @@ def compressor_config_to_zarr(compressor_config: Dict[str, Any]) -> Optional[Dic zarr_config["level"] = compressor_config["level"] elif codec_id == "raw": - return None else: - zarr_config.update({k: v for k, v in compressor_config.items() if k != "type"}) return zarr_config class N5ChunkWrapper(Codec): - codec_id = "n5_wrapper" def __init__(self, dtype, chunk_shape, compressor_config=None, compressor=None): - self.dtype = np.dtype(dtype) self.chunk_shape = tuple(chunk_shape) # is the dtype a little endian format? 
@@ -860,7 +822,6 @@ def get_config(self): return config def encode(self, chunk): - assert chunk.flags.c_contiguous header = self._create_header(chunk) @@ -872,16 +833,14 @@ def encode(self, chunk): return header + chunk.tobytes(order="A") def decode(self, chunk, out=None) -> bytes: - len_header, chunk_shape = self._read_header(chunk) chunk = chunk[len_header:] if out is not None: - # out should only be used if we read a complete chunk - assert chunk_shape == self.chunk_shape, "Expected chunk of shape {}, found {}".format( - self.chunk_shape, chunk_shape - ) + assert ( + chunk_shape == self.chunk_shape + ), f"Expected chunk of shape {self.chunk_shape}, found {chunk_shape}" if self._compressor: self._compressor.decode(chunk, out) @@ -895,7 +854,6 @@ def decode(self, chunk, out=None) -> bytes: return out else: - if self._compressor: chunk = self._compressor.decode(chunk) @@ -915,7 +873,6 @@ def decode(self, chunk, out=None) -> bytes: @staticmethod def _create_header(chunk): - mode = struct.pack(">H", 0) num_dims = struct.pack(">H", len(chunk.shape)) shape = b"".join(struct.pack(">I", d) for d in chunk.shape[::-1]) @@ -924,7 +881,6 @@ def _create_header(chunk): @staticmethod def _read_header(chunk): - num_dims = struct.unpack(">H", chunk[2:4])[0] shape = tuple( struct.unpack(">I", chunk[i : i + 4])[0] for i in range(4, num_dims * 4 + 4, 4) diff --git a/zarr/storage.py b/zarr/storage.py index b36f804ebd..f412870f75 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -14,6 +14,7 @@ path) and a `getsize` method (return the size in bytes of a given value). """ + import atexit import errno import glob @@ -28,6 +29,7 @@ import zipfile from collections import OrderedDict from collections.abc import MutableMapping +from functools import lru_cache from os import scandir from pickle import PicklingError from threading import Lock, RLock @@ -39,6 +41,8 @@ from numcodecs.compat import ensure_bytes, ensure_text, ensure_contiguous_ndarray_like from numcodecs.registry import codec_registry from zarr.context import Context +from zarr.types import PathLike as Path, DIMENSION_SEPARATOR +from zarr.util import NoLock from zarr.errors import ( MetadataError, @@ -84,6 +88,7 @@ DEFAULT_ZARR_VERSION, BaseStore, Store, + V3_DEPRECATION_MESSAGE, ) __doctest_requires__ = { @@ -104,7 +109,6 @@ default_compressor = Zlib() -Path = Union[str, bytes, None] # allow MutableMapping for backwards compatibility StoreLike = Union[BaseStore, MutableMapping] @@ -205,7 +209,7 @@ def rmdir(store: StoreLike, path: Path = None): store_version = getattr(store, "_store_version", 2) if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore # pass through - store.rmdir(path) # type: ignore + store.rmdir(path) else: # slow version, delete one key at a time if store_version == 2: @@ -235,7 +239,7 @@ def listdir(store: BaseStore, path: Path = None): path = normalize_storage_path(path) if hasattr(store, "listdir"): # pass through - return store.listdir(path) # type: ignore + return store.listdir(path) else: # slow version, iterate through all keys warnings.warn( @@ -288,7 +292,7 @@ def getsize(store: BaseStore, path: Path = None) -> int: if hasattr(store, "getsize"): # pass through path = normalize_storage_path(path) - return store.getsize(path) # type: ignore + return store.getsize(path) elif isinstance(store, MutableMapping): return _getsize(store, path) else: @@ -325,7 +329,7 @@ def init_array( chunk_store: Optional[StoreLike] = None, filters=None, object_codec=None, - dimension_separator=None, + dimension_separator: 
Optional[DIMENSION_SEPARATOR] = None, storage_transformers=(), ): """Initialize an array store with the given configuration. Note that this is a low-level @@ -479,10 +483,9 @@ def _init_array_metadata( chunk_store: Optional[StoreLike] = None, filters=None, object_codec=None, - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, storage_transformers=(), ): - store_version = getattr(store, "_store_version", 2) path = normalize_storage_path(path) @@ -586,11 +589,15 @@ def _init_array_metadata( "missing object_codec for object array; this will raise a " "ValueError in version 3.0", FutureWarning, + stacklevel=2, ) else: filters_config.insert(0, object_codec.get_config()) elif object_codec is not None: - warnings.warn("an object_codec is only needed for object arrays") + warnings.warn( + "an object_codec is only needed for object arrays", + stacklevel=2, + ) # use null to indicate no filters if not filters_config: @@ -627,7 +634,7 @@ def _init_array_metadata( key = _prefix_to_array_key(store, _path_to_prefix(path)) if hasattr(store, "_metadata_class"): - store[key] = store._metadata_class.encode_array_metadata(meta) # type: ignore + store[key] = store._metadata_class.encode_array_metadata(meta) else: store[key] = encode_array_metadata(meta) @@ -687,7 +694,6 @@ def _init_group_metadata( path: Optional[str] = None, chunk_store: Optional[StoreLike] = None, ): - store_version = getattr(store, "_store_version", 2) path = normalize_storage_path(path) @@ -731,10 +737,10 @@ def _init_group_metadata( if store_version == 3: meta = {"attributes": {}} # type: ignore else: - meta = {} # type: ignore + meta = {} key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, "_metadata_class"): - store[key] = store._metadata_class.encode_group_metadata(meta) # type: ignore + store[key] = store._metadata_class.encode_group_metadata(meta) else: store[key] = encode_group_metadata(meta) @@ -785,7 +791,7 @@ def __len__(self): return len(self._mutable_mapping) def __repr__(self): - return f"<{self.__class__.__name__}: \n{repr(self._mutable_mapping)}\n at {hex(id(self))}>" + return f"<{self.__class__.__name__}: \n{self._mutable_mapping!r}\n at {id(self):#x}>" def __eq__(self, other): if isinstance(other, KVStore): @@ -867,8 +873,8 @@ def __getitem__(self, item: str): parent, key = self._get_parent(item) try: value = parent[key] - except KeyError: - raise KeyError(item) + except KeyError as e: + raise KeyError(item) from e else: if isinstance(value, self.cls): raise KeyError(item) @@ -886,8 +892,8 @@ def __delitem__(self, item: str): parent, key = self._get_parent(item) try: del parent[key] - except KeyError: - raise KeyError(item) + except KeyError as e: + raise KeyError(item) from e def __contains__(self, item: str): # type: ignore[override] try: @@ -1054,8 +1060,9 @@ class DirectoryStore(Store): """ - def __init__(self, path, normalize_keys=False, dimension_separator=None): - + def __init__( + self, path, normalize_keys=False, dimension_separator: Optional[DIMENSION_SEPARATOR] = None + ): # guard conditions path = os.path.abspath(path) if os.path.exists(path) and not os.path.isdir(path): @@ -1134,7 +1141,7 @@ def __setitem__(self, key, value): os.makedirs(dir_path) except OSError as e: if e.errno != errno.EEXIST: - raise KeyError(key) + raise KeyError(key) from e # write to temporary file # note we're not using tempfile.NamedTemporaryFile to avoid restrictive file permissions @@ -1350,7 +1357,7 @@ def __init__( key_separator=None, mode="w", exceptions=(KeyError, 
PermissionError, IOError), - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, fs=None, check=False, create=False, @@ -1415,12 +1422,23 @@ def _normalize_key(self, key): def getitems( self, keys: Sequence[str], *, contexts: Mapping[str, Context] ) -> Mapping[str, Any]: - - keys_transformed = [self._normalize_key(key) for key in keys] - results = self.map.getitems(keys_transformed, on_error="omit") - # The function calling this method may not recognize the transformed keys - # So we send the values returned by self.map.getitems back into the original key space. - return {keys[keys_transformed.index(rk)]: rv for rk, rv in results.items()} + keys_transformed = {self._normalize_key(key): key for key in keys} + results_transformed = self.map.getitems(list(keys_transformed), on_error="return") + results = {} + for k, v in results_transformed.items(): + if isinstance(v, self.exceptions): + # Cause recognized exceptions to prompt a KeyError in the + # function calling this method + continue + elif isinstance(v, Exception): + # Raise any other exception + raise v + else: + # The function calling this method may not recognize the transformed + # keys, so we send the values returned by self.map.getitems back into + # the original key space. + results[keys_transformed[k]] = v + return results def __getitem__(self, key): key = self._normalize_key(key) @@ -1540,6 +1558,7 @@ def clear(self): self.map.clear() @classmethod + @lru_cache(maxsize=None) def _fsspec_installed(cls): """Returns true if fsspec is installed""" import importlib.util @@ -1569,7 +1588,12 @@ class TempStore(DirectoryStore): # noinspection PyShadowingBuiltins def __init__( - self, suffix="", prefix="zarr", dir=None, normalize_keys=False, dimension_separator=None + self, + suffix="", + prefix="zarr", + dir=None, + normalize_keys=False, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, ): path = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir) atexit.register(atexit_rmtree, path) @@ -1585,6 +1609,12 @@ class NestedDirectoryStore(DirectoryStore): special handling for chunk keys so that chunk files for multidimensional arrays are stored in a nested directory tree. + .. deprecated:: 2.18.0 + NestedDirectoryStore will be removed in Zarr-Python 3.0 where controlling + the chunk key encoding will be supported as part of the array metadata. See + `GH1274 `_ + for more information. + Parameters ---------- path : string @@ -1653,7 +1683,16 @@ class NestedDirectoryStore(DirectoryStore): """ - def __init__(self, path, normalize_keys=False, dimension_separator="/"): + def __init__( + self, path, normalize_keys=False, dimension_separator: Optional[DIMENSION_SEPARATOR] = "/" + ): + + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + super().__init__(path, normalize_keys=normalize_keys) if dimension_separator is None: dimension_separator = "/" @@ -1766,9 +1805,8 @@ def __init__( compression=zipfile.ZIP_STORED, allowZip64=True, mode="a", - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, ): - # store properties path = os.path.abspath(path) self.path = path @@ -1975,6 +2013,11 @@ def migrate_1to2(store): class DBMStore(Store): """Storage class using a DBM-style database. + .. deprecated:: 2.18.0 + DBMStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. 
+ Parameters ---------- path : string @@ -2060,9 +2103,15 @@ def __init__( mode=0o666, open=None, write_lock=True, - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **open_kwargs, ): + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + if open is None: import dbm @@ -2075,6 +2124,7 @@ def __init__( self.mode = mode self.open = open self.write_lock = write_lock + self.write_mutex: Union[Lock, NoLock] if write_lock: # This may not be required as some dbm implementations manage their own # locks, but err on the side of caution. @@ -2179,6 +2229,10 @@ class LMDBStore(Store): """Storage class using LMDB. Requires the `lmdb `_ package to be installed. + .. deprecated:: 2.18.0 + LMDBStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. Parameters ---------- @@ -2231,9 +2285,21 @@ class LMDBStore(Store): """ - def __init__(self, path, buffers=True, dimension_separator=None, **kwargs): + def __init__( + self, + path, + buffers=True, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, + **kwargs, + ): import lmdb + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + # set default memory map size to something larger than the lmdb default, which is # very likely to be too small for any moderate array (logic copied from zict) map_size = 2**40 if sys.maxsize >= 2**32 else 2**28 @@ -2553,6 +2619,11 @@ def __delitem__(self, key): class SQLiteStore(Store): """Storage class using SQLite. + .. deprecated:: 2.18.0 + SQLiteStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + Parameters ---------- path : string @@ -2582,9 +2653,15 @@ class SQLiteStore(Store): >>> store.close() # don't forget to call this when you're done """ - def __init__(self, path, dimension_separator=None, **kwargs): + def __init__(self, path, dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **kwargs): import sqlite3 + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + self._dimension_separator = dimension_separator # normalize path @@ -2702,14 +2779,12 @@ def listdir(self, path=None): path = normalize_storage_path(path) sep = "_" if path == "" else "/" keys = self.cursor.execute( - """ + f""" SELECT DISTINCT SUBSTR(m, 0, INSTR(m, "/")) AS l FROM ( SELECT LTRIM(SUBSTR(k, LENGTH(?) + 1), "/") || "/" AS m FROM zarr WHERE k LIKE (? || "{sep}%") ) ORDER BY l ASC - """.format( - sep=sep - ), + """, (path, path), ) keys = list(map(operator.itemgetter(0), keys)) @@ -2753,6 +2828,11 @@ class MongoDBStore(Store): .. note:: This is an experimental feature. + .. deprecated:: 2.18.0 + MongoDBStore will be removed in Zarr-Python 3.0. See + `GH1274 `_ + for more information. + Requires the `pymongo `_ package to be installed. @@ -2780,11 +2860,17 @@ def __init__( self, database="mongodb_zarr", collection="zarr_collection", - dimension_separator=None, + dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **kwargs, ): import pymongo + warnings.warn( + V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__), + FutureWarning, + stacklevel=2, + ) + self._database = database self._collection = collection self._dimension_separator = dimension_separator @@ -2841,6 +2927,11 @@ class RedisStore(Store): .. note:: This is an experimental feature. + .. deprecated:: 2.18.0 + RedisStore will be removed in Zarr-Python 3.0. 
See
+       `GH1274 `_
+       for more information.
+
     Requires the `redis `_ package to be installed.
@@ -2855,9 +2946,17 @@ class RedisStore(Store):
     """

-    def __init__(self, prefix="zarr", dimension_separator=None, **kwargs):
+    def __init__(
+        self, prefix="zarr", dimension_separator: Optional[DIMENSION_SEPARATOR] = None, **kwargs
+    ):
         import redis

+        warnings.warn(
+            V3_DEPRECATION_MESSAGE.format(store=self.__class__.__name__),
+            FutureWarning,
+            stacklevel=2,
+        )
+
         self._prefix = prefix
         self._kwargs = kwargs
         self._dimension_separator = dimension_separator
@@ -2865,7 +2964,7 @@ def __init__(self, prefix="zarr", dimension_separator=None, **kwargs):
         self.client = redis.Redis(**kwargs)

     def _key(self, key):
-        return "{prefix}:{key}".format(prefix=self._prefix, key=key)
+        return f"{self._prefix}:{key}"

     def __getitem__(self, key):
         return self.client[self._key(key)]
@@ -2950,7 +3049,7 @@ def __init__(self, store: StoreLike, metadata_key=".zmetadata"):
         consolidated_format = meta.get("zarr_consolidated_format", None)
         if consolidated_format != 1:
             raise MetadataError(
-                "unsupported zarr consolidated metadata format: %s" % consolidated_format
+                f"unsupported zarr consolidated metadata format: {consolidated_format}"
             )

         # decode metadata
diff --git a/zarr/sync.py b/zarr/sync.py
index 49684a51ee..ba1c5df5b3 100644
--- a/zarr/sync.py
+++ b/zarr/sync.py
@@ -1,11 +1,18 @@
 import os
 from collections import defaultdict
 from threading import Lock
+from typing import Protocol

-import fasteners

+class Synchronizer(Protocol):
+    """Base class for synchronizers."""

-class ThreadSynchronizer:
+    def __getitem__(self, item):
+        # see subclasses
+        ...
+
+
+class ThreadSynchronizer(Synchronizer):
     """Provides synchronization using thread locks."""

     def __init__(self):
@@ -24,7 +31,7 @@ def __setstate__(self, *args):
         self.__init__()

-class ProcessSynchronizer:
+class ProcessSynchronizer(Synchronizer):
     """Provides synchronization using file locks via the
     `fasteners `_ package.
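The new `Synchronizer` protocol pins down the one behavior the rest of zarr relies on: indexing a synchronizer with a key yields a lock-like context manager. Because it is a `typing.Protocol`, a custom implementation need not inherit from it to satisfy static type checkers. A hypothetical instrumented synchronizer, for illustration only:

    from collections import defaultdict
    from threading import Lock

    class CountingSynchronizer:
        """Satisfies the protocol structurally and counts acquisitions per key."""

        def __init__(self):
            self.mutex = Lock()  # guards the defaultdicts below
            self.locks = defaultdict(Lock)
            self.counts = defaultdict(int)

        def __getitem__(self, item):
            with self.mutex:
                self.counts[item] += 1
                return self.locks[item]

    sync = CountingSynchronizer()
    with sync["array/0.0"]:
        pass  # per-key critical section
    assert sync.counts["array/0.0"] == 1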
@@ -41,6 +48,8 @@ def __init__(self, path): self.path = path def __getitem__(self, item): + import fasteners + path = os.path.join(self.path, item) lock = fasteners.InterProcessLock(path) return lock diff --git a/zarr/tests/test_attrs.py b/zarr/tests/test_attrs.py index 7dd5b340a2..2d9553971b 100644 --- a/zarr/tests/test_attrs.py +++ b/zarr/tests/test_attrs.py @@ -30,7 +30,6 @@ def init_attributes(self, store, read_only=False, cache=True, zarr_version=2): return Attributes(store, key=root + "attrs", read_only=read_only, cache=cache) def test_storage(self, zarr_version): - store = _init_store(zarr_version) root = ".z" if zarr_version == 2 else meta_root attrs_key = root + "attrs" @@ -50,7 +49,6 @@ def test_storage(self, zarr_version): assert dict(foo="bar", baz=42) == d def test_utf8_encoding(self, zarr_version): - project_root = pathlib.Path(zarr.__file__).resolve().parent.parent fixdir = project_root / "fixture" testdir = fixdir / "utf8attrs" @@ -67,7 +65,6 @@ def test_utf8_encoding(self, zarr_version): assert fixture["utf8attrs"].attrs.asdict() == dict(foo="た") def test_get_set_del_contains(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert "foo" not in a @@ -84,7 +81,6 @@ def test_get_set_del_contains(self, zarr_version): a["foo"] def test_update_put(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert "foo" not in a @@ -102,7 +98,6 @@ def test_update_put(self, zarr_version): assert "baz" not in a def test_iterators(self, zarr_version): - store = _init_store(zarr_version) a = self.init_attributes(store, zarr_version=zarr_version) assert 0 == len(a) @@ -232,7 +227,6 @@ def test_caching_on(self, zarr_version): assert get_cnt == store.counter["__getitem__", attrs_key] def test_caching_off(self, zarr_version): - # setup store store = CountingDict() if zarr_version == 2 else CountingDictV3() attrs_key = ".zattrs" if zarr_version == 2 else "meta/root/attrs" diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 389ce90a9d..7d190adc2c 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -57,7 +57,6 @@ def _init_creation_kwargs(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) def test_open_array(path_type, zarr_version): - store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) @@ -86,7 +85,6 @@ def test_open_array(path_type, zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) def test_open_group(path_type, zarr_version): - store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) @@ -210,7 +208,6 @@ def test_tree(zarr_version): def test_consolidate_metadata( with_chunk_store, zarr_version, listable, monkeypatch, stores_from_path ): - # setup initial data if stores_from_path: store = tempfile.mkdtemp() @@ -399,7 +396,6 @@ def test_save_array_separator(tmpdir, options): class TestCopyStore(unittest.TestCase): - _version = 2 def setUp(self): @@ -536,7 +532,6 @@ def test_if_exists(self): @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") class TestCopyStoreV3(TestCopyStore): - _version = 3 def setUp(self): @@ -557,7 +552,6 @@ def test_mismatched_store_versions(self): def check_copied_array(original, copied, without_attrs=False, expect_props=None): - # setup source_h5py = original.__module__.startswith("h5py.") dest_h5py = copied.__module__.startswith("h5py.") @@ -621,7 +615,6 @@ def 
check_copied_array(original, copied, without_attrs=False, expect_props=None) def check_copied_group(original, copied, without_attrs=False, expect_props=None, shallow=False): - # setup if expect_props is None: expect_props = dict() diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index f3ca73dea8..01a78ecd68 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3,7 +3,7 @@ import sys import pickle import shutil -from typing import Any, Literal, Optional, Tuple, Union +from typing import Any, Literal, Optional, Tuple, Union, Sequence import unittest from itertools import zip_longest from tempfile import mkdtemp @@ -26,6 +26,7 @@ VLenUTF8, Zlib, ) +from numcodecs.abc import Codec from numcodecs.compat import ensure_bytes, ensure_ndarray from numcodecs.tests.common import greetings from numpy.testing import assert_array_almost_equal, assert_array_equal @@ -72,7 +73,16 @@ ) from zarr.tests.test_storage_v3 import DummyStorageTransfomer from zarr.util import buffer_size -from zarr.tests.util import abs_container, skip_test_env_var, have_fsspec, mktemp +from zarr.tests.util import ( + abs_container, + have_bsddb3, + have_fsspec, + have_lmdb, + have_sqlite3, + mktemp, + skip_test_env_var, +) +from zarr.types import DIMENSION_SEPARATOR # noinspection PyMethodMayBeStatic @@ -82,8 +92,8 @@ class TestArray: root = "" path = "" compressor = Zlib(level=1) - filters = None - dimension_separator: Literal["/", ".", None] = None + filters: Optional[Sequence[Codec]] = None + dimension_separator: Optional[DIMENSION_SEPARATOR] = None cache_metadata = True cache_attrs = True partial_decompress: bool = False @@ -113,7 +123,7 @@ def create_array(self, shape: Union[int, Tuple[int, ...]], **kwargs): "compressor": kwargs.pop("compressor", self.compressor), "chunk_store": chunk_store, "storage_transformers": self.create_storage_transformers(shape), - "filters": kwargs.pop("filters", self.create_filters(kwargs.get("dtype", None))), + "filters": kwargs.pop("filters", self.create_filters(kwargs.get("dtype"))), } # keyword arguments for array instantiation @@ -178,7 +188,7 @@ def test_store_has_text_keys(self): for k in z.chunk_store.keys(): if not isinstance(k, expected_type): # pragma: no cover - pytest.fail("Non-text key: %s" % repr(k)) + pytest.fail(f"Non-text key: {k!r}") z.store.close() @@ -192,7 +202,7 @@ def test_store_has_binary_values(self): try: ensure_ndarray(v) except TypeError: # pragma: no cover - pytest.fail("Non-bytes-like value: %s" % repr(v)) + pytest.fail(f"Non-bytes-like value: {v!r}") z.store.close() @@ -1202,7 +1212,7 @@ def test_dtypes(self): # datetime, timedelta for base_type in "Mm": for resolution in "D", "us", "ns": - dtype = "{}8[{}]".format(base_type, resolution) + dtype = f"{base_type}8[{resolution}]" z = self.create_array(shape=100, dtype=dtype, fill_value=0) assert z.dtype == np.dtype(dtype) a = np.random.randint( @@ -1392,7 +1402,7 @@ def compare_arrays(expected, actual, item_dtype): # convenience API for item_type in "int", " DBMStoreV3: - bsddb3 = pytest.importorskip("bsddb3") + import bsddb3 + path = mktemp(suffix=".dbm") atexit.register(os.remove, path) store = DBMStoreV3(path, flag="n", open=bsddb3.btopen) @@ -2769,11 +2783,11 @@ def test_nbytes_stored(self): @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(have_lmdb is False, reason="needs lmdb") class TestArrayWithLMDBStoreV3(TestArrayV3): lmdb_buffers = True def create_store(self) -> LMDBStoreV3: - pytest.importorskip("lmdb") path = mktemp(suffix=".lmdb") 
atexit.register(atexit_rmtree, path) store = LMDBStoreV3(path, buffers=self.lmdb_buffers) @@ -2795,9 +2809,9 @@ def test_nbytes_stored(self): @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.skipif(have_sqlite3 is False, reason="needs sqlite3") class TestArrayWithSQLiteStoreV3(TestArrayV3): def create_store(self): - pytest.importorskip("sqlite3") path = mktemp(suffix=".db") atexit.register(atexit_rmtree, path) store = SQLiteStoreV3(path) @@ -3143,3 +3157,52 @@ def test_issue_1279(tmpdir): written_data = ds_reopened[:] assert_array_equal(data, written_data) + + +def test_scalar_indexing(): + store = zarr.KVStore({}) + + store["a"] = zarr.create((3,), chunks=(1,), store=store) + store["a"][:] = [1, 2, 3] + + assert store["a"][1] == np.array(2.0) + assert store["a"][(1,)] == np.array(2.0) + + store["a"][slice(1)] = [-1] + assert store["a"][0] == np.array(-1) + + store["a"][0] = -2 + assert store["a"][0] == np.array(-2) + + store["a"][slice(1)] = (-3,) + assert store["a"][0] == np.array(-3) + + +def test_object_array_indexing(): + # regression test for #1874 + from numcodecs import MsgPack + + root = zarr.group() + arr = root.create_dataset( + name="my_dataset", + shape=0, + dtype=object, + object_codec=MsgPack(), + ) + new_items = [ + ["A", 1], + ["B", 2, "hello"], + ] + arr_add = np.empty(len(new_items), dtype=object) + arr_add[:] = new_items + arr.append(arr_add) + + # heterogeneous elements + elem = ["C", 3] + arr[0] = elem + assert arr[0] == elem + + # homogeneous elements + elem = [1, 3] + arr[1] = elem + assert arr[1] == elem diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index b44c6379fd..8e586abfff 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -74,7 +74,6 @@ def _init_creation_kwargs(zarr_version, at_root=True): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_array(zarr_version, at_root): - expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version kwargs = _init_creation_kwargs(zarr_version, at_root) @@ -213,7 +212,6 @@ def test_full_additional_dtypes(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_open_array(zarr_version, at_root, dimension_separator): - store = "data/array.zarr" kwargs = _init_creation_kwargs(zarr_version, at_root) @@ -329,7 +327,6 @@ def test_open_array(zarr_version, at_root, dimension_separator): def test_open_array_none(): - # open with both store and zarr_version = None z = open_array(mode="w", shape=100, chunks=10) assert isinstance(z, Array) @@ -339,7 +336,6 @@ def test_open_array_none(): @pytest.mark.parametrize("dimension_separator", [".", "/", None]) @pytest.mark.parametrize("zarr_version", _VERSIONS2) def test_open_array_infer_separator_from_store(zarr_version, dimension_separator): - if zarr_version == 3: StoreClass = DirectoryStoreV3 path = "data" @@ -370,7 +366,6 @@ def test_open_array_infer_separator_from_store(zarr_version, dimension_separator # TODO: N5 support for v3 @pytest.mark.parametrize("zarr_version", [None, 2]) def test_open_array_n5(zarr_version): - store = "data/array.zarr" kwargs = _init_creation_kwargs(zarr_version) @@ -409,7 +404,6 @@ def test_open_array_n5(zarr_version): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_open_array_dict_store(zarr_version, at_root): - # dict will become a KVStore store = dict() kwargs = 
_init_creation_kwargs(zarr_version, at_root) @@ -503,7 +497,6 @@ def test_empty_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_zeros_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version @@ -529,7 +522,6 @@ def test_zeros_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_ones_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version @@ -556,7 +548,6 @@ def test_ones_like(zarr_version, at_root): @pytest.mark.parametrize("zarr_version", _VERSIONS) @pytest.mark.parametrize("at_root", [False, True]) def test_full_like(zarr_version, at_root): - kwargs = _init_creation_kwargs(zarr_version, at_root) expected_zarr_version = DEFAULT_ZARR_VERSION if zarr_version is None else zarr_version diff --git a/zarr/tests/test_dim_separator.py b/zarr/tests/test_dim_separator.py index 987852dfd0..0a5814e65f 100644 --- a/zarr/tests/test_dim_separator.py +++ b/zarr/tests/test_dim_separator.py @@ -46,7 +46,6 @@ def dataset(tmpdir, request): static = project_root / "fixture" / suffix if not static.exists(): # pragma: no cover - if "nested" in which: # No way to reproduce the nested_legacy file via code generator = NestedDirectoryStore diff --git a/zarr/tests/test_filters.py b/zarr/tests/test_filters.py index d55be9145f..fc63cdca8d 100644 --- a/zarr/tests/test_filters.py +++ b/zarr/tests/test_filters.py @@ -30,7 +30,6 @@ def test_array_with_delta_filter(): - # setup astype = "u1" dtype = "i8" @@ -38,7 +37,6 @@ def test_array_with_delta_filter(): data = np.arange(100, dtype=dtype) for compressor in compressors: - a = array(data, chunks=10, compressor=compressor, filters=filters) # check round-trip @@ -57,7 +55,6 @@ def test_array_with_delta_filter(): def test_array_with_astype_filter(): - # setup encode_dtype = "i1" decode_dtype = "i8" @@ -68,7 +65,6 @@ def test_array_with_astype_filter(): data = np.arange(shape, dtype=decode_dtype) for compressor in compressors: - a = array(data, chunks=chunks, compressor=compressor, filters=filters) # check round-trip @@ -88,7 +84,6 @@ def test_array_with_astype_filter(): def test_array_with_scaleoffset_filter(): - # setup astype = "u1" dtype = "f8" @@ -97,7 +92,6 @@ def test_array_with_scaleoffset_filter(): data = np.linspace(1000, 1001, 34, dtype="f8") for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -116,7 +110,6 @@ def test_array_with_scaleoffset_filter(): def test_array_with_quantize_filter(): - # setup dtype = "f8" digits = 3 @@ -125,7 +118,6 @@ def test_array_with_quantize_filter(): data = np.linspace(0, 1, 34, dtype=dtype) for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -144,14 +136,12 @@ def test_array_with_quantize_filter(): def test_array_with_packbits_filter(): - # setup flt = PackBits() filters = [flt] data = np.random.randint(0, 2, size=100, dtype=bool) for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip @@ -170,14 +160,12 @@ def test_array_with_packbits_filter(): def test_array_with_categorize_filter(): - # setup data = np.random.choice(["foo", "bar", 
"baz"], size=100) flt = Categorize(dtype=data.dtype, labels=["foo", "bar", "baz"]) filters = [flt] for compressor in compressors: - a = array(data, chunks=5, compressor=compressor, filters=filters) # check round-trip diff --git a/zarr/tests/test_hierarchy.py b/zarr/tests/test_hierarchy.py index cbf59c55c3..161e1eb813 100644 --- a/zarr/tests/test_hierarchy.py +++ b/zarr/tests/test_hierarchy.py @@ -1,4 +1,5 @@ import atexit +import operator import os import sys import pickle @@ -87,6 +88,26 @@ def create_group( ) return g + def test_ipython_repr_methods(self): + g = self.create_group() + for method in [ + "html", + "json", + "javascript", + "markdown", + "svg", + "png", + "jpeg", + "latex", + "pdf", + "mimebundle", + ]: + assert operator.methodcaller(f"_repr_{method}_")(g) is None + with pytest.raises(AttributeError): + g._ipython_display_() + with pytest.raises(AttributeError): + g._ipython_canary_method_should_not_exist_() + def test_group_init_1(self): store, chunk_store = self.create_store() g = self.create_group(store, chunk_store=chunk_store) @@ -1085,7 +1106,6 @@ def test_paths(self): g1.store.close() def test_pickle(self): - # setup group g = self.create_group() d = g.create_dataset("foo/bar", shape=100, chunks=10) @@ -1113,7 +1133,6 @@ def test_pickle(self): g2.store.close() def test_context_manager(self): - with self.create_group() as g: d = g.create_dataset("foo/bar", shape=100, chunks=10) d[:] = np.arange(100) @@ -1375,7 +1394,6 @@ def create_store(): return store, None def test_context_manager(self): - with self.create_group() as g: store = g.store d = g.create_dataset("foo/bar", shape=100, chunks=10) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 8a34c1e715..a3afc101c5 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -17,7 +17,6 @@ def test_normalize_integer_selection(): - assert 1 == normalize_integer_selection(1, 100) assert 99 == normalize_integer_selection(-1, 100) with pytest.raises(IndexError): @@ -29,7 +28,6 @@ def test_normalize_integer_selection(): def test_replace_ellipsis(): - # 1D, single item assert (0,) == replace_ellipsis(0, (100,)) @@ -68,7 +66,6 @@ def test_replace_ellipsis(): def test_get_basic_selection_0d(): - # setup a = np.array(42) z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None) @@ -191,7 +188,6 @@ def _test_get_basic_selection(a, z, selection): # noinspection PyStatementEffect def test_get_basic_selection_1d(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -264,7 +260,6 @@ def test_get_basic_selection_1d(): # noinspection PyStatementEffect def test_get_basic_selection_2d(): - # setup a = np.arange(10000, dtype=int).reshape(1000, 10) z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype) @@ -423,7 +418,6 @@ def test_fancy_indexing_doesnt_mix_with_implicit_slicing(): def test_set_basic_selection_0d(): - # setup v = np.array(42) a = np.zeros_like(v) @@ -479,7 +473,6 @@ def _test_get_orthogonal_selection(a, z, selection): # noinspection PyStatementEffect def test_get_orthogonal_selection_1d_bool(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -502,7 +495,6 @@ def test_get_orthogonal_selection_1d_bool(): # noinspection PyStatementEffect def test_get_orthogonal_selection_1d_int(): - # setup a = np.arange(1050, dtype=int) z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype) @@ -561,7 +553,6 @@ def _test_get_orthogonal_selection_2d(a, z, ix0, ix1): # noinspection 
diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py
index 8a34c1e715..a3afc101c5 100644
--- a/zarr/tests/test_indexing.py
+++ b/zarr/tests/test_indexing.py
@@ -17,7 +17,6 @@
 def test_normalize_integer_selection():
-
     assert 1 == normalize_integer_selection(1, 100)
     assert 99 == normalize_integer_selection(-1, 100)
     with pytest.raises(IndexError):
@@ -29,7 +28,6 @@ def test_normalize_integer_selection():

 def test_replace_ellipsis():
-
     # 1D, single item
     assert (0,) == replace_ellipsis(0, (100,))
@@ -68,7 +66,6 @@ def test_replace_ellipsis():

 def test_get_basic_selection_0d():
-
     # setup
     a = np.array(42)
     z = zarr.create(shape=a.shape, dtype=a.dtype, fill_value=None)
@@ -191,7 +188,6 @@ def _test_get_basic_selection(a, z, selection):

 # noinspection PyStatementEffect
 def test_get_basic_selection_1d():
-
     # setup
     a = np.arange(1050, dtype=int)
     z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype)
@@ -264,7 +260,6 @@ def test_get_basic_selection_1d():

 # noinspection PyStatementEffect
 def test_get_basic_selection_2d():
-
     # setup
     a = np.arange(10000, dtype=int).reshape(1000, 10)
     z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype)
@@ -423,7 +418,6 @@ def test_fancy_indexing_doesnt_mix_with_implicit_slicing():

 def test_set_basic_selection_0d():
-
     # setup
     v = np.array(42)
     a = np.zeros_like(v)
@@ -479,7 +473,6 @@ def _test_get_orthogonal_selection(a, z, selection):

 # noinspection PyStatementEffect
 def test_get_orthogonal_selection_1d_bool():
-
     # setup
     a = np.arange(1050, dtype=int)
     z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype)
@@ -502,7 +495,6 @@ def test_get_orthogonal_selection_1d_bool():

 # noinspection PyStatementEffect
 def test_get_orthogonal_selection_1d_int():
-
     # setup
     a = np.arange(1050, dtype=int)
     z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype)
@@ -561,7 +553,6 @@ def _test_get_orthogonal_selection_2d(a, z, ix0, ix1):

 # noinspection PyStatementEffect
 def test_get_orthogonal_selection_2d():
-
     # setup
     a = np.arange(10000, dtype=int).reshape(1000, 10)
     z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype)
@@ -570,7 +561,6 @@ def test_get_orthogonal_selection_2d():
     np.random.seed(42)
     # test with different degrees of sparseness
     for p in 0.5, 0.1, 0.01:
-
         # boolean arrays
         ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool)
         ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool)
@@ -641,7 +631,6 @@ def _test_get_orthogonal_selection_3d(a, z, ix0, ix1, ix2):

 def test_get_orthogonal_selection_3d():
-
     # setup
     a = np.arange(100000, dtype=int).reshape(200, 50, 10)
     z = zarr.create(shape=a.shape, chunks=(60, 20, 3), dtype=a.dtype)
@@ -650,7 +639,6 @@ def test_get_orthogonal_selection_3d():
     np.random.seed(42)
     # test with different degrees of sparseness
     for p in 0.5, 0.1, 0.01:
-
         # boolean arrays
         ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool)
         ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool)
@@ -673,7 +661,6 @@ def test_get_orthogonal_selection_3d():

 def test_orthogonal_indexing_edge_cases():
-
     a = np.arange(6).reshape(1, 2, 3)
     z = zarr.create(shape=a.shape, chunks=(1, 2, 3), dtype=a.dtype)
     z[:] = a
@@ -706,7 +693,6 @@ def _test_set_orthogonal_selection(v, a, z, selection):

 def test_set_orthogonal_selection_1d():
-
     # setup
     v = np.arange(1050, dtype=int)
     a = np.empty(v.shape, dtype=int)
@@ -715,7 +701,6 @@ def test_set_orthogonal_selection_1d():
     # test with different degrees of sparseness
     np.random.seed(42)
     for p in 0.5, 0.1, 0.01:
-
         # boolean arrays
         ix = np.random.binomial(1, p, size=a.shape[0]).astype(bool)
         _test_set_orthogonal_selection(v, a, z, ix)
@@ -734,7 +719,6 @@ def test_set_orthogonal_selection_1d():

 def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1):
-
     selections = [
         # index both axes with array
         (ix0, ix1),
@@ -749,7 +733,6 @@ def _test_set_orthogonal_selection_2d(v, a, z, ix0, ix1):

 def test_set_orthogonal_selection_2d():
-
     # setup
     v = np.arange(10000, dtype=int).reshape(1000, 10)
     a = np.empty_like(v)
@@ -758,7 +741,6 @@ def test_set_orthogonal_selection_2d():
     np.random.seed(42)
     # test with different degrees of sparseness
     for p in 0.5, 0.1, 0.01:
-
         # boolean arrays
         ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool)
         ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool)
@@ -780,7 +762,6 @@ def test_set_orthogonal_selection_2d():

 def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2):
-
     selections = (
         # single value
         (84, 42, 4),
@@ -807,7 +788,6 @@ def _test_set_orthogonal_selection_3d(v, a, z, ix0, ix1, ix2):

 def test_set_orthogonal_selection_3d():
-
     # setup
     v = np.arange(100000, dtype=int).reshape(200, 50, 10)
     a = np.empty_like(v)
@@ -816,7 +796,6 @@ def test_set_orthogonal_selection_3d():
     np.random.seed(42)
     # test with different degrees of sparseness
     for p in 0.5, 0.1, 0.01:
-
         # boolean arrays
         ix0 = np.random.binomial(1, p, size=a.shape[0]).astype(bool)
         ix1 = np.random.binomial(1, 0.5, size=a.shape[1]).astype(bool)
@@ -888,7 +867,6 @@ def _test_get_coordinate_selection(a, z, selection):

 # noinspection PyStatementEffect
 def test_get_coordinate_selection_1d():
-
     # setup
     a = np.arange(1050, dtype=int)
     z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype)
@@ -932,7 +910,6 @@ def test_get_coordinate_selection_1d():

 def test_get_coordinate_selection_2d():
-
     # setup
     a = np.arange(10000, dtype=int).reshape(1000, 10)
     z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype)
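
The orthogonal-selection tests above all drive the same API: each axis is indexed independently, as with NumPy's np.ix_, rather than coordinate-wise fancy indexing. A minimal sketch of what the `_test_get_orthogonal_selection` helpers assert, with array values and chunk shapes chosen arbitrarily:

```python
import numpy as np
import zarr

a = np.arange(10000, dtype=int).reshape(1000, 10)
z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype)
z[:] = a

# Orthogonal (outer) selection picks rows and columns independently,
# mirroring NumPy's np.ix_ semantics.
ix0 = np.random.binomial(1, 0.1, size=a.shape[0]).astype(bool)
ix1 = [0, 3, 7]
assert np.array_equal(a[np.ix_(ix0, ix1)], z.get_orthogonal_selection((ix0, ix1)))
# z.oindex[ix0, ix1] is an equivalent spelling of the same selection.
```
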
@@ -1027,7 +1004,6 @@ def test_set_coordinate_selection_1d():

 def test_set_coordinate_selection_2d():
-
     # setup
     v = np.arange(10000, dtype=int).reshape(1000, 10)
     a = np.empty_like(v)
@@ -1258,7 +1234,6 @@ def _test_get_mask_selection(a, z, selection):

 # noinspection PyStatementEffect
 def test_get_mask_selection_1d():
-
     # setup
     a = np.arange(1050, dtype=int)
     z = zarr.create(shape=a.shape, chunks=100, dtype=a.dtype)
@@ -1285,7 +1260,6 @@ def test_get_mask_selection_1d():

 # noinspection PyStatementEffect
 def test_get_mask_selection_2d():
-
     # setup
     a = np.arange(10000, dtype=int).reshape(1000, 10)
     z = zarr.create(shape=a.shape, chunks=(300, 3), dtype=a.dtype)
@@ -1318,7 +1292,6 @@ def _test_set_mask_selection(v, a, z, selection):

 def test_set_mask_selection_1d():
-
     # setup
     v = np.arange(1050, dtype=int)
     a = np.empty_like(v)
@@ -1338,7 +1311,6 @@ def test_set_mask_selection_1d():

 def test_set_mask_selection_2d():
-
     # setup
     v = np.arange(10000, dtype=int).reshape(1000, 10)
     a = np.empty_like(v)
@@ -1352,7 +1324,6 @@ def test_set_mask_selection_2d():

 def test_get_selection_out():
-
     # basic selections
     a = np.arange(1050)
     z = zarr.create(shape=1050, chunks=100, dtype=a.dtype)
@@ -1426,7 +1397,6 @@ def test_get_selection_out():

 def test_get_selections_with_fields():
-
     a = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)]
     a = np.array(a, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")])
     z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None)
@@ -1444,7 +1414,6 @@ def test_get_selections_with_fields():
     ]

     for fields in fields_fixture:
-
         # total selection
         expect = a[fields]
         actual = z.get_basic_selection(Ellipsis, fields=fields)
@@ -1534,7 +1503,6 @@ def test_get_selections_with_fields():

 def test_set_selections_with_fields():
-
     v = [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)]
     v = np.array(v, dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")])
     a = np.empty_like(v)
@@ -1553,7 +1521,6 @@ def test_set_selections_with_fields():
     ]

     for fields in fields_fixture:
-
         # currently multi-field assignment is not supported in numpy, so we won't support
         # it either
         if isinstance(fields, list) and len(fields) > 1:
@@ -1567,7 +1534,6 @@ def test_set_selections_with_fields():
             z.set_mask_selection([True, False, True], v, fields=fields)

         else:
-
             if isinstance(fields, list) and len(fields) == 1:
                 # work around numpy does not support multi-field assignment even if there
                 # is only one field
@@ -1666,7 +1632,7 @@ def test_set_selections_with_fields():
         ),
         (
             (slice(0, 10, 1),),
-            np.arange(0, 10).reshape((10)),
+            np.arange(0, 10).reshape(10),
             [(0, 10, (slice(0, 10, 1),))],
         ),
         ((0,), np.arange(0, 100).reshape((10, 10)), [(0, 10, (slice(0, 1, 1),))]),
@@ -1678,7 +1644,7 @@ def test_set_selections_with_fields():
             np.arange(0, 100).reshape((10, 10)),
             [(0, 1, (slice(0, 1, 1), slice(0, 1, 1)))],
         ),
-        ((0,), np.arange(0, 10).reshape((10)), [(0, 1, (slice(0, 1, 1),))]),
+        ((0,), np.arange(0, 10).reshape(10), [(0, 1, (slice(0, 1, 1),))]),
         pytest.param(
             (slice(5, 8, 1), slice(2, 4, 1), slice(0, 5, 1)),
             np.arange(2, 100002).reshape((10, 1, 10000)),
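
The fields fixtures above read and write structured dtypes one named member at a time. A condensed sketch of the read side, reusing the same toy records as the test:

```python
import numpy as np
import zarr

a = np.array(
    [("aaa", 1, 4.2), ("bbb", 2, 8.4), ("ccc", 3, 12.6)],
    dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")],
)
z = zarr.create(shape=a.shape, chunks=2, dtype=a.dtype, fill_value=None)
z[:] = a

# fields narrows a selection to one member of the structured dtype ...
assert np.array_equal(a["bar"], z.get_basic_selection(Ellipsis, fields="bar"))

# ... or, with a list of names, to a structured view of several members.
assert np.array_equal(
    a[["foo", "baz"]], z.get_basic_selection(Ellipsis, fields=["foo", "baz"])
)
```
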
@@ -1752,19 +1718,16 @@ def test_accessed_chunks(shape, chunks, ops):
     z = zarr.create(shape=shape, chunks=chunks, store=store)

     for ii, (optype, slices) in enumerate(ops):
-
         # Resolve the slices into the accessed chunks for each dimension
-        chunks_per_dim = []
-        for N, C, sl in zip(shape, chunks, slices):
-            chunk_ind = np.arange(N, dtype=int)[sl] // C
-            chunks_per_dim.append(np.unique(chunk_ind))
+        chunks_per_dim = [
+            np.unique(np.arange(N, dtype=int)[sl] // C) for N, C, sl in zip(shape, chunks, slices)
+        ]

         # Combine and generate the cartesian product to determine the chunks keys that
         # will be accessed
-        chunks_accessed = []
-        for comb in itertools.product(*chunks_per_dim):
-            chunks_accessed.append(".".join([str(ci) for ci in comb]))
-
+        chunks_accessed = (
+            ".".join([str(ci) for ci in comb]) for comb in itertools.product(*chunks_per_dim)
+        )
         counts_before = store.counter.copy()

         # Perform the operation
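
The rewritten test_accessed_chunks logic maps a selection to the v2 store keys it should touch: per axis, the indices hit by the slice are integer-divided by the chunk length and deduplicated, and the cartesian product of the per-axis chunk indices is joined with "." into keys. A worked example with a hypothetical shape and chunking:

```python
import itertools
import numpy as np

shape, chunks = (10, 10), (5, 5)
slices = (slice(0, 7), slice(3, 5))

# For each axis, take the indices selected by the slice, integer-divide by
# the chunk length, and deduplicate to get the chunk indices touched.
chunks_per_dim = [
    np.unique(np.arange(N, dtype=int)[sl] // C) for N, C, sl in zip(shape, chunks, slices)
]
# axis 0 touches chunks [0, 1]; axis 1 touches chunk [0]

# The cartesian product yields the v2 store keys that will be accessed.
keys = {".".join(str(ci) for ci in comb) for comb in itertools.product(*chunks_per_dim)}
assert keys == {"0.0", "1.0"}
```
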
diff --git a/zarr/tests/test_info.py b/zarr/tests/test_info.py
index 7fb6feb11b..96eae999f4 100644
--- a/zarr/tests/test_info.py
+++ b/zarr/tests/test_info.py
@@ -7,7 +7,6 @@
 @pytest.mark.parametrize("array_size", [10, 15000])
 def test_info(array_size):
-
     # setup
     g = zarr.group(store=dict(), chunk_store=dict(), synchronizer=zarr.ThreadSynchronizer())
     g.create_group("foo")
diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py
index db50560c8e..f9010d6788 100644
--- a/zarr/tests/test_meta.py
+++ b/zarr/tests/test_meta.py
@@ -34,7 +34,6 @@ def assert_json_equal(expect, actual):

 def test_encode_decode_array_1():
-
     meta = dict(
         shape=(100,),
         chunks=(10,),
@@ -45,19 +44,16 @@ def test_encode_decode_array_1():
         order="C",
     )

-    meta_json = (
-        """{
+    meta_json = f"""{{
         "chunks": [10],
-        "compressor": {"id": "zlib", "level": 1},
+        "compressor": {{"id": "zlib", "level": 1}},
         "dtype": "
[...]
diff --git a/zarr/util.py b/zarr/util.py
@@ ... @@ def normalize_dtype(dtype: Union[str, np.dtype], object_codec) -> Tupl
 def normalize_dtype(dtype: Union[str, np.dtype], object_codec) -> Tuple[np.dtype, Any]:
-
     # convenience API for object arrays
     if inspect.isclass(dtype):
-        dtype = dtype.__name__  # type: ignore
+        dtype = dtype.__name__
     if isinstance(dtype, str):
         # allow ':' to delimit class from codec arguments
         tokens = dtype.split(":")
@@ -198,12 +198,11 @@ def normalize_dtype(dtype: Union[str, np.dtype], object_codec) -> Tuple[np.dtype
             args = []
         try:
             object_codec = codec_registry[codec_id](*args)
-        except KeyError:  # pragma: no cover
+        except KeyError as e:  # pragma: no cover
             raise ValueError(
-                "codec %r for object type %r is not "
-                "available; please provide an "
-                "object_codec manually" % (codec_id, key)
-            )
+                f"codec {codec_id!r} for object type {key!r} is not "
+                f"available; please provide an object_codec manually"
+            ) from e
         return dtype, object_codec

     dtype = np.dtype(dtype)
@@ -235,17 +234,25 @@ def is_total_slice(item, shape: Tuple[int]) -> bool:
     if isinstance(item, tuple):
         return all(
             (
-                isinstance(it, slice)
-                and ((it == slice(None)) or ((it.stop - it.start == sh) and (it.step in [1, None])))
+                (
+                    isinstance(it, slice)
+                    and (
+                        (it == slice(None))
+                        or ((it.stop - it.start == sh) and (it.step in [1, None]))
+                    )
+                )
+                # The only scalar edge case, indexing with int 0 along a size-1 dimension
+                # is identical to a total slice
+                # https://github.com/zarr-developers/zarr-python/issues/1730
+                or (isinstance(it, int) and it == 0 and sh == 1)
             )
             for it, sh in zip(item, shape)
         )
     else:
-        raise TypeError("expected slice or tuple of slices, found %r" % item)
+        raise TypeError(f"expected slice or tuple of slices, found {item!r}")


 def normalize_resize_args(old_shape, *args):
-
     # normalize new shape argument
     if len(args) == 1:
         new_shape = args[0]
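
The is_total_slice change above admits one scalar edge case: integer 0 along a size-1 axis selects that whole axis, so such selections can take the whole-chunk fast path (see the linked issue #1730). A sketch of the resulting behaviour, assuming the function remains importable as zarr.util.is_total_slice:

```python
from zarr.util import is_total_slice

# A tuple of bare slice(None) has always counted as a total selection.
assert is_total_slice((slice(None), slice(None)), (2, 2))

# New edge case: int 0 along a size-1 axis covers that whole axis, so a
# selection like z[0] on a (1, 5) array qualifies for whole-chunk writes.
assert is_total_slice((0, slice(0, 5, 1)), (1, 5))
```
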
@@ -266,35 +273,34 @@ def normalize_resize_args(old_shape, *args):

 def human_readable_size(size) -> str:
     if size < 2**10:
-        return "%s" % size
+        return f"{size}"
     elif size < 2**20:
-        return "%.1fK" % (size / float(2**10))
+        return f"{size / float(2**10):.1f}K"
     elif size < 2**30:
-        return "%.1fM" % (size / float(2**20))
+        return f"{size / float(2**20):.1f}M"
     elif size < 2**40:
-        return "%.1fG" % (size / float(2**30))
+        return f"{size / float(2**30):.1f}G"
     elif size < 2**50:
-        return "%.1fT" % (size / float(2**40))
+        return f"{size / float(2**40):.1f}T"
     else:
-        return "%.1fP" % (size / float(2**50))
+        return f"{size / float(2**50):.1f}P"


 def normalize_order(order: str) -> str:
     order = str(order).upper()
     if order not in ["C", "F"]:
-        raise ValueError("order must be either 'C' or 'F', found: %r" % order)
+        raise ValueError(f"order must be either 'C' or 'F', found: {order!r}")
     return order


-def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]:
+def normalize_dimension_separator(sep: Optional[str]) -> Optional[DIMENSION_SEPARATOR]:
     if sep in (".", "/", None):
-        return sep
+        return cast(Optional[DIMENSION_SEPARATOR], sep)
     else:
-        raise ValueError("dimension_separator must be either '.' or '/', found: %r" % sep)
+        raise ValueError(f"dimension_separator must be either '.' or '/', found: {sep!r}")


 def normalize_fill_value(fill_value, dtype: np.dtype):
-
     if fill_value is None or dtype.hasobject:
         # no fill value
         pass
@@ -309,8 +315,8 @@ def normalize_fill_value(fill_value, dtype: np.dtype):

         if not isinstance(fill_value, str):
             raise ValueError(
-                "fill_value {!r} is not valid for dtype {}; must be a "
-                "unicode string".format(fill_value, dtype)
+                f"fill_value {fill_value!r} is not valid for dtype {dtype}; "
+                f"must be a unicode string"
             )

     else:
@@ -324,15 +330,14 @@ def normalize_fill_value(fill_value, dtype: np.dtype):
         except Exception as e:
             # re-raise with our own error message to be helpful
             raise ValueError(
-                "fill_value {!r} is not valid for dtype {}; nested "
-                "exception: {}".format(fill_value, dtype, e)
-            )
+                f"fill_value {fill_value!r} is not valid for dtype {dtype}; "
+                f"nested exception: {e}"
+            ) from e

     return fill_value


 def normalize_storage_path(path: Union[str, bytes, None]) -> str:
-
     # handle bytes
     if isinstance(path, bytes):
         path = str(path, "ascii")
@@ -342,7 +347,6 @@ def normalize_storage_path(path: Union[str, bytes, None]) -> str:
         path = str(path)

     if path:
-
         # convert backslash to forward slash
         path = path.replace("\\", "/")
@@ -400,10 +404,10 @@ def info_html_report(items) -> str:
     report += "<tbody>"
     for k, v in items:
         report += (
-            "<tr>"
-            '<th style="text-align: left">%s</th>'
-            '<td style="text-align: left">%s</td>'
-            "</tr>" % (k, v)
+            f"<tr>"
+            f'<th style="text-align: left">{k}</th>'
+            f'<td style="text-align: left">{v}</td>'
+            f"</tr>"
         )
     report += "</tbody>"
     report += "</table>"
@@ -413,14 +417,13 @@ def info_html_report(items) -> str:
 class InfoReporter:
     def __init__(self, obj):
         self.obj = obj
+        self.items = self.obj.info_items()

     def __repr__(self):
-        items = self.obj.info_items()
-        return info_text_report(items)
+        return info_text_report(self.items)

     def _repr_html_(self):
-        items = self.obj.info_items()
-        return info_html_report(items)
+        return info_html_report(self.items)


 class TreeNode:
@@ -439,7 +442,7 @@ def get_children(self):
     def get_text(self):
         name = self.obj.name.split("/")[-1] or "/"
         if hasattr(self.obj, "shape"):
-            name += " {} {}".format(self.obj.shape, self.obj.dtype)
+            name += f" {self.obj.shape} {self.obj.dtype}"
         return name

     def get_type(self):
@@ -467,7 +470,7 @@ def tree_get_icon(stype: str) -> str:
     elif stype == "Group":
         return tree_group_icon
     else:
-        raise ValueError("Unknown type: %s" % stype)
+        raise ValueError(f"Unknown type: {stype}")


 def tree_widget_sublist(node, root=False, expand=False):
@@ -489,13 +492,13 @@
 def tree_widget(group, expand, level):
     try:
         import ipytree
-    except ImportError as error:
+    except ImportError as e:
         raise ImportError(
-            "{}: Run `pip install zarr[jupyter]` or `conda install ipytree`"
-            "to get the required ipytree dependency for displaying the tree "
-            "widget. If using jupyterlab<3, you also need to run "
-            "`jupyter labextension install ipytree`".format(error)
-        )
+            f"{e}: Run `pip install zarr[jupyter]` or `conda install ipytree`"
+            f"to get the required ipytree dependency for displaying the tree "
+            f"widget. If using jupyterlab<3, you also need to run "
+            f"`jupyter labextension install ipytree`"
+        ) from e

     result = ipytree.Tree()
     root = TreeNode(group, level=level)
@@ -506,7 +509,6 @@ def tree_widget(group, expand, level):

 class TreeViewer:
     def __init__(self, group, expand=False, level=None):
-
         self.group = group
         self.expand = expand
         self.level = level
@@ -554,14 +556,10 @@ def _repr_mimebundle_(self, **kwargs):

 def check_array_shape(param, array, shape):
     if not hasattr(array, "shape"):
-        raise TypeError(
-            "parameter {!r}: expected an array-like object, got {!r}".format(param, type(array))
-        )
+        raise TypeError(f"parameter {param!r}: expected an array-like object, got {type(array)!r}")
     if array.shape != shape:
         raise ValueError(
-            "parameter {!r}: expected array with shape {!r}, got {!r}".format(
-                param, shape, array.shape
-            )
+            f"parameter {param!r}: expected array with shape {shape!r}, got {array.shape!r}"
         )
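
normalize_dimension_separator now returns Optional[DIMENSION_SEPARATOR] and casts, because a runtime membership test does not narrow a plain str for the type checker. The alias itself is defined outside this hunk, so the sketch below treats its exact spelling as an assumption (a Literal of the two separators):

```python
from typing import Literal, Optional, cast

# Presumed definition of the alias used in the hunk above; the real one
# lives elsewhere in the package.
DIMENSION_SEPARATOR = Literal[".", "/"]

def normalize_dimension_separator(sep: Optional[str]) -> Optional[DIMENSION_SEPARATOR]:
    if sep in (".", "/", None):
        # The runtime check proves sep is ".", "/", or None, but the type
        # checker cannot see that, hence the explicit cast.
        return cast(Optional[DIMENSION_SEPARATOR], sep)
    raise ValueError(f"dimension_separator must be either '.' or '/', found: {sep!r}")
```
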