diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000000..f867eb9250
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,60 @@
+name: Benchmark
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  benchmark:
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash -e {0} # -e to fail on error
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.12"]
+        os: [ubuntu-latest]
+
+    env:
+      OS: ${{ matrix.os }}
+      PYTHON: ${{ matrix.python }}
+      ASV_DIR: "./benchmarks"
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          filter: blob:none
+
+      - name: Fetch main branch so `asv run` can resolve its hash
+        run: git fetch origin main:main
+        if: ${{ github.ref_name != 'main' }}
+
+      - name: Set up Python ${{ matrix.python }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+          cache: 'pip'
+
+      - name: Cache datasets
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache
+          key: benchmark-state-${{ hashFiles('benchmarks/**') }}
+
+      - name: Install dependencies
+        run: pip install asv
+
+      - name: Configure ASV
+        working-directory: ${{ env.ASV_DIR }}
+        run: asv machine --yes
+
+      - name: Quick benchmark run
+        working-directory: ${{ env.ASV_DIR }}
+        # HEAD^! selects just the checked-out commit for `asv run`
+        run: asv run --dry-run --quick --show-stderr --verbose HEAD^!
diff --git a/.github/workflows/check-pr.yml b/.github/workflows/check-pr.yml
index e74c041c8b..88d78c9b43 100644
--- a/.github/workflows/check-pr.yml
+++ b/.github/workflows/check-pr.yml
@@ -45,7 +45,10 @@ jobs:
     needs: check-milestone
     if: ${{ needs.check-milestone.outputs.no-relnotes-reason == '' && !contains(github.event.pull_request.labels.*.name, 'Development Process 🚀') }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          filter: blob:none
       - name: Find out if relevant release notes are modified
         uses: dorny/paths-filter@v2
         id: changes
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index a1e7041c4a..1762ed21eb 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -11,8 +11,11 @@ jobs:
     permissions:
       id-token: write # to authenticate as Trusted Publisher to pypi.org
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          filter: blob:none
+      - uses: actions/setup-python@v5
         with:
           python-version: "3.x"
           cache: "pip"
diff --git a/.gitignore b/.gitignore
index 655db58f4c..ca144c6641 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,7 @@ Thumbs.db
 # IDEs and editors
 /.idea/
 /.vscode/
+
+# asv benchmark files
+/benchmarks/.asv
+/benchmarks/data/
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000000..f186cfa876
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,22 @@
+# Scanpy Benchmarks
+
+This directory contains code for benchmarking Scanpy using [asv][].
+
+The functionality is checked using the [`benchmark.yml`][] workflow.
+Benchmarks are run using the [benchmark bot][].
+
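+## Running the benchmarks locally
+
+A minimal sketch of a local run (assuming `asv` is installed and the
+commands are run from this directory):
+
+```shell
+asv machine --yes  # record machine metadata once
+asv run            # benchmark the tip of the configured branch (main)
+asv publish        # render the results to .asv/html
+asv preview        # serve the generated report locally
+```
+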
+ "version": 1, + + // The name of the project being benchmarked + "project": "scanpy", + + // The project's homepage + "project_url": "https://scanpy.readthedocs.io/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": [ + "python -m pip install build", + "python -m build --wheel -o {build_cache_dir} {build_dir}", + ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + "branches": ["main"], // for git + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/scverse/scanpy/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.9", "3.12"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + "matrix": { + "numpy": [""], + // "scipy": ["1.2", ""], + "scipy": [""], + "h5py": [""], + "natsort": [""], + "pandas": [""], + "memory_profiler": [""], + "zarr": [""], + "pytest": [""], + "scanpy": [""], + "python-igraph": [""], + // "psutil": [""] + }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. 
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time. This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions. The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //     "some_benchmark": "352cdf",  // Consider regressions only after this commit
+    //     "another_benchmark": null,   // Skip regression detection altogether
+    // },
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds. If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //     "some_benchmark": 0.01,   // Threshold of 1%
+    //     "another_benchmark": 0.5, // Threshold of 50%
+    // },
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/benchmarks/benchmarks/preprocessing.py b/benchmarks/benchmarks/preprocessing.py
new file mode 100644
index 0000000000..1feeb0a8a5
--- /dev/null
+++ b/benchmarks/benchmarks/preprocessing.py
@@ -0,0 +1,103 @@
+"""
+This module will benchmark preprocessing operations in Scanpy
+API documentation: https://scanpy.readthedocs.io/en/stable/api/preprocessing.html
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import scanpy as sc
+
+from .utils import pbmc68k_reduced
+
+if TYPE_CHECKING:
+    from anndata import AnnData
+
+
+adata: AnnData
+
+
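+# asv runs this module-level `setup` before each benchmark below, and
+# `pbmc68k_reduced()` returns a fresh copy, so the benchmarks stay independent.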
+ // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/benchmarks/preprocessing.py b/benchmarks/benchmarks/preprocessing.py new file mode 100644 index 0000000000..1feeb0a8a5 --- /dev/null +++ b/benchmarks/benchmarks/preprocessing.py @@ -0,0 +1,101 @@ +""" +This module will benchmark preprocessing operations in Scanpy +API documentation: https://scanpy.readthedocs.io/en/stable/api/preprocessing.html +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import scanpy as sc + +from .utils import pbmc68k_reduced + +if TYPE_CHECKING: + from anndata import AnnData + + +adata: AnnData + + +def setup(): + global adata + adata = pbmc68k_reduced() + + +def time_calculate_qc_metrics(): + adata.var["mt"] = adata.var_names.str.startswith("MT-") + sc.pp.calculate_qc_metrics( + adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True + ) + + +def peakmem_calculate_qc_metrics(): + adata.var["mt"] = adata.var_names.str.startswith("MT-") + sc.pp.calculate_qc_metrics( + adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True + ) + + +def time_filter_cells(): + sc.pp.filter_cells(adata, min_genes=200) + + +def peakmem_filter_cells(): + sc.pp.filter_cells(adata, min_genes=200) + + +def time_filter_genes(): + sc.pp.filter_genes(adata, min_cells=3) + + +def peakmem_filter_genes(): + sc.pp.filter_genes(adata, min_cells=3) + + +def time_normalize_total(): + sc.pp.normalize_total(adata, target_sum=1e4) + + +def peakmem_normalize_total(): + sc.pp.normalize_total(adata, target_sum=1e4) + + +def time_log1p(): + sc.pp.log1p(adata) + + +def peakmem_time_log1p(): + sc.pp.log1p(adata) + + +def time_pca(): + sc.pp.pca(adata, svd_solver="arpack") + + +def peakmem_pca(): + sc.pp.pca(adata, svd_solver="arpack") + + +def time_highly_variable_genes(): + sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) + + +def peakmem_highly_variable_genes(): + sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) + + +def time_regress_out(): + sc.pp.regress_out(adata, ["n_counts", "percent_mito"]) + + +def peakmem_regress_out(): + sc.pp.regress_out(adata, ["n_counts", "percent_mito"]) + + +def time_scale(): + sc.pp.scale(adata, max_value=10) + + +def peakmem_scale(): + sc.pp.scale(adata, max_value=10) diff --git a/benchmarks/benchmarks/tools.py b/benchmarks/benchmarks/tools.py new file mode 100644 index 0000000000..d45756f02c --- /dev/null +++ b/benchmarks/benchmarks/tools.py @@ -0,0 +1,48 @@ +""" +This module will benchmark tool operations in Scanpy +API documentation: https://scanpy.readthedocs.io/en/stable/api/tools.html +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import scanpy as sc + +from .utils import pbmc68k_reduced + +if TYPE_CHECKING: + from anndata import AnnData + + +adata: AnnData + + +def setup(): + global adata + adata = pbmc68k_reduced() + assert "X_pca" in adata.obsm + + +def time_umap(): + sc.tl.umap(adata) + + +def peakmem_umap(): + sc.tl.umap(adata) + + +def time_diffmap(): + sc.tl.diffmap(adata) + + +def peakmem_diffmap(): + sc.tl.diffmap(adata) + + +def time_leiden(): + sc.tl.leiden(adata, flavor="igraph") + + +def peakmem_leiden(): + sc.tl.leiden(adata, flavor="igraph") diff --git a/benchmarks/benchmarks/utils.py 
+def pbmc68k_reduced():
+    global _pbmc68k_reduced
+    if _pbmc68k_reduced is None:
+        _pbmc68k_reduced = sc.datasets.pbmc68k_reduced()
+    return _pbmc68k_reduced.copy()
diff --git a/docs/release-notes/1.10.2.md b/docs/release-notes/1.10.2.md
index 0b09f59d7e..a1b21c1e42 100644
--- a/docs/release-notes/1.10.2.md
+++ b/docs/release-notes/1.10.2.md
@@ -1,5 +1,10 @@
 ### 1.10.2 {small}`the future`
 
+```{rubric} Development features
+```
+
+* Add performance benchmarking {pr}`2977` {smaller}`R Shrestha`, {smaller}`P Angerer`
+
 ```{rubric} Docs
 ```
 
diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py
index 1039ef896c..947ba67003 100644
--- a/scanpy/_utils/__init__.py
+++ b/scanpy/_utils/__init__.py
@@ -75,14 +75,21 @@ def __getattr__(self, attr: str):
         return getattr(self._rng, "normal" if attr == "gauss" else attr)
 
 
+def ensure_igraph() -> None:
+    if importlib.util.find_spec("igraph"):
+        return
+    raise ImportError(
+        "Please install the igraph package: "
+        "`conda install -c conda-forge python-igraph` or "
+        "`pip3 install igraph`."
+    )
+
+
 @contextmanager
 def set_igraph_random_state(random_state: int):
-    try:
-        import igraph
-    except ImportError:
-        raise ImportError(
-            "Please install igraph: `conda install -c conda-forge igraph` or `pip3 install igraph`."
-        )
+    ensure_igraph()
+    import igraph
+
     rng = RNGIgraph(random_state)
     try:
         igraph.set_random_number_generator(rng)
diff --git a/scanpy/tools/_leiden.py b/scanpy/tools/_leiden.py
index 47258a8696..9e2350fff2 100644
--- a/scanpy/tools/_leiden.py
+++ b/scanpy/tools/_leiden.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import importlib
 from typing import TYPE_CHECKING, Literal
 
 import numpy as np
@@ -42,7 +41,7 @@ def leiden(
     neighbors_key: str | None = None,
     obsp: str | None = None,
     copy: bool = False,
-    flavor: Literal["leidenalg", "ipgraph"] = "leidenalg",
+    flavor: Literal["leidenalg", "igraph"] = "leidenalg",
     **clustering_args,
 ) -> AnnData | None:
     """\
@@ -121,11 +120,7 @@
         raise ValueError(
             f"flavor must be either 'igraph' or 'leidenalg', but '{flavor}' was passed"
         )
-    igraph_spec = importlib.util.find_spec("igraph")
-    if igraph_spec is None:
-        raise ImportError(
-            "Please install the igraph package: `conda install -c conda-forge igraph` or `pip3 install igraph`."
-        )
+    _utils.ensure_igraph()
     if flavor == "igraph":
         if directed:
             raise ValueError(