diff --git a/.github/.codecov.yml b/.github/.codecov.yml index ad8e37d3..929c1bfb 100644 --- a/.github/.codecov.yml +++ b/.github/.codecov.yml @@ -2,4 +2,6 @@ # SPDX-License-Identifier: MIT ignore: + - "benchmarks/" + - "examples/" - "test/" diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 806f6830..6b7c9cff 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -201,13 +201,13 @@ jobs: test/scripts/hictk_dump_normalizations.sh build/src/hictk/hictk test/scripts/hictk_dump_cells.sh build/src/hictk/hictk - test/scripts/hictk_fix_mcool.sh build/src/hictk/hictk - test/scripts/hictk_dump_gw.sh build/src/hictk/hictk test/scripts/hictk_dump_cis.sh build/src/hictk/hictk test/scripts/hictk_dump_trans.sh build/src/hictk/hictk test/scripts/hictk_dump_balanced.sh build/src/hictk/hictk + test/scripts/hictk_fix_mcool.sh build/src/hictk/hictk + test/scripts/hictk_load_coo.sh build/src/hictk/hictk sorted test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted diff --git a/.gitignore b/.gitignore index fb2ab590..145e3a57 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ benchmark/data build/ cmake-build*/ conan-envs/ +docs/_build/ external/ scratch/ test/data diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..7202e8ba --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,22 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +version: 2 + +build: + os: ubuntu-22.04 + apt_packages: + - librsvg2-bin + tools: + python: "3.11" + +sphinx: + configuration: docs/conf.py + +python: + install: + - requirements: docs/requirements.txt + +formats: + - pdf diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e43db82..233e538d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,7 +152,7 @@ endif() if(HICTK_BUILD_EXAMPLES) message(STATUS "Building examples.") - # add_subdirectory(examples) + 
add_subdirectory(examples) endif() if(HICTK_BUILD_BENCHMARKS) diff --git a/README.md b/README.md index 0b112f3f..c50385d1 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,7 @@ Furthermore, the following tools are required: - Python3.6+ (including `pip`, required to install Conan) -We recommend to install CMake and Conan in a Python [virtualenv](https://virtualenvwrapper.readthedocs.io/en/stable/), but you are of course free to install the build dependencies in any way you want. +We recommend installing CMake and Conan in a Python [virtualenv](https://virtualenvwrapper.readthedocs.io/en/stable/), but you are of course free to install the build dependencies in any way you want. ```bash python3 -m venv /tmp/venv @@ -225,7 +225,7 @@ We highly recommend using the same compiler when running Conan and CMake. ## Running automated tests -Steps outlined in this section are optional but highly recommended. +The steps outlined in this section are optional but highly recommended. #### Unit tests @@ -264,7 +264,7 @@ __All tests are expected to pass. Do not ignore test failures!__
Troubleshooting test failures -If one or more test fail, try the following troubleshooting steps before reaching out for help. +If one or more tests fail, try the following troubleshooting steps before reaching out for help. 1. Make sure you are running `ctest` from the root of the source tree (`/tmp/hictk` if you are following the instructions). 2. Make sure you are passing the correct build folder to `--test-dir`. Pass the absolute path if necessary (i.e. `--test-dir=/tmp/hictk/build/` if you are following the instructions). diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..62070d48 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,21 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/assets/4dnucleome_bug_notice.avif b/docs/assets/4dnucleome_bug_notice.avif new file mode 100644 index 00000000..b1fcf737 Binary files /dev/null and b/docs/assets/4dnucleome_bug_notice.avif differ diff --git a/docs/assets/corrupted_mcool_example.tsv b/docs/assets/corrupted_mcool_example.tsv new file mode 100644 index 00000000..eba23b9d --- /dev/null +++ b/docs/assets/corrupted_mcool_example.tsv @@ -0,0 +1,12 @@ +chrom1 start1 end1 chrom2 start2 end2 count balanced +chr1 10828000 10830000 chr1 11002000 11004000 1 0.000208987 +chr1 10828000 10830000 chr1 11002000 11004000 1 0.000208987 +chr1 10828000 10830000 chr1 11006000 11008000 1 0.000199523 +chr1 10828000 10830000 chr1 11006000 11008000 3 0.000598569 +chr1 10828000 10830000 chr1 11010000 11012000 4 0.000695946 +chr1 10828000 10830000 chr1 11010000 11012000 2 0.000347973 +chr1 10828000 10830000 chr1 11020000 11022000 1 0.000219669 +chr1 10828000 10830000 chr1 11020000 11022000 1 0.000219669 +chr1 10828000 10830000 chr1 11030000 11032000 3 0.000499071 +chr1 10828000 10830000 chr1 11030000 11032000 2 0.000332714 +... ... ... ... ... ... ... ... diff --git a/docs/balancing_matrices.rst b/docs/balancing_matrices.rst new file mode 100644 index 00000000..0e484316 --- /dev/null +++ b/docs/balancing_matrices.rst @@ -0,0 +1,50 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Balancing Hi-C matrices +####################### + +``hictk`` supports balancing .hic, .cool and .mcool files using ICE (iterative correction and eigenvector decomposition). + +.. code-block:: console + + user@dev:/tmp$ hictk balance 4DNFIZ1ZVXC8.mcool::/resolutions/1000 + + [2023-10-01 13:18:02.119] [info]: Running hictk v0.0.2-f83f93e + [2023-10-01 13:18:02.130] [info]: Writing interactions to temporary file /tmp/4DNFIZ1ZVXC8.tmp0... + [2023-10-01 13:18:05.098] [info]: Initializing bias vector... 
+ [2023-10-01 13:18:05.099] [info]: Masking rows with fewer than 10 nnz entries... + [2023-10-01 13:18:06.298] [info]: Masking rows using mad_max=5... + [2023-10-01 13:18:06.971] [info]: Iteration 1: 36874560.192587376 + [2023-10-01 13:18:07.634] [info]: Iteration 2: 21347543.04950776 + [2023-10-01 13:18:08.307] [info]: Iteration 3: 7819314.542541969 + ... + [2023-10-01 13:19:20.365] [info]: Iteration 105: 2.1397932757529552e-05 + [2023-10-01 13:19:21.146] [info]: Iteration 106: 1.6604770462001875e-05 + [2023-10-01 13:19:21.870] [info]: Iteration 107: 1.2885285040054778e-05 + [2023-10-01 13:19:22.608] [info]: Iteration 108: 9.99900768769869e-06 + [2023-10-01 13:19:22.619] [info]: Writing weights to 4DNFIZ1ZVXC8.mcool::/resolutions/1000/bins/weight... + +When balancing files in .mcool or .hic formats, all resolutions are balanced. + +By default balancing coefficients are stored in the input file under the name of "weight". + +This can be changed by passing the desired name through the ``--name`` option. + +``hictk`` supports three balancing methods: + +* Using all (genome-wide) interactions (default) +* Using trans interactions only +* Using cis interactions only + +The balancing method can be changed through the ``--mode`` option (e.g. ``--mode=gw`` or ``--mode=cis``). + +When enough memory is available, ``hictk`` can be instructed to load all interactions into system memory by passing the ``--in-memory`` flag. This can dramatically speed up matrix balancing at the cost of potentially much higher memory usage (approximately 1 GB of RAM for every 40M interactions). + +Another way to improve performance is to increase the number of threads available for computation using the ``--threads`` option. +It should be noted that when using a large number of threads (e.g. more than 16) without the ``--in-memory`` option, performance is likely limited by disk throughput.
Thus, users are advised to use a large number of threads only when temporary data (``/tmp`` by default on most UNIX-like systems) is stored on a fast SSD. + +When the ``--in-memory`` option is not used, ``hictk`` will create a temporary file under the default temporary folder. This file stores interactions using a layout and compression that are optimized for the access pattern used by ``hictk balance``. When balancing large matrices, this file can be quite large (sometimes tens of GBs). If this is the case, it may be appropriate to change the temporary folder using the ``--tmpdir`` option. + +Finally, when balancing .hic files, ``hictk`` depends on `JuicerTools <https://github.com/aidenlab/Juicebox/releases/latest>`_ or `HiCTools <https://github.com/aidenlab/HiCTools/releases/latest>`_ to write balancing weights back to the file. Thus, when balancing .hic files, the JAR file to one of the tools should be specified through the ``--juicer-tools-jar`` option. You should use JuicerTools when balancing .hic files in .hic v8 format or older and HiCTools when balancing .hic v9 files. diff --git a/docs/cli_reference.rst b/docs/cli_reference.rst new file mode 100644 index 00000000..1c3a1f7a --- /dev/null +++ b/docs/cli_reference.rst @@ -0,0 +1,261 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +CLI Reference +############# + +.. code-block:: text + + + Blazing fast tools to work with .hic and .cool files. + Usage: hictk [OPTIONS] SUBCOMMAND + Options: + -h,--help Print this help message and exit + -V,--version Display program version information and exit + Subcommands: + balance Balance HiC matrices using ICE. + convert Convert HiC matrices to a different format. + dump Dump data from .hic and Cooler files to stdout. + fix-mcool Fix corrupted .mcool files. + load Build .cool files from interactions in various text formats. + merge Merge coolers. + validate Validate .hic and Cooler files. + zoomify Convert single-resolution Cooler file to multi-resolution by coarsening. + +hictk balance +------------- + +..
code-block:: text + + Balance HiC matrices using ICE. + Usage: hictk balance [OPTIONS] input + Positionals: + input TEXT:((HiC) OR (Cooler)) OR (Multires-cooler) REQUIRED + Path to the .hic, .cool or .mcool file to be balanced. + Options: + -h,--help Print this help message and exit + --mode TEXT:{gw,trans,cis} [gw] + Balance matrix using: + - genome-wide interactions (gw) + - trans-only interactions (trans) + - cis-only interactions (cis) + --tmpdir TEXT [/tmp] Path to a folder where to store temporary data. + --ignore-diags UINT [2] Number of diagonals (including the main diagonal) to mask before balancing. + --mad-max FLOAT:NONNEGATIVE [5] + Mask bins using the MAD-max filter. + bins whose log marginal sum is less than --mad-max median + absolute deviations below the median log marginal sum of + all the bins in the same chromosome. + --min-nnz UINT [10] Mask rows with fewer than --min-nnz non-zero entries. + --min-count UINT [0] Mask rows with fewer than --min-count interactions. + --tolerance FLOAT:NONNEGATIVE [1e-05] + Threshold of the variance of marginals used to determine whether + the algorithm has converged. + --max-iters UINT:POSITIVE [500] + Maximum number of iterations. + --rescale-weights,--no-rescale-weights{false} + Rescale weights such that rows sum approximately to 2. + --name TEXT [weight] Name to use when writing weights to file. + --in-memory Store all interactions in memory (greatly improves performance). + --stdout Write balancing weights to stdout instead of writing them to the input file. + --chunk-size UINT:POSITIVE [10000000] + Number of interactions to process at once. Ignored when using --in-memory. + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. + -t,--threads UINT:UINT in [1 - 16] [1] + Maximum number of parallel threads to spawn. + -l,--compression-level UINT:INT in [0 - 19] [] + Compression level used to compress temporary files using ZSTD. 
+ --juicer-tools-jar TEXT:FILE + Path to juicer_tools or hic_tools JAR. + --juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [256MB] + Max heap size used by juicer_tools. + -f,--force Overwrite existing files and datasets (if any). + +hictk convert +------------- + +.. code-block:: text + + Convert HiC matrices to a different format. + Usage: hictk convert [OPTIONS] input output + Positionals: + input TEXT:((HiC) OR (Cooler)) OR (Multires-cooler) REQUIRED + Path to the .hic, .cool or .mcool file to be converted. + output TEXT REQUIRED Output path. File extension is used to infer output format. + Options: + -h,--help Print this help message and exit + --output-fmt TEXT:{cool,mcool,hic} [auto] + Output format (by default this is inferred from the output file extension). + Should be one of: + - cool + - mcool + - hic + -j,--juicer-tools-jar TEXT:FILE + Path to juicer_tools or hic_tools JAR. + -r,--resolutions UINT:POSITIVE ... + One or more resolutions to be converted. By default all resolutions are converted. + --normalization-methods TEXT [ALL] ... + Name of one or more normalization methods to be copied. + By default, vectors for all known normalization methods are copied. + --fail-if-norm-not-found Fail if any of the requested normalization vectors are missing. + -g,--genome TEXT Genome assembly name. By default this is copied from the .hic file metadata. + --juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [32GB] + Max heap size used by juicer_tools. Only used when converting from cool to hic + --tmpdir TEXT Path where to store temporary files. + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. + -t,--threads UINT:UINT in [2 - 16] [2] + Maximum number of parallel threads to spawn. + When converting from hic to cool, only two threads will be used. + -l,--compression-level UINT:INT in [0 - 9] [] + Compression level used to compress temporary files. + Pass 0 to disable compression. 
+ -f,--force Overwrite existing files (if any). + +hictk dump +---------- + +.. code-block:: text + + Dump data from .hic and Cooler files to stdout. + Usage: hictk dump [OPTIONS] uri + Positionals: + uri TEXT:(((HiC) OR (Cooler)) OR (Multires-cooler)) OR (Single-cell-cooler) REQUIRED + Path to a .hic, .cool or .mcool file (Cooler URI syntax supported). + Options: + -h,--help Print this help message and exit + --resolution UINT:NONNEGATIVE + HiC matrix resolution (ignored when file is not in .hic format). + --matrix-type ENUM:value in {expected->2,observed->0,oe->1} OR {2,0,1} [observed] + Matrix type (ignored when file is not in .hic format). + --matrix-unit ENUM:value in {BP->0,FRAG->1} OR {0,1} [BP] + Matrix unit (ignored when file is not in .hic format). + -t,--table TEXT:{chroms,bins,pixels,normalizations,resolutions,cells} [pixels] + Name of the table to dump. + -r,--range TEXT [all] Excludes: --query-file + Coordinates of the genomic regions to be dumped following UCSC-style notation (chr1:0-1000). + --range2 TEXT [all] Needs: --range Excludes: --query-file + Coordinates of the genomic regions to be dumped following UCSC-style notation (chr1:0-1000). + --query-file TEXT:(FILE) OR ({-}) Excludes: --range --range2 + Path to a BEDPE file with the list of coordinates to be fetched (pass - to read queries from stdin). + -b,--balance TEXT [NONE] Balance interactions using the given method. + --sorted,--unsorted{false} Return interactions in ascending order. + --join,--no-join{false} Output pixels in BG2 format. + --weight-type TEXT:{infer,divisive,multiplicative} [infer] + How balancing weights should be applied to raw interactions (ignored when file is in .hic format). + +hictk fix-mcool +--------------- + +.. code-block:: text + + Fix corrupted .mcool files. + Usage: hictk fix-mcool [OPTIONS] input output + Positionals: + input TEXT:Multires-cooler REQUIRED + Path to a corrupted .mcool file. + output TEXT REQUIRED Path where to store the restored .mcool. 
+ Options: + -h,--help Print this help message and exit + --tmpdir TEXT [/tmp] Path to a folder where to store temporary data. + --skip-balancing Do not recompute or copy balancing weights. + --check-base-resolution Check whether the base resolution is corrupted. + --in-memory Store all interactions in memory while balancing (greatly improves performance). + --chunk-size UINT:POSITIVE [10000000] + Number of interactions to process at once during balancing. + Ignored when using --in-memory. + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. + -t,--threads UINT:UINT in [1 - 16] [1] + Maximum number of parallel threads to spawn (only applies to the balancing stage). + -l,--compression-level UINT:INT in [0 - 19] [] + Compression level used to compress temporary files using ZSTD (only applies to the balancing stage). + -f,--force Overwrite existing files (if any). + +hictk load +---------- + +.. code-block:: text + + Build .cool files from interactions in various text formats. + Usage: hictk load [OPTIONS] chrom-sizes bin-size output-uri + Positionals: + chrom-sizes TEXT:FILE REQUIRED + Path to .chrom.sizes file. + bin-size UINT:POSITIVE REQUIRED + Bin size (bp). + output-uri TEXT REQUIRED Path to output Cooler (URI syntax supported). + Options: + -h,--help Print this help message and exit + -f,--format TEXT:{4dn,validpairs,bg2,coo} REQUIRED + Input format. + --force Force overwrite existing output file(s). + --assembly TEXT [unknown] Assembly name. + --count-as-float Interactions are floats. + --assume-sorted,--assume-unsorted{false} + Assume input files are already sorted. + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. + --batch-size UINT [20000000] + Number of pixels to buffer in memory. Only used when processing unsorted interactions or pairs + +hictk merge +----------- + +.. code-block:: text + + Merge coolers. + Usage: hictk merge [OPTIONS] input-coolers... 
+ Positionals: + input-coolers TEXT:Cooler x 2 REQUIRED + Path to two or more Cooler files to be merged (URI syntax supported). + Options: + -h,--help Print this help message and exit + -o,--output-cooler TEXT Output Cooler (URI syntax supported). + When not specified, merged interactions will be printed to stdout. + -f,--force Force overwrite output cooler. + --chunk-size UINT [5000000] + Number of pixels to store in memory before writing to disk. + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. + +hictk validate +-------------- + +.. code-block:: text + + Validate .hic and Cooler files. + Usage: hictk validate [OPTIONS] uri + Positionals: + uri TEXT REQUIRED Path to a .hic or .[ms]cool file (Cooler URI syntax supported). + Options: + -h,--help Print this help message and exit + --validate-index Validate Cooler index (may take a long time). + --quiet Don't print anything to stdout. Success/failure is reported through exit codes + +hictk zoomify +------------- + +.. code-block:: text + + Convert single-resolution Cooler file to multi-resolution by coarsening. + Usage: hictk zoomify [OPTIONS] cooler [mcool] + Positionals: + cooler TEXT:Cooler REQUIRED Path to a .cool file (Cooler URI syntax supported). + mcool TEXT Output path. + Options: + -h,--help Print this help message and exit + --force Force overwrite existing output file(s). + --resolutions UINT ... One or more resolutions to be used for coarsening. + --copy-base-resolution,--no-copy-base-resolution{false} + Copy the base resolution to the output file. + --nice-steps,--pow2-steps{false} [--nice-steps] + Use nice or power of two steps to automatically generate the list of resolutions. + Example: + Base resolution: 1000 + Pow2: 1000, 2000, 4000, 8000... + Nice: 1000, 2000, 5000, 10000... + -v,--verbosity UINT:INT in [1 - 4] [] + Set verbosity of output to the console. 
diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..f59ec26c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,251 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + + +import os +import re + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx_copybutton", + "sphinxcontrib.rsvgconverter", + "sphinxcontrib.moderncmakedomain", + "sphinx.ext.intersphinx", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = [".templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The encoding of source files. +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "hictk" +copyright = "2023, Roberto Rossini" +author = "Roberto Rossini" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. + +# Read the listed version +version = "0.0.2" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. 
+language = "en" + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [".build", "release.rst"] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +default_role = "any" + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +# pygments_style = 'monokai' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. + +html_theme = "furo" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +# html_title = None + +# A shorter title for the navigation bar.
Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ["_static"] + +# html_css_files = [ ] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
+# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = "hictk_doc" + +# Tweak copybutton https://sphinx-copybutton.readthedocs.io/en/latest/use.html +copybutton_selector = "div:not(.no-copybutton) > div.highlight > pre" +copybutton_exclude = '.linenos, .gp, .go' +copybutton_copy_empty_lines = False +copybutton_prompt_text = "user@dev:/tmp$" + +# -- Options for LaTeX output --------------------------------------------- + +latex_engine = "pdflatex" + +latex_elements = { + 'papersize': 'a4paper', + 'pointsize': '10pt', + "classoptions": ",openany,oneside", + "preamble": r""" +\usepackage{MnSymbol} +\DeclareUnicodeCharacter{25CB}{\ensuremath{\circ}} +\DeclareUnicodeCharacter{25CF}{\ensuremath{\bullet}} +\DeclareUnicodeCharacter{21B5}{\ensuremath{\rhookswarrow}} +\DeclareUnicodeCharacter{2194}{\ensuremath{\leftrightarrow}} +""", +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]).
+latex_documents = [ + (master_doc, "hictk.tex", "hictk Documentation", author, "manual"), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = 'hictk-logo.png' + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + +primary_domain = "cpp" +highlight_language = "cpp" diff --git a/docs/cpp_api/cooler.rst b/docs/cpp_api/cooler.rst new file mode 100644 index 00000000..fec7c503 --- /dev/null +++ b/docs/cpp_api/cooler.rst @@ -0,0 +1,254 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +.. cpp:namespace:: hictk + +Cooler API +########## + +API to operate on .cool files. Compared to the generic API, this API provides: + +* more control over how files are opened +* direct access to HDF5 groups and datasets +* lower overhead +* support for creating .cool files +* support for opening collections of Coolers (e.g. .mcool and .scool files) + +Single-resolution Cooler (.cool) +-------------------------------- + +.. cpp:namespace:: hictk::cooler +.. cpp:class:: File + + **Constructors** + + .. cpp:function:: File(const File &other) = delete; + .. cpp:function:: File(File &&other) noexcept(noexcept_move_ctor()) = default; + + .. cpp:function:: [[nodiscard]] explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: [[nodiscard]] explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + + **Factory functions** + + ..
cpp:function:: [[nodiscard]] static File open_random_access(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: [[nodiscard]] static File open_read_once(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: template [[nodiscard]] static File create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + + **Open/close methods** + + .. cpp:function:: [[nodiscard]] static File open_random_access(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: [[nodiscard]] static File open_read_once(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: template [[nodiscard]] static File create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists = false, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + + .. cpp:function:: void close(); + + Note that :cpp:class:`File`\s are automatically closed upon destruction. + + **Operators** + + .. cpp:function:: File &operator=(const File &other) = delete; + .. cpp:function:: File &operator=(File &&other) noexcept(noexcept_move_assigment_op()) = default; + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + + Return whether the :cpp:class:`File` is in a valid state and other member functions can be safely called. + + **Accessors** + + .. cpp:function:: [[nodiscard]] std::string uri() const; + .. cpp:function:: [[nodiscard]] std::string hdf5_path() const; + .. cpp:function:: [[nodiscard]] std::string path() const; + + .. cpp:function:: [[nodiscard]] auto chromosomes() const noexcept -> const Reference &; + .. 
cpp:function:: [[nodiscard]] auto bins() const noexcept -> const BinTable &; + .. cpp:function:: [[nodiscard]] auto bins_ptr() const noexcept -> std::shared_ptr; + + .. cpp:function:: [[nodiscard]] std::uint32_t bin_size() const noexcept; + .. cpp:function:: [[nodiscard]] std::uint64_t nbins() const; + .. cpp:function:: [[nodiscard]] std::uint64_t nchroms() const; + .. cpp:function:: [[nodiscard]] std::uint64_t nnz() const; + + .. cpp:function:: [[nodiscard]] auto attributes() const noexcept -> const Attributes &; + .. cpp:function:: [[nodiscard]] auto group(std::string_view group_name) -> Group &; + .. cpp:function:: [[nodiscard]] auto dataset(std::string_view dataset_name) -> Dataset &; + .. cpp:function:: [[nodiscard]] auto group(std::string_view group_name) const -> const Group &; + .. cpp:function:: [[nodiscard]] auto dataset(std::string_view dataset_name) const -> const Dataset &; + + .. cpp:function:: [[nodiscard]] const NumericVariant &pixel_variant() const noexcept; + .. cpp:function:: template [[nodiscard]] bool has_pixel_of_type() const noexcept; + + .. cpp:function:: [[nodiscard]] bool has_signed_pixels() const noexcept; + .. cpp:function:: [[nodiscard]] bool has_unsigned_pixels() const noexcept; + .. cpp:function:: [[nodiscard]] bool has_integral_pixels() const noexcept; + .. cpp:function:: [[nodiscard]] bool has_float_pixels() const noexcept; + + **Iteration** + + .. cpp:function:: template [[nodiscard]] typename PixelSelector::iterator begin(std::string_view weight_name = "NONE") const; + .. cpp:function:: template [[nodiscard]] typename PixelSelector::iterator end(std::string_view weight_name = "NONE") const; + + .. cpp:function:: template [[nodiscard]] typename PixelSelector::iterator cbegin(std::string_view weight_name = "NONE") const; + .. cpp:function:: template [[nodiscard]] typename PixelSelector::iterator cend(std::string_view weight_name = "NONE") const; + + **Fetch methods (1D queries)** + + .. 
cpp:function:: [[nodiscard]] PixelSelector fetch(const balancing::Method &normalization = balancing::Method::NONE()) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::shared_ptr weights) const; + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range, std::shared_ptr weights, QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, std::shared_ptr weights) const; + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range, const balancing::Method &normalization = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, const balancing::Method &normalization = balancing::Method::NONE()) const; + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::uint64_t first_bin, std::uint64_t last_bin, std::shared_ptr weights = nullptr) const; + + **Fetch methods (2D queries)** + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, std::shared_ptr weights, QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, std::shared_ptr weights) const; + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, const balancing::Method &normalization = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. 
cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::uint64_t first_bin1, std::uint64_t last_bin1, std::uint64_t first_bin2, std::uint64_t last_bin2, std::shared_ptr weights = nullptr) const; + + **Write pixels** + + .. cpp:function:: template >> void append_pixels(PixelIt first_pixel, PixelIt last_pixel, bool validate = false); + + **Normalization** + + .. cpp:function:: [[nodiscard]] bool has_normalization(std::string_view normalization) const; + .. cpp:function:: std::shared_ptr read_normalization(std::string_view normalization, bool rescale = false) const; + .. cpp:function:: std::shared_ptr read_weights(std::string_view normalization, balancing::Weights::Type type, bool rescale = false) const; + + .. cpp:function:: [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; + .. cpp:function:: std::shared_ptr read_weights(const balancing::Method &normalization, bool rescale = false) const; + .. cpp:function:: std::shared_ptr read_weights(const balancing::Method &normalization, balancing::Weights::Type type, bool rescale = false) const; + + .. cpp:function:: [[nodiscard]] std::vector avail_normalizations() const; + + .. cpp:function:: bool purge_weights(std::string_view name = ""); + + .. cpp:function:: template static void write_weights(std::string_view uri, std::string_view name, It first_weight, It last_weight, bool overwrite_if_exists = false, bool divisive = false); + .. cpp:function:: template void write_weights(std::string_view name, It first_weight, It last_weight, bool overwrite_if_exists = false, bool divisive = false); + + **Others** + + .. cpp:function:: void flush(); + .. 
cpp:function:: void validate_bins(bool full = false) const; + +Multi-resolution Cooler (.mcool) +-------------------------------- + +.. cpp:namespace:: hictk::cooler +.. cpp:class:: MultiResFile + + **Constructors** + + .. cpp:function:: explicit MultiResFile(const std::filesystem::path& path, unsigned int mode = HighFive::File::ReadOnly); + + **Factory functions** + + .. cpp:function:: [[nodiscard]] static MultiResFile create(const std::filesystem::path& path, const Reference& chroms, bool force_overwrite = false); + .. cpp:function:: template [[nodiscard]] static MultiResFile create(const std::filesystem::path& path, const File& base, ResolutionIt first_res, ResolutionIt last_res, bool force_overwrite = false); + + **Open/close methods** + + .. cpp:function:: [[nodiscard]] File open(std::uint32_t resolution) const; + + **Operators** + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + + **Accessors** + + .. cpp:function:: [[nodiscard]] std::string path() const; + .. cpp:function:: [[nodiscard]] auto chromosomes() const noexcept -> const Reference&; + .. cpp:function:: [[nodiscard]] constexpr const std::vector& resolutions() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr const MultiResAttributes& attributes() const noexcept; + + **Modifiers** + + .. cpp:function:: File copy_resolution(const cooler::File& clr); + .. cpp:function:: template File create_resolution(std::uint32_t resolution, Attributes attributes = Attributes::init(0)); + .. cpp:function:: RootGroup init_resolution(std::uint32_t resolution); + + **Others** + + .. cpp:function:: [[nodiscard]] static std::uint32_t compute_base_resolution(const std::vector& resolutions, std::uint32_t target_res); + .. cpp:function:: static void coarsen(const File& clr1, File& clr2, std::vector>& buffer); + +Single-cell Cooler (.scool) +--------------------------- + +.. cpp:namespace:: hictk::cooler +.. cpp:class:: SingleCellFile + + **Constructors** + + .. 
cpp:function:: explicit SingleCellFile(const std::filesystem::path& path, unsigned int mode = HighFive::File::ReadOnly); + + **Factory functions** + + .. cpp:function:: [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, bool force_overwrite = false); + + **Open/close functions** + + .. cpp:function:: [[nodiscard]] File open(std::string_view cell) const; + + **Operators** + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + + **Accessors** + + .. cpp:function:: [[nodiscard]] std::string path() const; + .. cpp:function:: [[nodiscard]] auto chromosomes() const noexcept -> const Reference&; + .. cpp:function:: [[nodiscard]] auto bins() const noexcept -> const BinTable&; + .. cpp:function:: [[nodiscard]] std::uint32_t bin_size() const noexcept; + + .. cpp:function:: [[nodiscard]] constexpr const phmap::btree_set& cells() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr const SingleCellAttributes& attributes() const noexcept; + + **Modifiers** + + .. cpp:function:: template File create_cell(std::string_view cell, Attributes attrs = Attributes::init(0)); + + **Others** + + .. cpp:function:: template File aggregate(std::string_view uri, bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000) const; + +Pixel selector +-------------- + +.. cpp:class:: PixelSelector + + **Operators** + + .. cpp:function:: [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; + + **Iteration** + + .. cpp:function:: template [[nodiscard]] auto begin() const -> iterator; + .. cpp:function:: template [[nodiscard]] auto end() const -> iterator; + + .. cpp:function:: template [[nodiscard]] auto cbegin() const -> iterator; + .. 
cpp:function:: template [[nodiscard]] auto cend() const -> iterator; + + **Fetch at once** + + .. cpp:function:: template [[nodiscard]] std::vector> read_all() const; + .. cpp:function:: template [[nodiscard]] Eigen::SparseMatrix read_sparse() const; + .. cpp:function:: template [[nodiscard]] Eigen::Matrix read_dense() const; + + **Accessors** + + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord1() const noexcept; + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + + .. cpp:function:: [[nodiscard]] const BinTable &bins() const noexcept; + .. cpp:function:: [[nodiscard]] std::shared_ptr bins_ptr() const noexcept; diff --git a/docs/cpp_api/generic.rst b/docs/cpp_api/generic.rst new file mode 100644 index 00000000..bee51cfb --- /dev/null +++ b/docs/cpp_api/generic.rst @@ -0,0 +1,237 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +.. cpp:namespace:: hictk + +Generic API +=========== + +The hictk generic API allows users to transparently operate on .hic and .cool files. +There is virtually no runtime overhead when using the :cpp:class:`File` and :cpp:class:`PixelSelector` classes. However, iterating over :cpp:class:`Pixel`\s using this API is slightly slower than using the format-specific APIs. + +Refer to examples in the :doc:`../quickstart_api` section for how to use the generic API without incurring any overhead when iterating over :cpp:class:`Pixel`\s overlapping queries. + +Common +------ + +.. cpp:namespace:: hictk + +.. cpp:enum-class:: QUERY_TYPE + + .. cpp:enumerator:: BED + .. cpp:enumerator:: UCSC + + +File handle +----------- + +.. cpp:namespace:: hictk + +.. cpp:class:: File + + This class implements a generic file handle capable of transparently operating on .cool and .hic files. + + **Constructors** + + .. cpp:function:: File(cooler::File clr); + .. cpp:function:: File(hic::File hf); + .. 
cpp:function:: File(std::string uri, std::uint32_t resolution = 0, hic::MatrixType type = hic::MatrixType::observed, hic::MatrixUnit unit = hic::MatrixUnit::BP); + + Constructors for :cpp:class:`File` class. + ``resolution`` is a mandatory argument when opening .hic files. + Matrix ``type`` and ``unit`` are ignored when operating on .cool files. + + **Accessors** + + .. cpp:function:: [[nodiscard]] std::string uri() const; + + Returns the URI of the open file. Always returns the file path when file is .hic. + + .. cpp:function:: [[nodiscard]] std::string path() const; + + Returns the path to the open file. + + .. cpp:function:: [[nodiscard]] constexpr bool is_hic() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr bool is_cooler() const noexcept; + + Test whether the open file is in .hic or .cool format. + + .. cpp:function:: [[nodiscard]] auto chromosomes() const -> const Reference &; + .. cpp:function:: [[nodiscard]] auto bins() const -> const BinTable &; + .. cpp:function:: [[nodiscard]] std::shared_ptr bins_ptr() const; + + Accessors to the chromosomes and bin table of the open file. + + .. cpp:function:: [[nodiscard]] std::uint32_t bin_size() const; + .. cpp:function:: [[nodiscard]] std::uint64_t nbins() const; + .. cpp:function:: [[nodiscard]] std::uint64_t nchroms() const; + + Accessors for common attributes. + Calling any of these accessors does not involve any computation. + + .. cpp:function:: [[nodiscard]] bool has_normalization(std::string_view normalization) const; + .. cpp:function:: [[nodiscard]] std::vector avail_normalizations() const; + + Accessors for normalization methods/vectors. + + **Fetch methods (1D queries)** + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(const balancing::Method &normalization = balancing::Method::NONE()) const; + .. 
cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range, const balancing::Method &normalization = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, const balancing::Method &normalization = balancing::Method::NONE()) const; + + Return a :cpp:class:`PixelSelector` object that can be used to fetch pixels overlapping 1D (symmetric) queries. + + **Example usage:** + + .. code-block:: cpp + + hictk::File f{"myfile.hic", 1'000}; + + // Fetch all pixels + const auto sel1 = f.fetch(); + + // Fetch all pixels (normalized with VC) + const auto sel2 = f.fetch(balancing::Method::VC()); + + // Fetch pixels overlapping chr1 + const auto sel3 = f.fetch("chr1"); + + // Fetch pixels overlapping a region of interest + const auto sel4 = f.fetch("chr1:10,000,000-20,000,000"); + const auto sel5 = f.fetch("chr1", 10'000'000, 20'000'000); + + // Fetch pixels using a BED query + const auto sel6 = f.fetch("chr1\t10000000\t20000000", + balancing::Method::NONE(), + QUERY_TYPE::BED); + + **Fetch methods (2D queries)** + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, const balancing::Method &normalization = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + + Return a :cpp:class:`PixelSelector` object that can be used to fetch pixels overlapping 2D (asymmetric) queries. + + **Example usage:** + + .. 
code-block:: cpp + + hictk::File f{"myfile.hic", 1'000}; + + // Fetch pixels overlapping chr1:chr2 + const auto sel1 = f.fetch("chr1", "chr2"); + + // Fetch pixels overlapping a region of interest + const auto sel2 = f.fetch("chr1:10,000,000-20,000,000", + "chr2:10,000,000-20,000,000"); + const auto sel3 = f.fetch("chr1", 10'000'000, 20'000'000, + "chr2", 10'000'000, 20'000'000); + + + **Advanced** + + .. cpp:function:: template <typename FileT> [[nodiscard]] constexpr const FileT &get() const noexcept; + .. cpp:function:: template <typename FileT> [[nodiscard]] constexpr FileT &get() noexcept; + .. cpp:function:: [[nodiscard]] constexpr auto get() const noexcept -> const FileVar &; + .. cpp:function:: [[nodiscard]] constexpr auto get() noexcept -> FileVar &; + + Methods to get the underlying :cpp:class:`hic::File` or :cpp:class:`cooler::File` file handle or a :cpp:class:`std::variant` thereof. + + **Example usage:** + + .. code-block:: cpp + + hictk::File f{"myfile.hic", 1'000}; + + assert(f.get<hic::File>().path() == "myfile.hic"); + assert(f.get<cooler::File>().path() == "myfile.hic"); // Throws an exception + + const auto fvar = f.get(); + std::visit([](const auto& f) { + assert(f.path() == "myfile.hic"); + }, fvar); + +Pixel selector +-------------- + +.. cpp:namespace:: hictk + +.. cpp:class:: PixelSelector + + This class implements a generic, lightweight pixel selector object. + + :cpp:class:`PixelSelector` objects are constructed and returned by :cpp:func:`File::fetch` methods. + Users are **not** supposed to construct :cpp:class:`PixelSelector` objects themselves. + + **Iteration** + + .. cpp:function:: template [[nodiscard]] auto begin(bool sorted = true) const -> iterator; + .. cpp:function:: template [[nodiscard]] auto end() const -> iterator; + + .. cpp:function:: template [[nodiscard]] auto cbegin(bool sorted = true) const -> iterator; + .. 
cpp:function:: template [[nodiscard]] auto cend() const -> iterator; + + Return an `InputIterator <https://en.cppreference.com/w/cpp/named_req/InputIterator>`_ to traverse pixels + overlapping the genomic coordinates used to create the :cpp:class:`PixelSelector`. + + Specifying ``sorted = false`` will improve throughput for queries over .hic files. + + When operating on .cool files, pixels are always returned sorted by genomic coordinates. + + **Example usage:** + + .. code-block:: cpp + + hictk::File f{"myfile.hic", 1'000}; + const auto sel = f.fetch(); + + std::for_each(sel.begin(), sel.end(), + [&](const auto& pixel) { fmt::print("{}\n", pixel); }); + + // STDOUT + // 0 0 12 + // 0 2 7 + // 0 4 1 + // ... + + **Fetch at once** + + .. cpp:function:: template [[nodiscard]] std::vector> read_all() const; + .. cpp:function:: template [[nodiscard]] Eigen::SparseMatrix read_sparse() const; + .. cpp:function:: template [[nodiscard]] Eigen::Matrix read_dense() const; + + Read and return all :cpp:class:`Pixel`\s at once using a :cpp:class:`std::vector`. + + **Accessors** + + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord1() const; + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord2() const; + + Return the genomic coordinates used to construct the :cpp:class:`PixelSelector`. + + .. cpp:function:: [[nodiscard]] const BinTable &bins() const; + + Return the :cpp:class:`BinTable` used to map :cpp:class:`Pixel`\s to genomic :cpp:class:`Bin`\s. + + **Advanced** + + .. cpp:function:: template <typename PixelSelectorT> [[nodiscard]] constexpr const PixelSelectorT &get() const noexcept; + .. cpp:function:: template <typename PixelSelectorT> [[nodiscard]] constexpr PixelSelectorT &get() noexcept; + .. cpp:function:: [[nodiscard]] constexpr auto get() const noexcept -> const PixelSelectorVar &; + .. cpp:function:: [[nodiscard]] constexpr auto get() noexcept -> PixelSelectorVar &; + + **Example usage:** + + .. 
code-block:: cpp + + hictk::File f{"myfile.hic", 1'000}; + + const auto sel = f.fetch(); + + assert(sel.get<hic::PixelSelector>().matrix_type() == hic::MatrixType::observed); + sel.get<cooler::PixelSelector>(); // Throws an exception + + const auto selvar = sel.get(); + std::visit([](const auto& s) { assert(s.bins().bin_size() == 1'000); }, selvar); diff --git a/docs/cpp_api/hic.rst b/docs/cpp_api/hic.rst new file mode 100644 index 00000000..a04107df --- /dev/null +++ b/docs/cpp_api/hic.rst @@ -0,0 +1,158 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +.. cpp:namespace:: hictk + +Hi-C API +########## + +API to operate on .hic files. Compared to the generic API, this API provides: + +* more control over how files are opened +* access to .hic-specific metadata +* control over the interaction block cache + +Common +------ + +.. cpp:namespace:: hictk::hic + +.. cpp:enum-class:: MatrixType + + .. cpp:enumerator:: observed + + .. cpp:enumerator:: oe + + .. cpp:enumerator:: expected + +.. cpp:enum-class:: MatrixUnit + + .. cpp:enumerator:: BP + + .. cpp:enumerator:: FRAG + +.. cpp:enum-class:: QUERY_TYPE + + .. cpp:enumerator:: BED + + .. cpp:enumerator:: UCSC + + +File handle +----------- + +.. cpp:namespace:: hictk::hic + +.. cpp:class:: File + + **Constructors** + + .. cpp:function:: explicit File(std::string url_, std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, std::uint64_t block_cache_capacity = 0); + + **Open/close methods** + + .. cpp:function:: File &open(std::string url_, std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, std::uint64_t block_cache_capacity = 0); + .. cpp:function:: File &open(std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, std::uint64_t block_cache_capacity = 0); + + **Accessors** + + .. cpp:function:: [[nodiscard]] bool has_resolution(std::uint32_t resolution) const; + + .. 
cpp:function:: [[nodiscard]] const std::string &url() const noexcept; + .. cpp:function:: [[nodiscard]] const std::string &name() const noexcept; + + .. cpp:function:: [[nodiscard]] std::int32_t version() const noexcept; + + .. cpp:function:: [[nodiscard]] const Reference &chromosomes() const noexcept; + .. cpp:function:: [[nodiscard]] const BinTable &bins() const noexcept; + .. cpp:function:: [[nodiscard]] std::shared_ptr bins_ptr() const noexcept; + + .. cpp:function:: [[nodiscard]] std::uint32_t bin_size() const noexcept; + .. cpp:function:: [[nodiscard]] std::uint64_t nbins() const; + .. cpp:function:: [[nodiscard]] std::uint64_t nchroms() const; + .. cpp:function:: [[nodiscard]] const std::string &assembly() const noexcept; + .. cpp:function:: [[nodiscard]] const std::vector &avail_resolutions() const noexcept; + .. cpp:function:: [[nodiscard]] bool has_normalization(std::string_view normalization) const; + .. cpp:function:: [[nodiscard]] std::vector avail_normalizations() const; + + **Fetch methods (1D queries)** + + .. cpp:function:: [[nodiscard]] PixelSelectorAll fetch(balancing::Method norm = balancing::Method::NONE()) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range, balancing::Method norm = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, balancing::Method norm = balancing::Method::NONE()) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::uint64_t first_bin, std::uint64_t last_bin, balancing::Method norm = balancing::Method::NONE()) const; + + **Fetch methods (2D queries)** + + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, balancing::Method norm = balancing::Method::NONE(), QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + .. 
cpp:function:: [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, balancing::Method norm = balancing::Method::NONE()) const; + .. cpp:function:: [[nodiscard]] PixelSelector fetch(std::uint64_t first_bin1, std::uint64_t last_bin1, std::uint64_t first_bin2, std::uint64_t last_bin2, balancing::Method norm = balancing::Method::NONE()) const; + + **Caching** + + .. cpp:function:: [[nodiscard]] std::size_t num_cached_footers() const noexcept; + .. cpp:function:: void purge_footer_cache(); + + .. cpp:function:: [[nodiscard]] double block_cache_hit_rate() const noexcept; + .. cpp:function:: void reset_cache_stats() const noexcept; + .. cpp:function:: void clear_cache() noexcept; + .. cpp:function:: void optimize_cache_size(std::size_t upper_bound = (std::numeric_limits::max)()); + .. cpp:function:: void optimize_cache_size_for_iteration(std::size_t upper_bound = (std::numeric_limits::max)()); + .. cpp:function:: void optimize_cache_size_for_random_access(std::size_t upper_bound = (std::numeric_limits::max)()); + .. cpp:function:: [[nodiscard]] std::size_t cache_capacity() const noexcept; + +Pixel selector +-------------- + +.. cpp:namespace:: hictk::hic + +.. cpp:class:: PixelSelector + + **Operators** + + .. cpp:function:: [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; + + **Iteration** + + .. cpp:function:: template [[nodiscard]] auto begin(bool sorted = true) const -> iterator; + .. cpp:function:: template [[nodiscard]] auto end() const -> iterator; + + .. cpp:function:: template [[nodiscard]] auto cbegin(bool sorted = true) const -> iterator; + .. cpp:function:: template [[nodiscard]] auto cend() const -> iterator; + + **Fetch at once** + + .. cpp:function:: template [[nodiscard]] std::vector> read_all() const; + + .. 
cpp:function:: template [[nodiscard]] Eigen::SparseMatrix read_sparse() const; + .. cpp:function:: template [[nodiscard]] Eigen::Matrix read_dense() const; + + **Accessors** + + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord1() const noexcept; + .. cpp:function:: [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + + .. cpp:function:: [[nodiscard]] MatrixType matrix_type() const noexcept; + .. cpp:function:: [[nodiscard]] balancing::Method normalization() const noexcept; + .. cpp:function:: [[nodiscard]] MatrixUnit unit() const noexcept; + .. cpp:function:: [[nodiscard]] std::uint32_t resolution() const noexcept; + + .. cpp:function:: [[nodiscard]] const Chromosome &chrom1() const noexcept; + .. cpp:function:: [[nodiscard]] const Chromosome &chrom2() const noexcept; + + .. cpp:function:: [[nodiscard]] const balancing::Weights &weights1() const noexcept; + .. cpp:function:: [[nodiscard]] const balancing::Weights &weights2() const noexcept; + + .. cpp:function:: [[nodiscard]] const BinTable &bins() const noexcept; + .. cpp:function:: [[nodiscard]] const internal::HiCFooterMetadata &metadata() const noexcept; + + .. cpp:function:: [[nodiscard]] bool is_inter() const noexcept; + .. cpp:function:: [[nodiscard]] bool is_intra() const noexcept; + .. cpp:function:: [[nodiscard]] bool empty() const noexcept; + + **Caching** + + .. cpp:function:: [[nodiscard]] std::size_t estimate_optimal_cache_size(std::size_t num_samples = 500) const; + .. cpp:function:: void clear_cache() const; diff --git a/docs/cpp_api/index.rst b/docs/cpp_api/index.rst new file mode 100644 index 00000000..81b3642c --- /dev/null +++ b/docs/cpp_api/index.rst @@ -0,0 +1,16 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +C++ API Reference +################# + +hictk C++ API is structured as follows: + +.. 
toctree:: + :maxdepth: 1 + + generic + cooler + hic + shared diff --git a/docs/cpp_api/shared.rst b/docs/cpp_api/shared.rst new file mode 100644 index 00000000..00068e6e --- /dev/null +++ b/docs/cpp_api/shared.rst @@ -0,0 +1,423 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +.. cpp:namespace:: hictk + +Shared Types +############ + +Types documented on this page are used throughout the hictk code-base to model various concepts such as genomic intervals, reference genomes, bins and pixels. + +Chromosome +---------- + +.. cpp:namespace:: hictk + +.. cpp:class:: Chromosome + + This class models chromosomes as triplets consisting of: + + * A numeric identifier + * The chromosome name + * The chromosome size + + :cpp:class:`Chromosome`\s are compared by ID. + + **Constructors** + + .. cpp:function:: Chromosome() = default; + .. cpp:function:: Chromosome(std::uint32_t id_, std::string name_, std::uint32_t size_) noexcept; + + **Operators** + + .. cpp:function:: [[nodiscard]] constexpr explicit operator bool() const noexcept; + + **Accessors** + + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t id() const noexcept; + .. cpp:function:: [[nodiscard]] std::string_view name() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t size() const noexcept; + .. cpp:function:: [[nodiscard]] bool is_all() const noexcept; + + **Comparison operators** + + .. cpp:function:: [[nodiscard]] constexpr bool operator<(const Chromosome& other) const noexcept; + .. cpp:function:: [[nodiscard]] constexpr bool operator>(const Chromosome& other) const noexcept; + .. cpp:function:: [[nodiscard]] constexpr bool operator<=(const Chromosome& other) const noexcept; + .. cpp:function:: [[nodiscard]] constexpr bool operator>=(const Chromosome& other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator==(const Chromosome& other) const noexcept; + .. 
cpp:function:: [[nodiscard]] bool operator!=(const Chromosome& other) const noexcept; + + .. cpp:function:: friend bool operator==(const Chromosome& a, std::string_view b_name) noexcept; + .. cpp:function:: friend bool operator!=(const Chromosome& a, std::string_view b_name) noexcept; + + .. cpp:function:: friend bool operator==(std::string_view a_name, const Chromosome& b) noexcept; + .. cpp:function:: friend bool operator!=(std::string_view a_name, const Chromosome& b) noexcept; + + .. cpp:function:: friend constexpr bool operator<(const Chromosome& a, std::uint32_t b_id) noexcept; + .. cpp:function:: friend constexpr bool operator>(const Chromosome& a, std::uint32_t b_id) noexcept; + .. cpp:function:: friend constexpr bool operator<=(const Chromosome& a, std::uint32_t b_id) noexcept; + .. cpp:function:: friend constexpr bool operator>=(const Chromosome& a, std::uint32_t b_id) noexcept; + .. cpp:function:: friend constexpr bool operator==(const Chromosome& a, std::uint32_t b_id) noexcept; + .. cpp:function:: friend constexpr bool operator!=(const Chromosome& a, std::uint32_t b_id) noexcept; + + .. cpp:function:: friend constexpr bool operator<(std::uint32_t a_id, const Chromosome& b) noexcept; + .. cpp:function:: friend constexpr bool operator>(std::uint32_t a_id, const Chromosome& b) noexcept; + .. cpp:function:: friend constexpr bool operator<=(std::uint32_t a_id, const Chromosome& b) noexcept; + .. cpp:function:: friend constexpr bool operator>=(std::uint32_t a_id, const Chromosome& b) noexcept; + .. cpp:function:: friend constexpr bool operator==(std::uint32_t a_id, const Chromosome& b) noexcept; + .. cpp:function:: friend constexpr bool operator!=(std::uint32_t a_id, const Chromosome& b) noexcept; + + +Genomic intervals +----------------- + +.. cpp:namespace:: hictk + +.. cpp:class:: GenomicInterval + + Class to represent 1D genomic intervals. 
+ + This class has two main purposes: + + * Storing information regarding genomic intervals + * Simplifying comparison of genomic intervals (e.g. is interval A upstream of interval B) + + .. cpp:enum-class:: QUERY_TYPE + + .. cpp:enumerator:: BED + .. cpp:enumerator:: UCSC + + **Constructors** + + .. cpp:function:: constexpr GenomicInterval() = default; + .. cpp:function:: explicit GenomicInterval(const Chromosome &chrom_) noexcept; + .. cpp:function:: GenomicInterval(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; + + **Factory methods** + + .. cpp:function:: [[nodiscard]] static GenomicInterval parse(const Reference &chroms, std::string query, Type type = Type::UCSC); + .. cpp:function:: [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string query); + .. cpp:function:: [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view query, char sep = '\t'); + + **Operators** + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator==(const GenomicInterval &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const GenomicInterval &other) const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator<(const GenomicInterval &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<=(const GenomicInterval &other) const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator>(const GenomicInterval &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>=(const GenomicInterval &other) const noexcept; + + **Accessors** + + .. cpp:function:: [[nodiscard]] const Chromosome &chrom() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t start() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t end() const noexcept; + .. 
cpp:function:: [[nodiscard]] constexpr std::uint32_t size() const noexcept; + + +Genomic bins +------------ + +.. cpp:namespace:: hictk + +.. cpp:class:: Bin + + Class modeling genomic bins. + + The class is implemented as a thin wrapper around :cpp:class:`GenomicInterval`\s. The main difference between :cpp:class:`Bin` and :cpp:class:`GenomicInterval` objects is that in addition to genomic coordinates, :cpp:class:`Bin` objects also store two identifiers: + + * A unique identifier that can be used to refer to :cpp:class:`Bin`\s in a :cpp:class:`Reference`. + * A relative identifier that can be used to refer to :cpp:class:`Bin`\s in a :cpp:class:`Chromosome`. + + .. cpp:function:: constexpr Bin() = default; + .. cpp:function:: Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; + .. cpp:function:: Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept; + .. cpp:function:: explicit Bin(GenomicInterval interval) noexcept; + .. cpp:function:: Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator==(const Bin &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const Bin &other) const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator<(const Bin &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<=(const Bin &other) const noexcept; + + .. cpp:function:: [[nodiscard]] bool operator>(const Bin &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>=(const Bin &other) const noexcept; + + .. cpp:function:: [[nodiscard]] constexpr std::uint64_t id() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; + .. cpp:function:: [[nodiscard]] const GenomicInterval &interval() const noexcept; + .. 
cpp:function:: [[nodiscard]] const Chromosome &chrom() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t start() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t end() const noexcept; + + .. cpp:function:: [[nodiscard]] constexpr bool has_null_id() const noexcept; + + +Reference genome +---------------- + +.. cpp:namespace:: hictk + +.. cpp:class:: Reference + + This class models the reference genome used as coordinate systems in Hi-C matrices. + + :cpp:class:`Reference` objects consist of collections of :cpp:class:`Chromosome`\s with unique IDs. + + :cpp:class:`Chromosome`\s can be queried by ID or by name. + + As a general rule, queries by :cpp:class:`Chromosome` ID are more efficient than queries by name. + + **Constructors** + + .. cpp:function:: Reference() = default; + + .. cpp:function:: template Reference(ChromosomeNameIt first_chrom_name, ChromosomeNameIt last_chrom_name, ChromosomeSizeIt first_chrom_size); + .. cpp:function:: template Reference(ChromosomeIt first_chrom, ChromosomeIt last_chrom); + .. cpp:function:: Reference(std::initializer_list chromosomes); + + **Factory methods** + + .. cpp:function:: [[nodiscard]] static Reference from_chrom_sizes(const std::filesystem::path& path_to_chrom_sizes); + + **Operators** + + .. cpp:function:: [[nodiscard]] bool operator==(const Reference& other) const; + .. cpp:function:: [[nodiscard]] bool operator!=(const Reference& other) const; + + **Iteration** + + .. cpp:function:: [[nodiscard]] auto begin() const -> const_iterator; + .. cpp:function:: [[nodiscard]] auto end() const -> const_iterator; + .. cpp:function:: [[nodiscard]] auto cbegin() const -> const_iterator; + .. cpp:function:: [[nodiscard]] auto cend() const -> const_iterator; + + .. cpp:function:: [[nodiscard]] auto rbegin() const -> const_reverse_iterator; + .. cpp:function:: [[nodiscard]] auto rend() const -> const_reverse_iterator; + .. 
cpp:function:: [[nodiscard]] auto rcbegin() const -> const_reverse_iterator; + .. cpp:function:: [[nodiscard]] auto rcend() const -> const_reverse_iterator; + + **Accessors** + + .. cpp:function:: [[nodiscard]] bool empty() const noexcept; + .. cpp:function:: [[nodiscard]] std::size_t size() const noexcept; + + **Lookup** + + .. cpp:function:: [[nodiscard]] auto find(std::uint32_t id) const -> const_iterator; + .. cpp:function:: [[nodiscard]] auto find(std::string_view chrom_name) const -> const_iterator; + .. cpp:function:: [[nodiscard]] auto find(const Chromosome& chrom) const -> const_iterator; + + .. cpp:function:: [[nodiscard]] const Chromosome& at(std::uint32_t id) const; + .. cpp:function:: [[nodiscard]] const Chromosome& at(std::string_view chrom_name) const; + + .. cpp:function:: [[nodiscard]] const Chromosome& operator[](std::uint32_t id) const noexcept; + .. cpp:function:: [[nodiscard]] const Chromosome& operator[](std::string_view chrom_name) const noexcept; + + .. cpp:function:: [[nodiscard]] bool contains(std::uint32_t id) const; + .. cpp:function:: [[nodiscard]] bool contains(const Chromosome& chrom) const; + .. cpp:function:: [[nodiscard]] bool contains(std::string_view chrom_name) const; + + .. cpp:function:: [[nodiscard]] std::uint32_t get_id(std::string_view chrom_name) const; + + .. cpp:function:: [[nodiscard]] const Chromosome& longest_chromosome() const; + .. cpp:function:: [[nodiscard]] const Chromosome& chromosome_with_longest_name() const; + + +Bin Table +--------- + +.. cpp:namespace:: hictk + +.. cpp:class:: BinTable + + This class models the bin table used as coordinate systems in Hi-C matrices. + + The class API gives the illusion of operating over a collection of :cpp:class:`Bin`\s. + In reality :cpp:class:`BinTable`\s do not store any :cpp:class:`Bin`\s. All queries are satisfied through simple arithmetic operations on the prefix sum of :cpp:class:`Chromosome` sizes and :cpp:class:`Bin`\s are generated on the fly as needed. 
+ + This implementation has two main benefits: + + * Decoupling of :cpp:class:`BinTable` resolution and memory requirements + * Lookups in constant or linear time complexity with performance independent of resolution. + + **Constructors** + + .. cpp:function:: BinTable() = default; + .. cpp:function:: BinTable(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset = 0); + .. cpp:function:: template BinTable(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, std::size_t bin_offset = 0); + .. cpp:function:: template BinTable(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset = 0); + + **Operators** + + .. cpp:function:: [[nodiscard]] bool operator==(const BinTable &other) const; + .. cpp:function:: [[nodiscard]] bool operator!=(const BinTable &other) const; + + **Accessors** + + .. cpp:function:: [[nodiscard]] std::size_t size() const noexcept; + .. cpp:function:: [[nodiscard]] bool empty() const noexcept; + .. cpp:function:: [[nodiscard]] std::size_t num_chromosomes() const; + .. cpp:function:: [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + .. cpp:function:: [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; + + **Iteration** + + .. cpp:function:: [[nodiscard]] auto begin() const -> iterator; + .. cpp:function:: [[nodiscard]] auto end() const -> iterator; + .. cpp:function:: [[nodiscard]] auto cbegin() const -> iterator; + .. cpp:function:: [[nodiscard]] auto cend() const -> iterator; + + **Slicing** + + .. cpp:function:: [[nodiscard]] BinTable subset(const Chromosome &chrom) const; + .. cpp:function:: [[nodiscard]] BinTable subset(std::string_view chrom_name) const; + .. cpp:function:: [[nodiscard]] BinTable subset(std::uint32_t chrom_id) const; + + **Lookup** + + .. 
cpp:function:: [[nodiscard]] auto find_overlap(const GenomicInterval &query) const -> std::pair; + .. cpp:function:: [[nodiscard]] auto find_overlap(const Chromosome &chrom, std::uint32_t start, std::uint32_t end) const -> std::pair; + .. cpp:function:: [[nodiscard]] auto find_overlap(std::string_view chrom_name, std::uint32_t start, std::uint32_t end) const -> std::pair; + .. cpp:function:: [[nodiscard]] auto find_overlap(std::uint32_t chrom_id, std::uint32_t start, std::uint32_t end) const -> std::pair; + .. cpp:function:: [[nodiscard]] std::pair at(const GenomicInterval &gi) const; + .. cpp:function:: [[nodiscard]] std::pair map_to_bin_ids(const GenomicInterval &gi) const; + + Query bins by genomic interval. + + .. cpp:function:: [[nodiscard]] Bin at(std::uint64_t bin_id) const; + .. cpp:function:: [[nodiscard]] Bin at(const Chromosome &chrom, std::uint32_t pos = 0) const; + .. cpp:function:: [[nodiscard]] Bin at(std::string_view chrom_name, std::uint32_t pos = 0) const; + .. cpp:function:: [[nodiscard]] Bin at(std::uint32_t chrom_id, std::uint32_t pos) const; + .. cpp:function:: [[nodiscard]] Bin at_hint(std::uint64_t bin_id, const Chromosome &chrom) const; + + Query by bin identifier. + + .. cpp:function:: [[nodiscard]] std::uint64_t map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const; + .. cpp:function:: [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; + .. cpp:function:: [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; + + Query by genomic coordinates + + **Others** + + .. cpp:function:: [[nodiscard]] BinTableConcrete concretize() const; + +Pixels +------ + +.. cpp:namespace:: hictk + +.. cpp:class:: template ThinPixel + + Struct to model a genomic pixel using as little memory as possible. + + **Member variables** + + .. cpp:member:: static constexpr auto null_id = std::numeric_limits::max(); + .. cpp:member:: std::uint64_t bin1_id{null_id}; + .. 
cpp:member:: std::uint64_t bin2_id{null_id}; + .. cpp:member:: N count{}; + + **Factory methods** + + .. cpp:function:: static auto from_coo(std::string_view line) -> ThinPixel; + .. cpp:function:: static auto from_coo(const BinTable &bins, std::string_view line) -> ThinPixel; + + **Operators** + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + .. cpp:function:: [[nodiscard]] bool operator==(const ThinPixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const ThinPixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<(const ThinPixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<=(const ThinPixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>(const ThinPixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>=(const ThinPixel &other) const noexcept; + + +.. cpp:class:: PixelCoordinates; + + Struct to model 2D genomic coordinates using a pair of :cpp:class:`Bin`\s. + + **Member variables** + + .. cpp:member:: Bin bin1 + .. cpp:member:: Bin bin2 + + **Constructors** + + .. cpp:function:: PixelCoordinates() = default; + .. cpp:function:: PixelCoordinates(Bin bin1_, Bin bin2_) noexcept; + .. cpp:function:: explicit PixelCoordinates(std::pair bins) noexcept; + .. cpp:function:: explicit PixelCoordinates(Bin bin) noexcept; + + **Operators** + + .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + .. cpp:function:: [[nodiscard]] bool operator==(const PixelCoordinates &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const PixelCoordinates &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<(const PixelCoordinates &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<=(const PixelCoordinates &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>(const PixelCoordinates &other) const noexcept; + .. 
cpp:function:: [[nodiscard]] bool operator>=(const PixelCoordinates &other) const noexcept; + + **Accessors** + + .. cpp:function:: [[nodiscard]] bool is_intra() const noexcept; + + +.. cpp:class:: template Pixel + + Struct to model genomic pixels as interaction counts associated with a pair of genomic :cpp:class:`Bin`\s. + + The main difference between :cpp:class:`ThinPixel` and :cpp:class:`Pixel` objects is that the latter possesses all the knowledge required to map interactions to genomic coordinates, not just bin IDs. + + **Member variables** + + .. cpp:member:: PixelCoordinates coords{}; + .. cpp:member:: N count{}; + + **Constructors** + + .. cpp:function:: Pixel() = default; + .. cpp:function:: explicit Pixel(Bin bin, N count_ = 0) noexcept; + .. cpp:function:: Pixel(Bin bin1_, Bin bin2_, N count_ = 0) noexcept; + .. cpp:function:: explicit Pixel(PixelCoordinates coords_, N count_ = 0) noexcept; + .. cpp:function:: Pixel(const Chromosome &chrom, std::uint32_t start, std::uint32_t end, N count_ = 0) noexcept; + .. cpp:function:: Pixel(const Chromosome &chrom1, std::uint32_t start1, std::uint32_t end1, const Chromosome &chrom2, std::uint32_t start2, std::uint32_t end2, N count_ = 0) noexcept; + .. cpp:function:: Pixel(const BinTable &bins, std::uint64_t bin1_id, std::uint64_t bin2_id, N count_ = 0); + .. cpp:function:: Pixel(const BinTable &bins, std::uint64_t bin_id, N count_ = 0); + .. cpp:function:: Pixel(const BinTable &bins, const ThinPixel &p); + + + **Factory methods** + + .. cpp:function:: static auto from_coo(const BinTable &bins, std::string_view line) -> Pixel; + .. cpp:function:: static auto from_bg2(const BinTable &bins, std::string_view line) -> Pixel; + .. cpp:function:: static auto from_validpair(const BinTable &bins, std::string_view line) -> Pixel; + .. cpp:function:: static auto from_4dn_pairs(const BinTable &bins, std::string_view line) -> Pixel; + + **Operators** + + .. 
cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; + .. cpp:function:: [[nodiscard]] bool operator==(const Pixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator!=(const Pixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<(const Pixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator<=(const Pixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>(const Pixel &other) const noexcept; + .. cpp:function:: [[nodiscard]] bool operator>=(const Pixel &other) const noexcept; + + **Conversion** + + .. cpp:function:: [[nodiscard]] ThinPixel to_thin() const noexcept; diff --git a/docs/creating_coolers.rst b/docs/creating_coolers.rst new file mode 100644 index 00000000..96d99fdb --- /dev/null +++ b/docs/creating_coolers.rst @@ -0,0 +1,49 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Creating .cool files +#################### + +hictk supports creating .cool files from text files in the following formats: + +* `pairs (4DN-DCIC) `_ +* `validPairs (nf-core/hic) `_ +* `bedGraph2 `_ +* `COO `_ + +File requirements: + +* ``dm6.chrom.sizes`` - `download `__ +* ``4DNFIKNWM36K.pairs.gz`` - `download `__ + +.. code-block:: console + + # Create a 10kbp .cool file using dm6 as reference + user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dm6 dm6.chrom.sizes 10000 4DNFIKNWM36K.1000.cool + + [2023-09-29 19:16:51.962] [info]: Running hictk v0.0.2 + [2023-09-29 19:16:51.962] [info]: begin loading un-sorted pairs... + [2023-09-29 19:16:51.983] [info]: writing chunk #1 to intermediate file "4DNFIKNWM36K.1000.cool.tmp"... + [2023-09-29 19:17:39.715] [info]: done writing chunk #1 to tmp file "4DNFIKNWM36K.1000.cool.tmp". + [2023-09-29 19:17:39.715] [info]: writing chunk #2 to intermediate file "4DNFIKNWM36K.1000.cool.tmp"... + [2023-09-29 19:17:39.719] [info]: done writing chunk #2 to tmp file "4DNFIKNWM36K.1000.cool.tmp". 
+ [2023-09-29 19:17:39.721] [info]: merging 2 chunks into "4DNFIKNWM36K.1000.cool"... + [2023-09-29 19:17:41.716] [info]: processing chr3L:15100000-15110000 chr3L:16220000-16230000 at 5073567 pixels/s... + + +Merging multiple Cooler files +----------------------------- + +Multiple .cool files using the same reference genome and resolution can be merged using ``hictk merge``: + +.. code-block:: console + + user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 -o 4DNFIZ1ZVXC8.merged.cool + + [2023-09-29 19:24:49.479] [info]: Running hictk v0.0.2 + [2023-09-29 19:24:49.479] [info]: begin merging 2 coolers... + [2023-09-29 19:24:52.032] [info]: processing chr2R:11267000-11268000 chr4:1052000-1053000 at 3976143 pixels/s... + [2023-09-29 19:24:55.157] [info]: processing chr3R:5812000-5813000 chr3R:23422000-23423000 at 3201024 pixels/s... + [2023-09-29 19:24:57.992] [info]: DONE! Merging 2 coolers took 8.51s! + [2023-09-29 19:24:57.992] [info]: 4DNFIZ1ZVXC8.merged.cool size: 36.23 MB diff --git a/docs/creating_multires_coolers.rst b/docs/creating_multires_coolers.rst new file mode 100644 index 00000000..fb5d6f87 --- /dev/null +++ b/docs/creating_multires_coolers.rst @@ -0,0 +1,35 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Converting .cool to .mcool +########################## + +Interactions from a single-resolution Cooler file (.cool) can be used to generate a multi-resolution Cooler (.mcool) by iterative coarsening using ``hictk zoomify`` + +.. 
code-block:: console + + user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.mcool + + [2023-09-29 19:28:39.926] [info]: Running hictk v0.0.2 + [2023-09-29 19:28:39.929] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 13 times (1000 -> 1000 -> 2000 -> 5000 -> 10000 -> 20000 -> 50000 -> 100000 -> 200000 -> 500000 -> 1000000 -> 2000000 -> 5000000 -> 10000000) + [2023-09-29 19:28:39.929] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2023-09-29 19:28:40.119] [info]: generating 2000 resolution from 1000 (2x) + [2023-09-29 19:28:40.343] [info]: [1000 -> 2000] processing chr2L:1996000-1998000 at 4484305 pixels/s... + [2023-09-29 19:28:40.663] [info]: [1000 -> 2000] processing chr2L:4932000-4934000 at 3125000 pixels/s... + [2023-09-29 19:28:40.973] [info]: [1000 -> 2000] processing chr2L:7986000-7988000 at 3236246 pixels/s... + ... + [2023-09-29 19:29:12.513] [info]: generating 10000000 resolution from 5000000 (2x) + [2023-09-29 19:29:12.519] [info]: DONE! Processed 13 resolution(s) in 32.59s! + + # Coarsen a single resolution + user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.cool --resolutions 50000 + + [2023-09-29 19:30:52.476] [info]: Running hictk v0.0.2 + [2023-09-29 19:30:52.482] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 2 times (1000 -> 1000 -> 50000) + [2023-09-29 19:30:52.482] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2023-09-29 19:30:52.668] [info]: generating 50000 resolution from 1000 (50x) + [2023-09-29 19:30:53.789] [info]: [1000 -> 50000] processing chr2L:23000000-23050000 at 896057 pixels/s... + [2023-09-29 19:30:55.005] [info]: [1000 -> 50000] processing chr3L:4600000-4650000 at 822368 pixels/s... + [2023-09-29 19:30:56.440] [info]: [1000 -> 50000] processing chr3R:32050000-32079331 at 696864 pixels/s... + [2023-09-29 19:30:56.863] [info]: DONE! 
Processed 2 resolution(s) in 4.39s! diff --git a/docs/downloading_test_datasets.rst b/docs/downloading_test_datasets.rst new file mode 100644 index 00000000..7920cb6e --- /dev/null +++ b/docs/downloading_test_datasets.rst @@ -0,0 +1,27 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Downloading test datasets +######################### + +Test datasets for ``hictk`` are hosted on Zenodo: `doi.org/10.5281/zenodo.8121686 `_ + +After downloading the data, move to a folder with ~1 GB of free space and extract the test datasets: + +.. code-block:: console + :class: no-copybutton + + user@dev:/tmp$ mkdir data/ + user@dev:/tmp$ tar -xf hictk_test_data.tar.xz \ + -C data --strip-components=3 \ + test/data/hic/4DNFIZ1ZVXC8.hic9 \ + test/data/integration_tests/4DNFIZ1ZVXC8.mcool \ + test/data/integration_tests/4DNFIKNWM36K.subset.pairs.xz + + user@dev:/tmp$ ls -lah data + total 261M + drwx------ 2 dev dev 80 Sep 29 17:00 . + drwxrwxrwt 26 dev dev 960 Sep 29 17:00 .. + -rw------- 1 dev dev 128M Jun 8 19:42 4DNFIZ1ZVXC8.hic9 + -rw------- 1 dev dev 133M Jul 7 16:29 4DNFIZ1ZVXC8.mcool diff --git a/docs/file_validation.rst b/docs/file_validation.rst new file mode 100644 index 00000000..ae0a1ed1 --- /dev/null +++ b/docs/file_validation.rst @@ -0,0 +1,108 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +File validation +############### + +Why is this needed? +------------------- + +``hictk validate`` can detect several types of data corruption in .hic and .cool files, from simple file truncation due to e.g. failed downloads to subtle index corruption in .cool files. + +.. _cooler-index-corruption-label: + +Cooler index corruption +^^^^^^^^^^^^^^^^^^^^^^^ + +To make a long story short, older versions of cooler (including v0.8.3) had a bug in ``cooler zoomify`` that caused the generation of invalid file indexes. This results in duplicate pixels with different values being reported for the affected region. + +Example: + +.. 
csv-table:: Output of cooler dump for corrupted file `4DNFI9GMP2J8.mcool `_ + :file: ./assets/corrupted_mcool_example.tsv + :header-rows: 1 + :delim: tab + +Unfortunately, this is not a rare issue, as the above bug currently affects most (possibly all) .mcool files released by 4DNucleome: + +.. image:: assets/4dnucleome_bug_notice.avif + +hictk validate +-------------- + +``hictk validate`` was initially developed to detect files affected by the above issue and was later extended to also validate .cool, .scool and .hic files. + +Perform a quick check to detect truncated or otherwise invalid files: + +.. code-block:: console + + # Validate a .hic file + user@dev:/tmp$ hictk validate test/data/hic/4DNFIZ1ZVXC8.hic8 + ### SUCCESS: "test/data/hic/4DNFIZ1ZVXC8.hic8" is a valid .hic file. + + # Validate a .cool file + user@dev:/tmp$ hictk validate test/data/integration_tests/4DNFIZ1ZVXC8.mcool + uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/2500000" + is_hdf5=true + unable_to_open_file=false + file_was_properly_closed=true + missing_or_invalid_format_attr=false + missing_or_invalid_bin_type_attr=false + missing_groups=[] + is_valid_cooler=true + index_is_valid=not_checked + ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/2500000" is a valid Cooler. + uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000000" + is_hdf5=true + unable_to_open_file=false + file_was_properly_closed=true + missing_or_invalid_format_attr=false + missing_or_invalid_bin_type_attr=false + missing_groups=[] + is_valid_cooler=true + index_is_valid=not_checked + ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000000" is a valid Cooler. + ... 
+ uri="test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000" + is_hdf5=true + unable_to_open_file=false + file_was_properly_closed=true + missing_or_invalid_format_attr=false + missing_or_invalid_bin_type_attr=false + missing_groups=[] + is_valid_cooler=true + index_is_valid=not_checked + ### SUCCESS: "test/data/integration_tests/4DNFIZ1ZVXC8.mcool::/resolutions/1000" is a valid Cooler. + + +The quick check will not detect Cooler files with corrupted index, as this requires the ``--validate-index`` option: + +.. code-block:: console + + user@dev:/tmp$ hictk validate --validate-index 4DNFI9GMP2J8.mcool::/resolutions/1000000 + uri="4DNFI9GMP2J8.mcool::/resolutions/1000000" + is_hdf5=true + unable_to_open_file=false + file_was_properly_closed=true + missing_or_invalid_format_attr=false + missing_or_invalid_bin_type_attr=false + missing_groups=[] + is_valid_cooler=true + index_is_valid=false + ### FAILURE: "4DNFI9GMP2J8.mcool::/resolutions/1000000" is not a valid Cooler. + +Restoring corrupted .mcool files +-------------------------------- + +Luckily, the base resolution of .mcool files corrupted as described in :ref:`cooler-index-corruption-label` is still valid, and so corrupted resolutions can be regenerated from the base resolution. + +File restoration is automated with ``hictk fix-mcool``: + +.. code-block:: sh + + hictk fix-mcool 4DNFI9GMP2J8.mcool 4DNFI9GMP2J8.fixed.mcool + +``hictk fix-mcool`` is basically a wrapper around ``hictk zoomify`` and ``hictk balance``. + +When balancing, ``hictk fix-mcool`` will try to use the same parameters used to balance the original .mcool file. When this is not possible, ``hictk fix-mcool`` will fall back to the default parameters used by ``hictk balance``. diff --git a/docs/format_conversion.rst b/docs/format_conversion.rst new file mode 100644 index 00000000..643eb120 --- /dev/null +++ b/docs/format_conversion.rst @@ -0,0 +1,93 @@ +.. 
+ Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Format conversion +################# + +``hictk`` supports conversion between .hic and .[m]cool file formats (including .hic v9 files). + +Converting from .hic to .[m]cool +-------------------------------- + +Converting from .hic to .cool or .mcool formats consists of the following operations + +#. Fetch the list of available resolutions +#. For each resolution to be converted: + + a. Copy all raw interactions present in the .hic file + b. Copy all known normalization vectors (currently these are VC, VC_SQRT, KR, and SCALE) + +Interactions are copied using streams of data, so memory requirements remain quite modest even when converting very high resolutions. + +.. code-block:: console + + user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.mcool + + [2023-09-29 17:12:08.983] [info]: Running hictk v0.0.2-f83f93e + [2023-09-29 17:12:08.983] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.mcool (hic -> mcool)... + [2023-09-29 17:12:09.052] [info]: [1000] begin processing 1000bp matrix... + [2023-09-29 17:12:12.212] [info]: [1000] processing chr2R:11267000-11268000 at 3167564 pixels/s (cache hit rate 0.00%)... + [2023-09-29 17:12:15.346] [info]: [1000] processing chr3R:5672000-5673000 at 3190810 pixels/s (cache hit rate 0.00%)... + [2023-09-29 17:12:18.204] [info]: [1000] processing SCALE normalization vector... + [2023-09-29 17:12:18.241] [info]: [1000] processing VC normalization vector... + [2023-09-29 17:12:18.285] [info]: [1000] processing VC_SQRT normalization vector... + [2023-09-29 17:12:19.123] [info]: [1000] DONE! Processed 26658348 pixels across 8 chromosomes in 10.07s + ... + [2023-09-29 17:12:37.412] [info]: DONE! Processed 10 resolution(s) in 28.43s! 
+ [2023-09-29 17:12:37.412] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB + [2023-09-29 17:12:37.412] [info]: 4DNFIZ1ZVXC8.mcool size: 100.00 MB + + +It is also possible to convert only a subset of available resolutions by specifying resolutions to be converted with the ``--resolutions`` option. + +When specifying a single resolution, the resulting file will be in .cool format. + +.. code-block:: console + + user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.1000.cool --resolutions 1000 + + [2023-09-29 17:42:47.917] [info]: Running hictk v0.0.2-f83f93e + [2023-09-29 17:42:47.917] [info]: Converting data/4DNFIZ1ZVXC8.hic9 to 4DNFIZ1ZVXC8.cool (hic -> cool)... + [2023-09-29 17:42:47.982] [info]: [1000] begin processing 1000bp matrix... + [2023-09-29 17:42:49.982] [info]: [1000] processing chr2R:11267000-11268000 at 5005005 pixels/s (cache hit rate 93.05%)... + [2023-09-29 17:42:52.339] [info]: [1000] processing chr3R:5672000-5673000 at 4242681 pixels/s (cache hit rate 92.66%)... + [2023-09-29 17:42:54.071] [info]: [1000] processing SCALE normalization vector... + [2023-09-29 17:42:54.109] [info]: [1000] processing VC normalization vector... + [2023-09-29 17:42:54.150] [info]: [1000] processing VC_SQRT normalization vector... + [2023-09-29 17:42:54.931] [info]: [1000] DONE! Processed 26658348 pixels across 8 chromosomes in 6.95s + [2023-09-29 17:42:54.931] [info]: DONE! Processed 1 resolution(s) in 7.01s! + [2023-09-29 17:42:54.931] [info]: data/4DNFIZ1ZVXC8.hic9 size: 133.68 MB + [2023-09-29 17:42:54.931] [info]: 4DNFIZ1ZVXC8.cool size: 36.74 MB + + + +Converting from .[m]cool to .hic +-------------------------------- + +``hictk convert`` can also be used to convert .[m]cool files to .hic format. + +As ``libhictk`` is not yet capable of writing .hic files, ``hictk convert`` requires `JuicerTools `_ or `HiCTools `_ JARs and java to be available. 
+ +You should use HiCTools JAR unless you need to open the resulting .hic file with applications that do not support the latest .hic format specification. + +.. code-block:: console + + user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.mcool 4DNFIZ1ZVXC8.hic --juicer-tools-jar hic_tools.3.30.00.jar + + [2023-09-29 17:44:10.001] [info]: Running hictk v0.0.2-f83f93e + [2023-09-29 17:44:10.001] [info]: Converting data/4DNFIZ1ZVXC8.mcool to 4DNFIZ1ZVXC8.hic (mcool -> hic)... + [2023-09-29 17:44:10.004] [info]: writing chromosomes to file /tmp/hictk-tmp-XXXXjjxVhi/reference.chrom.sizes... + [2023-09-29 17:44:10.004] [info]: DONE! Wrote 8 chromosomes to file /tmp/hictk-tmp-XXXXjjxVhi/reference.chrom.sizes + [2023-09-29 17:44:10.004] [info]: writing pixels to file /tmp/hictk-tmp-XXXXjjxVhi/pixels.tsv.gz... + [2023-09-29 17:44:19.933] [info]: processing chr2R:19727000-19728000 chr2R:21162000-21163000 at 1007252 pixels/s... + [2023-09-29 17:44:31.012] [info]: processing chr3R:6457000-6458000 chr3R:21482000-21483000 at 902609 pixels/s... + [2023-09-29 17:44:37.397] [info]: wrote 26591454 pixels across 8 chromosomes to /tmp/hictk-tmp-XXXXjjxVhi/pixels.tsv.gz in 27.39s + [2023-09-29 17:44:37.398] [info]: running juicer_tools pre... + ... + +**Tips:** + +* Use JuicerTools instead of HiCTools if the output .hic file needs to be opened by applications that do not support the latest .hic format specification. +* When converting large .[m]cool files to .hic, ``hictk`` needs to create large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space +* When converting .[m]cool files to .hic, ``hictk`` tries to use ``pigz`` instead of plain ``gzip`` to compress temporary files. This can dramatically reduce conversion time. Please make sure ``pigz`` is installed and increase the number of processing threads with option ``--thread``. 
#!/usr/bin/env bash

# Copyright (c) 2023 Roberto Rossini
#
# SPDX-License-Identifier: MIT

# Generate the reStructuredText source of the "CLI Reference" page.
#
# Usage: generate_cli_reference.sh path/to/hictk
#
# The script runs `hictk --help` and `hictk <subcommand> --help` for every
# entry in `subcommands` and prints to stdout one RST section per command,
# with each help message wrapped in a `code-block:: text` directive.

set -eu
set -o pipefail

if [ $# -ne 1 ]; then
  # Usage errors are diagnostics: send them to stderr so that they never end up
  # in a redirected .rst file.
  1>&2 echo "Usage: $0 path/to/hictk"
  exit 1
fi

hictk="$1"

subcommands=(
  balance
  convert
  dump
  fix-mcool
  load
  merge
  validate
  zoomify
)

# Print the help message of `hictk [args...]` formatted as directive content:
#  - stderr is merged into stdout (`2>&1 |` is the portable spelling of `|&`)
#  - the path used to invoke the binary is replaced with plain "hictk"
#  - every line is indented so that it belongs to the preceding code-block
#  - blank (whitespace-only) lines are dropped
render_help() {
  "$hictk" "$@" --help 2>&1 |
    sed "s|$hictk|hictk|g" |
    sed 's/^/  /' |
    sed '/^[[:space:]]*$/d'
}

# Page header followed by the top-level help message.
# NOTE: the directive marker must end with "::", otherwise Sphinx does not
# recognise it as a code-block directive.
cat << EOT
..
   Copyright (C) 2023 Roberto Rossini
   SPDX-License-Identifier: MIT

CLI Reference
#############

.. code-block:: text

EOT

render_help

# One section per subcommand: title, dashed underline of matching length,
# then the subcommand's help message.
for subcmd in "${subcommands[@]}"; do
  header="hictk $subcmd"
  # printf repeats its format once per argument: one '-' (octal 055) for each
  # character of the header.
  underline="$(printf '\055%.0s' $(seq "${#header}"))"

  cat << EOT

$header
$underline

.. code-block:: text

EOT

  render_help "$subcmd"
done
+ +hictk can be compiled on most UNIX-like systems (including many Linux distributions and MacOS) as well as Windows. See the :doc:`build instructions <./installation_src>` for more details. + +Python bindings for hictk can be installed using pip or conda. Refer to hictkpy `documentation `_ for more details. + +.. only:: not latex + + How to cite this project? + ------------------------- + +.. only:: latex + + .. rubric:: How to cite this project? + +Please use the following BibTeX template to cite hictk in scientific +discourse: + +.. code-block:: bibtex + + @misc{hictk, + author = {Roberto Rossini}, + year = {2023}, + note = {https://github.com/paulsengroup/hictk}, + title = {hictk: blazing fast toolkit to work with .hic and .cool files} + } + +If you use ``hictk convert`` to convert .[m]cool files to .hic format you should also cite JuicerTools or HiCTools. + + +.. only:: not latex + + Table of contents + ----------------- + +.. toctree:: + :caption: Installation + :maxdepth: 1 + + installation + installation_src + +.. toctree:: + :caption: Introduction + :maxdepth: 1 + + quickstart_cli + quickstart_api + downloading_test_datasets + file_validation + format_conversion + reading_interactions + creating_coolers + creating_multires_coolers + balancing_matrices + + +.. toctree:: + :caption: CLI and API Reference + :maxdepth: 2 + + cli_reference + cpp_api/index + Python API diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 00000000..545bcd65 --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,94 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Installation +############ + +Conda (bioconda) +================ + +hictk package for Linux and MacOS is available on bioconda and can be installed as follows: + +.. 
code-block:: console + + user@dev:/tmp$ conda create -n hictk -c conda-forge -c bioconda hictk + + user@dev:/tmp$ conda activate hictk + + (hictk) user@dev:/tmp$ whereis hictk + hictk: /home/user/.miniconda3/envs/hictk/bin/hictk + + (hictk) user@dev:/tmp$ hictk --version + hictk-v0.0.2-bioconda + + +Containers (Docker or Singularity/Apptainer) +============================================ + +First, make sure you follow the instructions on how to install Docker or Singularity/Apptainer on your OS. + +.. raw:: html + +
+ Installing Docker + +The following instructions assume you have root/admin permissions. + +* `Linux `_ +* `MacOS `_ +* `Windows `_ + +On some Linux distributions just installing Docker is not enough. +You also need to start (and optionally enable) the appropriate service(s). +This is usually done with one of the following: + +.. code-block:: sh + + sudo systemctl start docker + sudo systemctl start docker.service + + +Refer to `Docker `_ or your OS/distribution documentation for more details. + +.. raw:: html + +
+ +Pulling hictk Docker image +-------------------------- + +hictk Docker images are available on `GHCR.io `_ +and `DockerHub `_. + +Downloading and running the latest stable release can be done as follows: + +.. code-block:: console + + # Using Docker, may require sudo + user@dev:/tmp$ docker run ghcr.io/paulsengroup/hictk:0.0.2 --help + + # Using Singularity/Apptainer + user@dev:/tmp$ singularity run ghcr.io/paulsengroup/hictk:0.0.2 --help + + Blazing fast tools to work with .hic and .cool files. + Usage: /usr/local/bin/hictk [OPTIONS] SUBCOMMAND + + Options: + -h,--help Print this help message and exit + -V,--version Display program version information and exit + + Subcommands: + convert Convert HiC matrices to a different format. + dump Dump data from .hic and Cooler files to stdout. + load Build .cool files from interactions in various text formats. + merge Merge coolers. + validate Validate .hic and Cooler files. + zoomify Convert single-resolution Cooler file to multi-resolution by coarsening. + +The above will print hictk's help message, and is equivalent to running :code:`hictk --help` on the command line (assuming hictk is available on your machine). + +Installing from source +====================== + +Please refer to hictk's :doc:`build instructions <./installation_src>`. diff --git a/docs/installation_src.rst b/docs/installation_src.rst new file mode 100644 index 00000000..c262adf4 --- /dev/null +++ b/docs/installation_src.rst @@ -0,0 +1,302 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Installation (source) +##################### + +Instructions assume hictk is being built on a UNIX environment. + +Building on Windows follows the same logic but some of the commands may be slightly different. + +Build instructions +================== + +hictk can be compiled on most UNIX-like systems (including many Linux distributions, MacOS) and Windows. 
+ +Build requirements +------------------ + +Compiling hictk requires a compiler toolchain supporting C++17, such as: + +* GCC 8+ +* Clang 8+ +* Apple-Clang 10.0+ + +Furthermore, the following tools are required: + +* CMake 3.25+ +* Conan 2+ +* git 2.7+ +* make or ninja +* Python3.6+ (including :code:`pip`, required to install Conan) + + +We recommend installing CMake and Conan in a Python `virtualenv `_, but you are of course free to install build dependencies in any way you want. + +.. code-block:: bash + + python3 -m venv /tmp/venv + /tmp/venv/bin/python3 -m pip install pip setuptools --upgrade + /tmp/venv/bin/python3 -m pip install 'cmake>=3.25' 'conan>=2' ninja + + # NOTE: It's important to activate the venv after installing CMake + . /tmp/venv/bin/activate + + whereis cmake # cmake: /tmp/venv/bin/cmake + whereis conan # conan: /tmp/venv/bin/conan + whereis ninja # ninja: /tmp/venv/bin/ninja + + cmake --version + conan --version + + # Detect compiler toolchain. It is usually a good idea to explicitly set CC and CXX + CC=gcc CXX=g++ conan profile detect --force + +Getting the source code +----------------------- + +Download from the `Release `_ page (recommended). + +.. code-block:: bash + + mkdir /tmp/hictk + curl -L 'https://github.com/paulsengroup/hictk/archive/refs/tags/v0.0.2.tar.gz' | tar --strip-components=1 -C /tmp/hictk -xzf - + + +Using git. + +.. code-block:: bash + + git clone https://github.com/paulsengroup/hictk.git /tmp/hictk + + cd /tmp/hictk + git checkout v0.0.2 # Skip this step if you want to build the latest commit from main + +Compiling hictk +--------------- + +.. code-block:: bash + + # Activate venv + . /tmp/venv/bin/activate + + # Set these variables to the number of CPU cores available on your machine + # You can check this with e.g. 
+ # python -c 'import multiprocessing as mp; print(mp.cpu_count())') + export CONAN_CPU_COUNT=8 + export CMAKE_BUILD_PARALLEL_LEVEL=8 + + # Install/build dependencies with Conan + conan install --build=missing \ + -pr default \ + -s build_type=Release \ + -s compiler.cppstd=17 \ + --output-folder=./build/ \ + . + + # This may take a while, as CMake will run Conan to build hictk dependencies. + # Do not pass -G Ninja if you want CMake to use make instead of ninja + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH="$PWD/build" \ + -DHICTK_ENABLE_TESTING=ON \ + -DHICTK_BUILD_TOOLS=ON \ + -G Ninja \ + -S /tmp/hictk \ + -B /tmp/hictk/build + + cmake --build /tmp/hictk/build + +To override the default compiler used by CMake, pass the following arguments to the first CMake command: :code:`-DCMAKE_C_COMPILER=path/to/cc -DCMAKE_CXX_COMPILER=path/to/c++` + +We highly recommend using the same compiler when running Conan and CMake. + +Running automated tests +======================= + +The steps outlined in this section are optional but highly recommended. + +Unit tests +---------- + +.. code-block:: bash + + # Activate venv + . /tmp/venv/bin/activate + + cd /tmp/hictk + ctest --test-dir build/ \ + --schedule-random \ + --output-on-failure \ + --no-tests=error \ + --timeout 120 \ + -j8 # Change this to the number of available CPU cores + +A successful run of the test suite will produce an output like the following: + +.. code-block:: console + + user@dev:/tmp/hictk$ ctest --test-dir build/ ... + ... + 63/70 Test #21: Cooler: init files - SHORT ....................................... Passed 0.02 sec + 64/70 Test #57: HiC: pixel selector fetch (observed NONE BP 10000) - LONG ........ Passed 1.53 sec + 65/70 Test #5: Cooler: index validation - SHORT ................................. Passed 3.83 sec + 66/70 Test #17: Cooler: index validation - SHORT ................................. 
Passed 3.62 sec + 67/70 Test #37: Cooler: utils merge - LONG ....................................... Passed 4.35 sec + 68/70 Test #67: Transformers (cooler) - SHORT .................................... Passed 4.11 sec + 69/70 Test #36: Cooler: dataset random iteration - MEDIUM ........................ Passed 5.50 sec + 70/70 Test #40: Cooler: dataset large read/write - LONG .......................... Passed 11.47 sec + + 100% tests passed, 0 tests failed out of 70 + + Total Test time (real) = 12.03 sec + +**All tests are expected to pass. Do not ignore test failures!** + +.. raw:: html + +
+ Troubleshooting test failures + +If one or more tests fail, try the following troubleshooting steps before reaching out for help. + +#. Make sure you are running :code:`ctest` from the root of the source tree (:code:`/tmp/hictk` if you are following the instructions). +#. Make sure you are passing the correct build folder to :code:`--test-dir`. Pass the absolute path if necessary (i.e. :code:`--test-dir=/tmp/hictk/build/` if you are following the instructions). +#. Re-run :code:`ctest` with :code:`-j1`. This can be necessary on machines with very little memory (e.g. less than 2GB). +#. Before running :code:`ctest`, create a temporary folder where your user has read-write permissions and where there are at least 100-200MB of space available. + Then set variable :code:`TMPDIR` to that folder and re-run :code:`ctest`. +#. Checksum the test dataset located under :code:`test/data/` by running :code:`sha256sum -c checksums.sha256`. + If the checksumming fails or the folder doesn't exist, download and extract the :code:`.tar.xz` file listed in file :code:`cmake/FetchTestDataset.cmake`. Make sure you run :code:`tar -xf` from the root of the repository (:code:`/tmp/hictk` if you are following the instructions). + +Example: + +.. code-block:: bash + + # Activate venv + . /tmp/venv/bin/activate + + cd /tmp/hictk + + # Make sure this is the URL listed in file cmake/FetchTestDataset.cmake + curl -L 'https://zenodo.org/record/8143316/files/hictk_test_data.tar.xz?download=1' | tar -xJf - + + # This should print "OK" if the check is successful + (cd test/data && sha256sum --quiet -c checksums.sha256 && 2>&1 echo OK) + + mkdir ~/hictk-test-dir # Remember to delete this folder + + TMPDIR="$HOME/hictk-test-dir" \ + ctest --test-dir=/tmp/hictk/build/ \ + --schedule-random \ + --output-on-failure \ + --no-tests=error \ + --timeout 600 \ + -j1 + + # rm -r ~/hictk-test-dir + +If after trying the above steps the tests are still failing, feel free to start a `discussion `_ asking for help. + +.. 
raw:: html + +
+ + +Integration tests +----------------- + +The integration test scripts depend on the following tools: + +* cooler>=0.9 +* java +* `juicer_tools `_ or `hic_tools `_ +* xz +* common UNIX shell commands + +cooler can be installed using pip: + +.. code-block:: bash + + /tmp/venv/bin/pip3 install 'cooler>=0.9' + +juicer_tools and hic_tools do not need to be installed; downloading the JAR file is enough: + +.. code-block:: bash + + curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' -o /tmp/hictk/hic_tools.jar + +If not already installed, :code:`xz` can usually be installed with your system package manager (on some Linux distributions the relevant package is called :code:`xz-utils`). + +.. code-block:: bash + + # Activate venv + . /tmp/venv/bin/activate + + cd /tmp/hictk + + # hictk convert + test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk hic_tools.jar + test/scripts/hictk_convert_hic2cool.sh build/src/hictk/hictk + + # hictk dump + test/scripts/hictk_dump_balanced.sh build/src/hictk/hictk + test/scripts/hictk_dump_bins.sh build/src/hictk/hictk + test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk + test/scripts/hictk_dump_cis.sh build/src/hictk/hictk + test/scripts/hictk_dump_gw.sh build/src/hictk/hictk + test/scripts/hictk_dump_trans.sh build/src/hictk/hictk + + # hictk load (sorted) + test/scripts/hictk_load_4dn.sh build/src/hictk/hictk sorted + test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted + test/scripts/hictk_load_coo.sh build/src/hictk/hictk sorted + + # hictk load (unsorted) + test/scripts/hictk_load_4dn.sh build/src/hictk/hictk unsorted + test/scripts/hictk_load_bg2.sh build/src/hictk/hictk unsorted + test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted + + # hictk merge + test/scripts/hictk_merge.sh build/src/hictk/hictk + + # hictk validate + test/scripts/hictk_validate.sh build/src/hictk/hictk + + # hictk zoomify + test/scripts/hictk_zoomify.sh build/src/hictk/hictk + 
+Installation +============ + +Once all tests have passed, :code:`hictk` can be installed as follows: + +.. code-block:: console + + # Activate venv + user@dev:/tmp$ . /tmp/venv/bin/activate + + # Install system-wide (requires root/admin rights) + user@dev:/tmp$ cmake --install /tmp/hictk/build + -- Install configuration: "Release" + -- Installing: /usr/local/bin/hictk + -- Set runtime path of "/usr/local/bin/hictk" to "" + -- Up-to-date: /usr/local/share/licenses/hictk/LICENSE + ... + + # Alternatively, install to custom path + user@dev:/tmp$ cmake --install /tmp/hictk/build --prefix "$HOME/.local/" + -- Install configuration: "Release" + -- Installing: /home/user/.local/bin/hictk + -- Set runtime path of "/home/user/.local/bin/hictk" to "" + -- Up-to-date: /home/user/.local/share/licenses/hictk/LICENSE + ... + +Cleaning build artifacts +======================== + +After successfully compiling hictk, the following folders can safely be removed: + +* Python virtualenv: :code:`/tmp/venv` +* hictk source tree: :code:`/tmp/hictk` + +If you are not using Conan in any other project, feel free to also delete Conan's folder :code:`~/.conan2/` diff --git a/docs/quickstart_api.rst b/docs/quickstart_api.rst new file mode 100644 index 00000000..44f45ca7 --- /dev/null +++ b/docs/quickstart_api.rst @@ -0,0 +1,198 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Quickstart (API) +################ + +The library component of hictk, libhictk, can be installed and configured in several ways. + +Installing libhictk +=================== + +Installing using Conan +---------------------- + +To install libhictk using Conan, first create a conanfile.txt like the following: + +.. code-block:: + + [requires] + hictk/0.0.2 + + [generators] + CMakeDeps + + [layout] + cmake_layout + +Next, install hictk as follows: + +.. 
code-block:: sh + + conan install conanfile.txt --build=missing --output-folder=conan_deps + +Folder ``conan_deps`` will contain all CMake module and config files required to include hictk in an application using CMake as build generator. + +Finally, add ``find_package(hictk REQUIRED)`` to your ``CMakeLists.txt`` and pass the full path to folder ``conan_deps`` to CMake through the ``CMAKE_PREFIX_PATH`` variable: + +.. code-block:: sh + + cmake -DCMAKE_PREFIX_PATH='/path/to/conan_deps' ... -B build/ -S . + + +For more options and details refer to hictk page on `ConanCenter `_. + +Installing using CMake FetchContent +----------------------------------- + +Before beginning, make sure all of hictk dependencies have been installed. +Refer to `conanfile.txt `_ for an up-to-date list of hictk dependencies. + +To install and configure hictk using `FetchContent `_, first write a ``CMakeLists.txt`` file like the following: + +.. code-block:: cmake + + cmake_minimum_required(VERSION 3.25) + cmake_policy(VERSION 3.25...3.27) + + project(myproject LANGUAGES C CXX) + + include(FetchContent) + FetchContent_Declare( + hictk + GIT_REPOSITORY "https://github.com/paulsengroup/hictk.git" + GIT_TAG v0.0.2 + SYSTEM) + + # Customize hictk build flags + set(HICTK_ENABLE_TESTING OFF) + set(HICTK_BUILD_EXAMPLES OFF) + set(HICTK_BUILD_BENCHMARKS OFF) + set(HICTK_BUILD_TOOLS OFF) + set(HICTK_INSTALL OFF) + + FetchContent_MakeAvailable(hictk) + + add_executable(main main.cpp) + target_link_libraries(main PRIVATE hictk::file) # Add other targets as necessary + +Include hictk source using CMake add_subdirectory +------------------------------------------------- + +Simply add a copy of hictk source code to your source tree (e.g. under folder ``myproject/external/hictk``), then add ``add_subdirectory("external/hictk")`` to your ``CMakeLists.txt``. 
+ + +A quick tour of libhictk +------------------------ + +libhictk is a C++17 header-only library that provides the building blocks required to build complex applications operating on .hic and .cool files. + +libhictk public API is organized in 5 main sections: + +.. cpp:namespace:: hictk + +#. Classes :cpp:class:`cooler::File`, :cpp:class:`cooler::MultiResFile` and :cpp:class:`cooler::SingleCellFile`, which can be used to read and write .cool, .mcool and .scool files respectively. +#. Class :cpp:class:`hic::File` which can be used to read .hic files +#. Class :cpp:class:`File` which wraps :cpp:class:`cooler::File` and :cpp:class:`hic::File` and provides a uniform interface to read .cool and .hic files +#. Various other classes used e.g. to model tables of bins, reference genomes and much more +#. Classes and free-standing functions to perform common operations on files or pixel iterators, such as coarsening and balancing. + +The quick tour showcases basic functionality of the generic :cpp:class:`File` class. For more detailed examples refer to hictk `examples `_ and :doc:`cpp_api/index`. + +.. code-block:: cpp + + #include + #include + #include + #include + #include + + int main() { + // const std::string path = "interactions.cool"; + // const std::string path = "interactions.mcool::/resolutions/1000"; + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + + std::for_each(selector.template begin(), selector.template end(), + [](const hictk::ThinPixel& p) { + std::cout << p.bin1_id << "\t"; + std::cout << p.bin2_id << "\t"; + std::cout << p.count << "\n"; + }); + } + + +It is often the case that we need access to more information than just bin IDs and counts. +Joining genomic coordinates to pixel counts can be done as follows: + +.. 
code-block:: cpp + + #include + #include + #include + #include + #include + + int main() { + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + const hictk::transformers::JoinGenomicCoords jselector( + selector.template begin(), selector.template end(), f.bins_ptr()); + + for (const auto& p : jselector) { + std::cout << p.coords.bin1.chrom().name() << "\t"; + std::cout << p.coords.bin1.start() << "\t"; + std::cout << p.coords.bin1.end() << "\t"; + std::cout << p.coords.bin2.chrom().name() << "\t"; + std::cout << p.coords.bin2.start() << "\t"; + std::cout << p.coords.bin2.end() << "\t"; + std::cout << p.count << "\n"; + } + } + + +The above examples work just fine, however using iterators returned by generic :cpp:class:`PixelSelector` is suboptimal. These iterators are implemented using `std::variant `_ and require checking the type of the underlying ``PixelSelector`` every iteration. The overhead of this check is quite low but still noticeable. + +We can avoid paying this overhead by using the format-specific file handles instead of the generic one, or by using `std::visit `_ as follows: + +.. code-block:: cpp + + #include + #include + #include + #include + #include + #include + + int main() { + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + + // std::visit applies the lambda function provided as first argument + // to the variant returned by selector.get(). 
+ // In this way, the type held by the std::variant is checked once + // and the underlying PixelSelector and iterators are used for all operations + std::visit( + [&](const auto& sel) { + std::for_each(sel.template begin(), sel.template end(), + [](const hictk::ThinPixel& p) { + std::cout << p.bin1_id << "\t"; + std::cout << p.bin2_id << "\t"; + std::cout << p.count << "\n"; + }); + }, + selector.get()); + } diff --git a/docs/quickstart_cli.rst b/docs/quickstart_cli.rst new file mode 100644 index 00000000..daafe2de --- /dev/null +++ b/docs/quickstart_cli.rst @@ -0,0 +1,66 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Quickstart (CLI) +################ + +First, install hictk with one of the methods listed in the :doc:`Installation <./installation>` section. + +Next, verify that hictk was installed correctly with: + +.. code-block:: console + + user@dev:/tmp$ hictk --version + hictk-v0.0.2 + +Command line interface +====================== + +The hictk CLI supports performing common operations on .hic and .cool files directly from the shell. + +Verifying file integrity +------------------------ + +.. code-block:: sh + + hictk validate interactions.cool --validate-index + + hictk validate interactions.hic + +For more detailed examples refer to :doc:`File validation <./file_validation>`. + +Reading interactions +-------------------- + +hictk supports reading interactions from .hic and .cool files through the ``hictk dump`` command: + +.. code-block:: console + + user@dev:/tmp$ hictk dump interactions.cool + 0 0 1745 + 0 1 2844 + 0 2 409 + ... + + user@dev:/tmp$ hictk dump interactions.cool --join + chr2L 0 10000 chr2L 0 10000 1745 + chr2L 0 10000 chr2L 10000 20000 2844 + chr2L 0 10000 chr2L 20000 30000 409 + ... 
+ +For more detailed examples refer to :doc:`Reading interactions <./reading_interactions>` + +Other operations +---------------- + +* :doc:`Format conversion <./format_conversion>` +* :doc:`Creating .cool files <./creating_coolers>` +* :doc:`Converting .cool files to .mcool format <./creating_multires_coolers>` +* :doc:`Balancing Hi-C matrices <./balancing_matrices>` + + +API +=== + +Refer to :doc:`Quickstart (API) <./quickstart_api>`. diff --git a/docs/reading_interactions.rst b/docs/reading_interactions.rst new file mode 100644 index 00000000..7359c25f --- /dev/null +++ b/docs/reading_interactions.rst @@ -0,0 +1,108 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Reading interactions +#################### + +``hictk`` supports reading interactions from .hic and .cool files through the ``hictk dump`` command. + +By default, interactions are dumped to stdout in COO format (row, column, count): + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 + + 7 7 1745 + 7 12 1766 + 7 17 1078 + ... + +Use option ``--join`` to instead dump interactions in bedgraph2 format: + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.mcool --resolution 1000 --join | head -n 3 + + chr2L 7000 8000 chr2L 7000 8000 1745 + chr2L 7000 8000 chr2L 12000 13000 1766 + chr2L 7000 8000 chr2L 17000 18000 1078 + ... + +All operations work on .hic as well as .[m]cool files: + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.mcool --resolution 1000 + + 7 7 1745 + 7 12 1766 + 7 17 1078 + ... + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 + + 7 7 1745 + 7 12 1766 + 7 17 1078 + ... + +Dump balanced or expected interactions: + +.. 
code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --join --balance SCALE + + chr2L 7000 8000 chr2L 7000 8000 1681.679565429688 + chr2L 7000 8000 chr2L 12000 13000 1386.554565429688 + chr2L 7000 8000 chr2L 17000 18000 878.9703979492188 + ... + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --join --matrix-type expected + + chr2L 7000 8000 chr2L 7000 8000 88.33206176757812 + chr2L 7000 8000 chr2L 12000 13000 63.43805313110352 + chr2L 7000 8000 chr2L 17000 18000 31.78345680236816 + ... + +Dump interactions overlapping a region of interest: + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --join --range chr3L:20,000,000-25,000,000 + + chr3L 20002000 20003000 chr3L 20002000 20003000 2390 + chr3L 20002000 20003000 chr3L 20007000 20008000 1285 + chr3L 20002000 20003000 chr3L 20012000 20013000 490 + ... + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --join --range chr3L:20,000,000-25,000,000 --range2 chrX + + chr3L 20002000 20003000 chrX 52000 53000 1 + chr3L 20002000 20003000 chrX 157000 158000 1 + chr3L 20002000 20003000 chrX 352000 353000 1 + ... + +Dump tables other than pixels: + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --table chroms + + chr2L 23513712 + chr2R 25286936 + chr3L 28110227 + ... + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --table normalizations + + SCALE + VC + VC_SQRT + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --table resolutions + + 1000 + 5000 + 10000 + ... 
diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..bdb18752 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +furo==2023.09.10 +sphinx==7.2.6 +sphinx-copybutton==0.5.2 +sphinxcontrib-moderncmakedomain==3.27.0 +sphinxcontrib-svg2pdfconverter==1.2.2 diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 00000000..56458991 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set(HICTK_EXAMPLES quicktour_example_001 quicktour_example_002 quicktour_example_003) + +foreach(EXAMPLE ${HICTK_EXAMPLES}) + add_executable("hictk_${EXAMPLE}" "${EXAMPLE}.cpp") + target_link_libraries( + "hictk_${EXAMPLE}" + PRIVATE hictk_project_warnings + hictk_project_options + hictk::file + hictk::transformers) +endforeach() diff --git a/examples/quicktour_example_001.cpp b/examples/quicktour_example_001.cpp new file mode 100644 index 00000000..f46ea431 --- /dev/null +++ b/examples/quicktour_example_001.cpp @@ -0,0 +1,28 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include + +// NOLINTNEXTLINE +int main() { + // const std::string path = "interactions.cool"; + // const std::string path = "interactions.mcool::/resolutions/1000"; + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + + std::for_each(selector.template begin(), selector.template end(), + [](const hictk::ThinPixel& p) { + std::cout << p.bin1_id << "\t"; + std::cout << p.bin2_id << "\t"; + std::cout << p.count << "\n"; + }); +} diff --git a/examples/quicktour_example_002.cpp b/examples/quicktour_example_002.cpp new file mode 100644 index 00000000..6eb2cefc --- /dev/null +++ b/examples/quicktour_example_002.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023 Roberto 
Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include + +// NOLINTNEXTLINE +int main() { + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + const hictk::transformers::JoinGenomicCoords jselector( + selector.template begin(), selector.template end(), f.bins_ptr()); + + for (const auto& p : jselector) { + std::cout << p.coords.bin1.chrom().name() << "\t"; + std::cout << p.coords.bin1.start() << "\t"; + std::cout << p.coords.bin1.end() << "\t"; + std::cout << p.coords.bin2.chrom().name() << "\t"; + std::cout << p.coords.bin2.start() << "\t"; + std::cout << p.coords.bin2.end() << "\t"; + std::cout << p.count << "\n"; + } +} diff --git a/examples/quicktour_example_003.cpp b/examples/quicktour_example_003.cpp new file mode 100644 index 00000000..8288c371 --- /dev/null +++ b/examples/quicktour_example_003.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include +#include + +// NOLINTNEXTLINE +int main() { + const std::string path = "interactions.hic"; + const std::uint32_t resolution = 1000; + + const hictk::File f(path, resolution); + + const auto selector = f.fetch("chr1", "chr2"); + + // std::visit applies the lambda function provided as first argument + // to the variant returned by selector.get(). 
+ // In this way, the type held by the std::variant is checked once + // and the underlying PixelSelector and iterators are used for all operations + std::visit( + [&](const auto& sel) { + std::for_each(sel.template begin(), sel.template end(), + [](const hictk::ThinPixel& p) { + std::cout << p.bin1_id << "\t"; + std::cout << p.bin2_id << "\t"; + std::cout << p.count << "\n"; + }); + }, + selector.get()); +} diff --git a/src/hictk/cli/cli_convert.cpp b/src/hictk/cli/cli_convert.cpp index 17141f7d..57cd5ed2 100644 --- a/src/hictk/cli/cli_convert.cpp +++ b/src/hictk/cli/cli_convert.cpp @@ -56,7 +56,7 @@ void Cli::make_convert_subcommand() { sc.add_option( "-r,--resolutions", c.resolutions, - "One or more resolution to be converted. By default all resolutions are converted.") + "One or more resolutions to be converted. By default all resolutions are converted.") ->check(CLI::PositiveNumber); sc.add_option( "--normalization-methods", diff --git a/src/hictk/cli/cli_fix_mcool.cpp b/src/hictk/cli/cli_fix_mcool.cpp index cd486a38..2f8f48a6 100644 --- a/src/hictk/cli/cli_fix_mcool.cpp +++ b/src/hictk/cli/cli_fix_mcool.cpp @@ -73,13 +73,13 @@ void Cli::make_fix_mcool_subcommand() { sc.add_option( "-t,--threads", c.threads, - "Maximum number of parallel threads to spawn (only applies to balancing stage).") + "Maximum number of parallel threads to spawn (only applies to the balancing stage).") ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( "-l,--compression-level", c.zstd_compression_lvl, - "Compression level used to compress temporary files using ZSTD (only applies to balancing stage).") + "Compression level used to compress temporary files using ZSTD (only applies to the balancing stage).") ->check(CLI::Range(0, 19)) ->capture_default_str(); sc.add_flag( diff --git a/src/hictk/cli/cli_zoomify.cpp b/src/hictk/cli/cli_zoomify.cpp index 457b3c0d..9c5fc534 100644 --- a/src/hictk/cli/cli_zoomify.cpp +++ 
b/src/hictk/cli/cli_zoomify.cpp @@ -45,7 +45,7 @@ void Cli::make_zoomify_subcommand() { sc.add_option( "--resolutions", c.resolutions, - "One or more resolution to be used for coarsening."); + "One or more resolutions to be used for coarsening."); sc.add_flag( "--copy-base-resolution,!--no-copy-base-resolution", diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 9ff256d9..78d7f3d7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -105,7 +105,7 @@ class File { template File(RootGroup entrypoint, Reference chroms, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0); - // Ctor for SingleResCooler + // Ctor for SingleCellCooler template File(RootGroup entrypoint, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0); @@ -248,8 +248,8 @@ class File { bool rescale = false) const; [[nodiscard]] bool has_normalization(std::string_view normalization) const; - [[nodiscard]] std::vector avail_normalizations() const; [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; + [[nodiscard]] std::vector avail_normalizations() const; std::shared_ptr read_weights(const balancing::Method &normalization, bool rescale = false) const; std::shared_ptr read_weights(const balancing::Method &normalization, diff --git a/src/libhictk/file/include/hictk/file.hpp b/src/libhictk/file/include/hictk/file.hpp index c162cff7..f82683e4 100644 --- a/src/libhictk/file/include/hictk/file.hpp +++ b/src/libhictk/file/include/hictk/file.hpp @@ -37,6 +37,13 @@ class PixelSelector { template [[nodiscard]] std::vector> read_all() const; +#ifdef HICTK_WITH_EIGEN + template + [[nodiscard]] Eigen::SparseMatrix read_sparse() const; + template + [[nodiscard]] Eigen::Matrix read_dense() const; +#endif + [[nodiscard]] const PixelCoordinates &coord1() const; [[nodiscard]] const 
PixelCoordinates &coord2() const; diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index cb617891..855bb1dc 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -50,6 +50,18 @@ inline std::vector> PixelSelector::read_all() const { return std::visit([&](const auto& sel) { return sel.template read_all(); }, _sel); } +#ifdef HICTK_WITH_EIGEN +template +inline Eigen::SparseMatrix PixelSelector::read_sparse() const { + return std::visit([&](const auto& sel) { return sel.template read_sparse(); }, _sel); +} + +template +inline Eigen::Matrix PixelSelector::read_dense() const { + return std::visit([&](const auto& sel) { return sel.template read_dense(); }, _sel); +} +#endif + inline const PixelCoordinates& PixelSelector::coord1() const { return std::visit( [&](const auto& sel) -> const PixelCoordinates& { diff --git a/test/units/file/file_test.cpp b/test/units/file/file_test.cpp index e9d8f3d7..61fb78e3 100644 --- a/test/units/file/file_test.cpp +++ b/test/units/file/file_test.cpp @@ -124,12 +124,20 @@ TEST_CASE("PixelSelector", "[file][short]") { CHECK(sel1.coord1().bin1.chrom().name() == "chr2L"); CHECK(sel1.coord2().bin1.chrom().name() == "chr2R"); CHECK(sel1.bins().bin_size() == resolution); + + CHECK(sel1.read_all().size() == 624); + CHECK(sel1.read_sparse().nonZeros() == 624); + CHECK(sel1.read_dense().sum() == 1'483'112); } SECTION("hic gw") { const auto hf = File(path_hic, resolution); auto sel1 = hf.fetch(); CHECK(sel1.coord1() == PixelCoordinates{}); CHECK(sel1.coord2() == PixelCoordinates{}); + + CHECK(sel1.read_all().size() == 10'148); + CHECK(sel1.read_sparse().nonZeros() == 10'148); + CHECK(sel1.read_dense().sum() == 155'486'075); } SECTION("cooler") { const auto clr = File(path_cooler, resolution); @@ -138,6 +146,10 @@ TEST_CASE("PixelSelector", "[file][short]") { CHECK(sel1.coord1().bin1.chrom().name() == "chr2L"); 
CHECK(sel1.coord2().bin1.chrom().name() == "chr2R"); CHECK(sel1.bins().bin_size() == resolution); + + CHECK(sel1.read_all().size() == 624); + CHECK(sel1.read_sparse().nonZeros() == 624); + CHECK(sel1.read_dense().sum() == 1'483'112); } } }