diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 8f6615d0..53e35fee 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -163,7 +163,7 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 90 \ + --timeout 240 \ -j $(nproc) |& head -n 1000 @@ -174,26 +174,12 @@ jobs: python3 -m pip install 'cooler>=0.9.2' - - name: Cache hic_tools - id: cache-hictools - uses: actions/cache@v4 - with: - key: hic_tools - path: hic_tools.jar - - - name: Download hic_tools - if: steps.cache-hictools.outputs.cache-hit != 'true' - run: | - apt-get install -y --no-install-recommends curl - curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \ - -o hic_tools.jar - - name: Run integration tests run: | - test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar + test/scripts/hictk_balance.sh build/src/hictk/hictk test/scripts/hictk_convert_hic2cool.sh build/src/hictk/hictk - test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk hic_tools.jar + test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index 4c88d595..694458e0 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -297,7 +297,7 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 180 2>&1 | + --timeout 240 2>&1 | head -n 1000 run-integration-tests: @@ -327,19 +327,6 @@ jobs: - name: Install test dependencies run: pip install -r requirements.txt - - name: Cache hic_tools - id: cache-hictools - uses: actions/cache@v4 - with: - key: hic_tools - path: hic_tools.jar - - - name: Download hic_tools - if: steps.cache-hictools.outputs.cache-hit != 'true' - run: | - curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \ - -o hic_tools.jar - - name: Restore test dataset uses: actions/cache/restore@v4 with: @@ -364,7 +351,7 @@ jobs: - name: Test hictk balance run: | - test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + test/scripts/hictk_balance.sh bin/hictk - name: Test hictk convert (hic2cool) run: | @@ -372,7 +359,7 @@ jobs: - name: Test hictk convert (cool2hic) run: | - test/scripts/hictk_convert_cool2hic.sh bin/hictk hic_tools.jar + test/scripts/hictk_convert_cool2hic.sh bin/hictk - name: Test hictk dump chroms run: | diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index c4c11ef8..0b80e815 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -351,7 +351,7 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 200 \ + --timeout 360 \ -j $(nproc) |& head -n 1000 @@ -383,20 +383,6 @@ jobs: python3 -m pip install 'cooler>=0.9.2' - - name: Cache hic_tools - id: cache-hictools - uses: actions/cache@v4 - with: - key: hic_tools - path: hic_tools.jar - - - name: Download hic_tools - if: steps.cache-hictools.outputs.cache-hit != 'true' - run: | - apt-get install -y --no-install-recommends curl - curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \ - -o hic_tools.jar - - name: Restore test dataset uses: actions/cache/restore@v4 with: @@ -421,7 +407,7 @@ jobs: - name: Test hictk balance run: | - test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + test/scripts/hictk_balance.sh bin/hictk - name: 
Test hictk convert (hic2cool) run: | @@ -429,7 +415,7 @@ jobs: - name: Test hictk convert (cool2hic) run: | - test/scripts/hictk_convert_cool2hic.sh bin/hictk hic_tools.jar + test/scripts/hictk_convert_cool2hic.sh bin/hictk - name: Test hictk dump chroms run: | diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index 85032934..b2d5ca02 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -198,8 +198,8 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 180 \ - --exclude-regex 'Cooler: dataset large read\/write.*' |& + --timeout 240 \ + --exclude-regex '.*(dataset large read\/write|HiCFileWriter).*' |& tail -n 1000 windows-ci-status-check: diff --git a/Dockerfile b/Dockerfile index 86ba3e3e..32baa9e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -113,14 +113,6 @@ RUN if [ -z "$BUILD_BASE_IMAGE" ]; then echo "Missing BUILD_BASE_IMAGE --build-a && if [ -z "$GIT_SHORT_HASH" ]; then echo "Missing GIT_SHORT_HASH --build-arg" && exit 1; fi \ && if [ -z "$CREATION_DATE" ]; then echo "Missing CREATION_DATE --build-arg" && exit 1; fi -# Install runtime dependencies -RUN apt-get update \ -&& apt-get install -y ca-certificates-java \ -&& apt-get install -y \ - openjdk-19-jre-headless \ - pigz \ -&& rm -rf /var/lib/apt/lists/* - # Export project binaries to the final build stage COPY --from=builder "$staging_dir" "$install_dir" diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 94900f53..4331e3c3 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/records/10289491/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=5e69dceb8789d923a38aed7add8fc18abfdfe531aea6effcdb7efe3c9bcf5246 + DOWNLOAD https://zenodo.org/records/10552012/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=f49c9fdd6e27f3d318cc0544510d89ed57ddd4d56b71ad32bb499895e94f396b "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/conanfile.txt b/conanfile.txt index 0ea61847..49fcb51d 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -5,6 +5,7 @@ [requires] boost/1.83.0#7825569cd0622461dec7bc87dfdf47ae bshoshany-thread-pool/3.5.0#5aaea7ccf37107d3c1ba9ca9ab246e42 +concurrentqueue/1.0.4#1e48e1c712bcfd892087c9c622a51502 cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7 eigen/3.4.0#2e192482a8acff96fe34766adca2b24c fast_float/6.0.0#6c8a953c57879453982a6ce69c0ba87c diff --git a/docs/balancing_matrices.rst b/docs/balancing_matrices.rst index 0e484316..ce5c1434 100644 --- a/docs/balancing_matrices.rst +++ b/docs/balancing_matrices.rst @@ -47,4 +47,4 @@ It should be noted that when using a large number of threads (e.g. more than 16) When the ``--in-memory`` option is not used, ``hictk`` will create a temporary file under the default temporary folder. This file stores interactions using a layout and compression that are optimized for the access pattern used by ``hictk balance``. When balancing large matrices, this file can be quite large (sometimes tens of GBs). If this is the case, it may be appropriate to change the temporary folder using the ``--tmpdir`` option. -Finally, when balancing .hic files, ``hictk`` depends on `JuicerTools `_ or `HiCTools `_ to write balancing weights back to the file. Thus, when balancing .hic files, the JAR file to one of the tools should be specified through the ``--juicer-tools-jar`` option. 
You should use JuicerTools when balancing .hic files in .hic v8 format or older and HiCTools when balancing .hic v9 files. +Finally, when balancing .hic files, only .hic v9 files and newer are supported. diff --git a/docs/cli_reference.rst b/docs/cli_reference.rst index 651a307e..c8698f7c 100644 --- a/docs/cli_reference.rst +++ b/docs/cli_reference.rst @@ -22,8 +22,8 @@ Subcommands convert Convert HiC matrices to a different format. dump Dump data from .hic and Cooler files to stdout. fix-mcool Fix corrupted .mcool files. - load Build .cool files from interactions in various text formats. - merge Merge coolers. + load Build .cool and .hic files from interactions in various text formats. + merge Merge multiple Cooler or .hic files into a single file. rename-chromosomes, rename-chroms Rename chromosomes found in a Cooler file. validate Validate .hic and Cooler files. @@ -62,7 +62,10 @@ hictk balance Maximum number of iterations. --rescale-weights,--no-rescale-weights{false} Rescale weights such that rows sum approximately to 2. - --name TEXT [weight] Name to use when writing weights to file. + --name TEXT Name to use when writing weights to file. + Defaults to ICE, INTER_ICE and GW_ICE when --mode is cis, trans and gw, respectively. + --create-weight-link Create a symbolic link to the balancing weights at clr::/bins/weight. + Ignored when balancing .hic files --in-memory Store all interactions in memory (greatly improves performance). --stdout Write balancing weights to stdout instead of writing them to the input file. --chunk-size UINT:POSITIVE [10000000] @@ -71,12 +74,8 @@ hictk balance Set verbosity of output to the console. -t,--threads UINT:UINT in [1 - 16] [1] Maximum number of parallel threads to spawn. - -l,--compression-level UINT:INT in [0 - 19] [] + -l,--compression-lvl UINT:INT in [0 - 19] [] Compression level used to compress temporary files using ZSTD. - --juicer-tools-jar TEXT:FILE - Path to juicer_tools or hic_tools JAR. - --juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [256MB] - Max heap size used by juicer_tools. -f,--force Overwrite existing files and datasets (if any). hictk convert @@ -98,8 +97,6 @@ hictk convert - cool - mcool - hic - -j,--juicer-tools-jar TEXT:FILE - Path to juicer_tools or hic_tools JAR. -r,--resolutions UINT:POSITIVE ... One or more resolutions to be converted. By default all resolutions are converted. --normalization-methods TEXT [ALL] ... @@ -108,17 +105,17 @@ hictk convert Pass NONE to avoid copying normalization vectors. --fail-if-norm-not-found Fail if any of the requested normalization vectors are missing. -g,--genome TEXT Genome assembly name. By default this is copied from the .hic file metadata. - --juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [32GB] - Max heap size used by juicer_tools. Only used when converting from cool to hic --tmpdir TEXT Path where to store temporary files. + --chunk-size UINT:POSITIVE [10000000] + Batch size to use when converting .[m]cool to .hic. -v,--verbosity UINT:INT in [1 - 4] [] Set verbosity of output to the console. -t,--threads UINT:UINT in [2 - 16] [2] Maximum number of parallel threads to spawn. When converting from hic to cool, only two threads will be used. - -l,--compression-level UINT:INT in [0 - 9] [] - Compression level used to compress temporary files. - Pass 0 to disable compression. + -l,--compression-lvl UINT:INT in [1 - 12] [6] + Compression level used to compress interactions. + Defaults to 6 and 10 for .cool and .hic files, respectively. 
-f,--force Overwrite existing files (if any). hictk dump @@ -134,7 +131,7 @@ hictk dump Options: -h,--help Print this help message and exit --resolution UINT:NONNEGATIVE - HiC matrix resolution (ignored when file is not in .hic format). + HiC matrix resolution (ignored when file is in .cool format). --matrix-type ENUM:value in {expected->2,observed->0,oe->1} OR {2,0,1} [observed] Matrix type (ignored when file is not in .hic format). --matrix-unit ENUM:value in {BP->0,FRAG->1} OR {0,1} [BP] @@ -181,7 +178,7 @@ hictk fix-mcool Set verbosity of output to the console. -t,--threads UINT:UINT in [1 - 16] [1] Maximum number of parallel threads to spawn (only applies to the balancing stage). - -l,--compression-level UINT:INT in [0 - 19] [] + -l,--compression-lvl UINT:INT in [0 - 19] [] Compression level used to compress temporary files using ZSTD (only applies to the balancing stage). -f,--force Overwrite existing files (if any). @@ -190,18 +187,18 @@ hictk load .. code-block:: text - Build .cool files from interactions in various text formats. - Usage: hictk load [OPTIONS] chrom-sizes output-uri + Build .cool and .hic files from interactions in various text formats. + Usage: hictk load [OPTIONS] chrom-sizes output-path Positionals: chrom-sizes TEXT:FILE REQUIRED Path to .chrom.sizes file. - output-uri TEXT REQUIRED Path to output Cooler (URI syntax supported). + output-path TEXT REQUIRED Path to output file. Options: -h,--help Print this help message and exit -b,--bin-size UINT:POSITIVE Excludes: --bin-table Bin size (bp). Required when --bin-table is not used. - -t,--bin-table TEXT:FILE Excludes: --bin-size + --bin-table TEXT:FILE Excludes: --bin-size Path to a BED3+ file with the bin table. -f,--format TEXT:{4dn,validpairs,bg2,coo} REQUIRED Input format. @@ -214,29 +211,44 @@ hictk load --count-as-float Interactions are floats. --assume-sorted,--assume-unsorted{false} Assume input files are already sorted. + --chunk-size UINT [10000000] + Number of pixels to buffer in memory. + -l,--compression-lvl UINT:INT bounded to [1 - 12] + Compression level used to compress interactions. + Defaults to 6 and 10 for .cool and .hic files, respectively. + -t,--threads UINT:UINT in [1 - 16] [1] + Maximum number of parallel threads to spawn. + When loading interactions in a .cool file, only a single thread will be used. + --tmpdir TEXT [/tmp] Path to a folder where to store temporary data. -v,--verbosity UINT:INT in [1 - 4] [] Set verbosity of output to the console. - --batch-size UINT [20000000] - Number of pixels to buffer in memory. - Only used when processing unsorted interactions or pairs. hictk merge ----------- .. code-block:: text - Merge coolers. - Usage: hictk merge [OPTIONS] input-coolers... + Merge multiple Cooler or .hic files into a single file. + Usage: hictk merge [OPTIONS] input-files... Positionals: - input-coolers TEXT:Cooler x 2 REQUIRED - Path to two or more Cooler files to be merged (URI syntax supported). + input-files TEXT:(Cooler) OR (HiC) x 2 REQUIRED + Path to two or more Cooler or .hic files to be merged (Cooler URI syntax supported). Options: -h,--help Print this help message and exit - -o,--output-cooler TEXT Output Cooler (URI syntax supported). - When not specified, merged interactions will be printed to stdout. - -f,--force Force overwrite output cooler. - --chunk-size UINT [5000000] + -o,--output-file TEXT REQUIRED + Output Cooler or .hic file (Cooler URI syntax supported). + --resolution UINT:NONNEGATIVE + HiC matrix resolution (ignored when input files are in .cool format). 
+ -f,--force Force overwrite output file. + --chunk-size UINT [10000000] Number of pixels to store in memory before writing to disk. + -l,--compression-lvl UINT:INT bounded to [1 - 12] + Compression level used to compress interactions. + Defaults to 6 and 10 for .cool and .hic files, respectively. + -t,--threads UINT:UINT in [1 - 16] [1] + Maximum number of parallel threads to spawn. + When merging interactions in Cooler format, only a single thread will be used. + --tmpdir TEXT [/tmp] Path to a folder where to store temporary data. -v,--verbosity UINT:INT in [1 - 4] [] Set verbosity of output to the console. @@ -282,10 +294,11 @@ hictk zoomify .. code-block:: text Convert single-resolution Cooler file to multi-resolution by coarsening. - Usage: hictk zoomify [OPTIONS] cooler [mcool] + Usage: hictk zoomify [OPTIONS] cooler/hic mcool/hic Positionals: - cooler TEXT:Cooler REQUIRED Path to a .cool file (Cooler URI syntax supported). - mcool TEXT Output path. + cooler/hic TEXT:(Cooler) OR (HiC) REQUIRED + Path to a .cool or .hic file (Cooler URI syntax supported). + mcool/hic TEXT REQUIRED Output path. Options: -h,--help Print this help message and exit --force Force overwrite existing output file(s). @@ -298,5 +311,15 @@ hictk zoomify Base resolution: 1000 Pow2: 1000, 2000, 4000, 8000... Nice: 1000, 2000, 5000, 10000... + -l,--compression-lvl UINT:INT bounded to [1 - 12] [6] + Compression level used to compress interactions. + Defaults to 6 and 12 for .mcool and .hic files, respectively. + -t,--threads UINT:UINT in [1 - 16] [1] + Maximum number of parallel threads to spawn. + When zoomifying interactions from a .cool file, only a single thread will be used. + --chunk-size UINT [10000000] + Number of pixels to buffer in memory. + Only used when zoomifying .hic files. + --tmpdir TEXT [/tmp] Path to a folder where to store temporary data. -v,--verbosity UINT:INT in [1 - 4] [] Set verbosity of output to the console. diff --git a/docs/cpp_api/cooler.rst b/docs/cpp_api/cooler.rst index fec7c503..45353d9d 100644 --- a/docs/cpp_api/cooler.rst +++ b/docs/cpp_api/cooler.rst @@ -24,22 +24,22 @@ Single-resolution Cooler (.cool) **Constructors** .. cpp:function:: File(const File &other) = delete; - .. cpp:function:: File(File &&other) noexcept(noexcept_move_ctor()) = default; + .. cpp:function:: File(File &&other) noexcept = default; - .. cpp:function:: [[nodiscard]] explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); - .. cpp:function:: [[nodiscard]] explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); + .. cpp:function:: [[nodiscard]] explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); + .. cpp:function:: [[nodiscard]] explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); **Factory functions** - .. cpp:function:: [[nodiscard]] static File open_random_access(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); - .. cpp:function:: [[nodiscard]] static File open_read_once(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); - .. 
cpp:function:: template [[nodiscard]] static File create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + .. cpp:function:: [[nodiscard]] static File open_random_access(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); + .. cpp:function:: [[nodiscard]] static File open_read_once(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); + .. cpp:function:: template [[nodiscard]] static File create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); + .. cpp:function:: template [[nodiscard]] static File create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists = false, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); **Open/close methods** - .. cpp:function:: [[nodiscard]] static File open_random_access(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); - .. cpp:function:: [[nodiscard]] static File open_read_once(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); - .. cpp:function:: template [[nodiscard]] static File create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists = false, Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + .. cpp:function:: [[nodiscard]] static File open_random_access(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); + .. cpp:function:: [[nodiscard]] static File open_read_once(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); .. cpp:function:: void close(); @@ -48,7 +48,7 @@ Single-resolution Cooler (.cool) **Operators** .. cpp:function:: File &operator=(const File &other) = delete; - .. cpp:function:: File &operator=(File &&other) noexcept(noexcept_move_assigment_op()) = default; + .. cpp:function:: File &operator=(File &&other) noexcept = default; .. cpp:function:: [[nodiscard]] explicit operator bool() const noexcept; diff --git a/docs/cpp_api/hic.rst b/docs/cpp_api/hic.rst index a04107df..cab2917c 100644 --- a/docs/cpp_api/hic.rst +++ b/docs/cpp_api/hic.rst @@ -59,7 +59,7 @@ File handle .. cpp:function:: [[nodiscard]] bool has_resolution(std::uint32_t resolution) const; - .. cpp:function:: [[nodiscard]] const std::string &url() const noexcept; + .. cpp:function:: [[nodiscard]] const std::string &path() const noexcept; .. cpp:function:: [[nodiscard]] const std::string &name() const noexcept; .. cpp:function:: [[nodiscard]] std::int32_t version() const noexcept; diff --git a/docs/cpp_api/shared.rst b/docs/cpp_api/shared.rst index 00068e6e..0b035647 100644 --- a/docs/cpp_api/shared.rst +++ b/docs/cpp_api/shared.rst @@ -233,6 +233,10 @@ Reference genome .. cpp:function:: [[nodiscard]] const Chromosome& longest_chromosome() const; .. cpp:function:: [[nodiscard]] const Chromosome& chromosome_with_longest_name() const; + **Other** + .. 
cpp:function:: [[nodiscard]] Reference remove_ALL() const; + .. cpp:function:: [[nodiscard]] Reference add_ALL(std::uint32_t scaling_factor = 1) const; + Bin Table --------- diff --git a/docs/creating_cool_and_hic_files.rst b/docs/creating_cool_and_hic_files.rst new file mode 100644 index 00000000..711b08e5 --- /dev/null +++ b/docs/creating_cool_and_hic_files.rst @@ -0,0 +1,100 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Creating .cool and .hic files +############################# + +hictk supports creating .cool and .hic files from text files in the following formats: + +* `pairs (4DN-DCIC) `_ +* `validPairs (nf-core/hic) `_ +* `bedGraph2 `_ +* `COO `_ + +File requirements: + +* ``dm6.chrom.sizes`` - `download `__ +* ``4DNFIKNWM36K.pairs.gz`` - `download `__ + +.. code-block:: console + + # Create a 10kbp .cool file using dm6 as reference + user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dm6 --bin-size 10000 dm6.chrom.sizes 4DNFIKNWM36K.10000.cool + + [2024-01-23 15:15:00.520] [info]: Running hictk v0.0.6-45c36af-dirty + [2024-01-23 15:15:00.531] [info]: writing chunk #1 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... + [2024-01-23 15:15:23.762] [info]: done writing chunk #1 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". + [2024-01-23 15:15:23.762] [info]: writing chunk #2 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... + [2024-01-23 15:15:49.042] [info]: done writing chunk #2 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". + [2024-01-23 15:15:49.042] [info]: writing chunk #3 to intermediate file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp"... + [2024-01-23 15:15:49.834] [info]: done writing chunk #3 to tmp file "/tmp/4DNFIKNWM36K.10000.cool.tmp/4DNFIKNWM36K.10000.cool.tmp". + [2024-01-23 15:15:49.836] [info]: merging 3 chunks into "4DNFIKNWM36K.10000.cool"... + [2024-01-23 15:15:55.118] [info]: processing chr3L:15100000-15110000 chr3L:16230000-16240000 at 4789272 pixels/s... + [2024-01-23 15:15:59.718] [info]: ingested 119208613 interactions (18122865 nnz) in 59.197723453s! + + # Create a 10kbp .hic file using dm6 as reference + user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dm6 --bin-size 10000 dm6.chrom.sizes 4DNFIKNWM36K.10000.hic + + [2024-01-23 15:45:19.969] [info]: Running hictk v0.0.6-570037c-dirty + [2024-01-23 15:45:42.439] [info]: preprocessing chunk #1 at 452919 pixels/s... + [2024-01-23 15:46:09.182] [info]: preprocessing chunk #2 at 303750 pixels/s... + [2024-01-23 15:46:11.184] [info]: writing header at offset 0 + [2024-01-23 15:46:11.184] [info]: begin writing interaction blocks to file "4DNFIKNWM36K.10000.hic"... + [2024-01-23 15:46:11.184] [info]: [10000 bp] writing pixels for chr3R:chr3R matrix at offset 50632... + [2024-01-23 15:46:13.295] [info]: [10000 bp] written 2264963 pixels for chr3R:chr3R matrix + [2024-01-23 15:46:13.295] [info]: [10000 bp] writing pixels for chr3R:chr3L matrix at offset 4235718... + [2024-01-23 15:46:14.611] [info]: [10000 bp] written 1610264 pixels for chr3R:chr3L matrix + ... + [2024-01-23 15:46:44.065] [info]: [10000 bp] initializing expected value vector + [2024-01-23 15:46:50.531] [info]: [10000 bp] computing expected vector density + [2024-01-23 15:46:51.157] [info]: writing 1 expected value vectors at offset 32065110... 
+ [2024-01-23 15:46:51.158] [info]: writing 0 normalized expected value vectors at offset 32078017... + [2024-01-23 15:46:51.194] [info]: ingested 119208613 interactions (18122865 nnz) in 91.225341628s! + +**Tips:** + +* When creating large .hic files, ``hictk`` needs to create potentially large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. + + +Merging multiple files +---------------------- + +Multiple .cool and .hic files using the same reference genome and resolution can be merged using ``hictk merge``: + +.. code-block:: console + + # Merge multiple cooler files + + user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 -o 4DNFIZ1ZVXC8.merged.cool + + [2023-09-29 19:24:49.479] [info]: Running hictk v0.0.2 + [2023-09-29 19:24:49.479] [info]: begin merging 2 coolers... + [2023-09-29 19:24:52.032] [info]: processing chr2R:11267000-11268000 chr4:1052000-1053000 at 3976143 pixels/s... + [2023-09-29 19:24:55.157] [info]: processing chr3R:5812000-5813000 chr3R:23422000-23423000 at 3201024 pixels/s... + [2023-09-29 19:24:57.992] [info]: DONE! Merging 2 coolers took 8.51s! + [2023-09-29 19:24:57.992] [info]: 4DNFIZ1ZVXC8.merged.cool size: 36.23 MB + + # Merge multiple .hic files + + user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.hic9 data/4DNFIZ1ZVXC8.hic9 -o 4DNFIZ1ZVXC8.10000.merged.hic --resolution 10000 + + [2024-01-23 15:49:23.248] [info]: Running hictk v0.0.6-570037c-dirty + [2024-01-23 15:49:23.248] [info]: begin merging 2 .hic files... + [2024-01-23 15:49:31.101] [info]: ingesting pixels at 1352814 pixels/s... + [2024-01-23 15:49:37.777] [info]: writing header at offset 0 + [2024-01-23 15:49:37.777] [info]: begin writing interaction blocks to file "4DNFIZ1ZVXC8.10000.merged.hic"... + [2024-01-23 15:49:37.777] [info]: [10000 bp] writing pixels for chr2L:chr2L matrix at offset 212... + [2024-01-23 15:49:39.060] [info]: [10000 bp] written 1433133 pixels for chr2L:chr2L matrix + [2024-01-23 15:49:39.060] [info]: [10000 bp] writing pixels for chr2L:chr2R matrix at offset 2619165... + ... + [2024-01-23 15:49:58.624] [info]: [10000 bp] initializing expected value vector + [2024-01-23 15:50:05.276] [info]: [10000 bp] computing expected vector density + [2024-01-23 15:50:05.276] [info]: writing 1 expected value vectors at offset 31936601... + [2024-01-23 15:50:05.276] [info]: writing 0 normalized expected value vectors at offset 31949508... + [2024-01-23 15:50:05.299] [info]: DONE! Merging 2 files took 42.05s! + [2024-01-23 15:50:05.299] [info]: 4DNFIZ1ZVXC8.10000.merged.hic size: 31.95 MB + +**Tips:** + +* When merging many, large .hic files, ``hictk`` needs to create potentially large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. diff --git a/docs/creating_coolers.rst b/docs/creating_coolers.rst deleted file mode 100644 index 96d99fdb..00000000 --- a/docs/creating_coolers.rst +++ /dev/null @@ -1,49 +0,0 @@ -.. - Copyright (C) 2023 Roberto Rossini - SPDX-License-Identifier: MIT - -Creating .cool files -#################### - -hictk supports creating .cool files from text files in the following formats: - -* `pairs (4DN-DCIC) `_ -* `validPairs (nf-core/hic) `_ -* `bedGraph2 `_ -* `COO `_ - -File requirements: - -* ``dm6.chrom.sizes`` - `download `__ -* ``4DNFIKNWM36K.pairs.gz`` - `download `__ - -.. 
code-block:: console - - # Create a 10kbp .cool file using dm6 as reference - user@dev:/tmp$ zcat 4DNFIKNWM36K.pairs.gz | hictk load --format 4dn --assembly dms6 dm6.chrom.sizes 10000 4DNFIKNWM36K.1000.cool - - [2023-09-29 19:16:51.962] [info]: Running hictk v0.0.2 - [2023-09-29 19:16:51.962] [info]: begin loading un-sorted pairs... - [2023-09-29 19:16:51.983] [info]: writing chunk #1 to intermediate file "4DNFIKNWM36K.1000.cool.tmp"... - [2023-09-29 19:17:39.715] [info]: done writing chunk #1 to tmp file "4DNFIKNWM36K.1000.cool.tmp". - [2023-09-29 19:17:39.715] [info]: writing chunk #2 to intermediate file "4DNFIKNWM36K.1000.cool.tmp"... - [2023-09-29 19:17:39.719] [info]: done writing chunk #2 to tmp file "4DNFIKNWM36K.1000.cool.tmp". - [2023-09-29 19:17:39.721] [info]: merging 2 chunks into "4DNFIKNWM36K.1000.cool"... - [2023-09-29 19:17:41.716] [info]: processing chr3L:15100000-15110000 chr3L:16220000-16230000 at 5073567 pixels/s... - - -Merging multiple Cooler files ------------------------------ - -Multiple .cool files using the same reference genome and resolution can be merged using ``hictk merge``: - -.. code-block:: console - - user@dev:/tmp$ hictk merge data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 -o 4DNFIZ1ZVXC8.merged.cool - - [2023-09-29 19:24:49.479] [info]: Running hictk v0.0.2 - [2023-09-29 19:24:49.479] [info]: begin merging 2 coolers... - [2023-09-29 19:24:52.032] [info]: processing chr2R:11267000-11268000 chr4:1052000-1053000 at 3976143 pixels/s... - [2023-09-29 19:24:55.157] [info]: processing chr3R:5812000-5813000 chr3R:23422000-23423000 at 3201024 pixels/s... - [2023-09-29 19:24:57.992] [info]: DONE! Merging 2 coolers took 8.51s! - [2023-09-29 19:24:57.992] [info]: 4DNFIZ1ZVXC8.merged.cool size: 36.23 MB diff --git a/docs/creating_multires_coolers.rst b/docs/creating_multires_coolers.rst deleted file mode 100644 index fb5d6f87..00000000 --- a/docs/creating_multires_coolers.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. - Copyright (C) 2023 Roberto Rossini - SPDX-License-Identifier: MIT - -Converting .cool to .mcool -########################## - -Interactions from a single-resolution Cooler file (.cool) can be used to generate a multi-resolution Cooler (.mcool) by iterative coarsening using ``hictk zoomify`` - -.. code-block:: console - - user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.mcool - - [2023-09-29 19:28:39.926] [info]: Running hictk v0.0.2 - [2023-09-29 19:28:39.929] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 13 times (1000 -> 1000 -> 2000 -> 5000 -> 10000 -> 20000 -> 50000 -> 100000 -> 200000 -> 500000 -> 1000000 -> 2000000 -> 5000000 -> 10000000) - [2023-09-29 19:28:39.929] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 - [2023-09-29 19:28:40.119] [info]: generating 2000 resolution from 1000 (2x) - [2023-09-29 19:28:40.343] [info]: [1000 -> 2000] processing chr2L:1996000-1998000 at 4484305 pixels/s... - [2023-09-29 19:28:40.663] [info]: [1000 -> 2000] processing chr2L:4932000-4934000 at 3125000 pixels/s... - [2023-09-29 19:28:40.973] [info]: [1000 -> 2000] processing chr2L:7986000-7988000 at 3236246 pixels/s... - ... - [2023-09-29 19:29:12.513] [info]: generating 10000000 resolution from 5000000 (2x) - [2023-09-29 19:29:12.519] [info]: DONE! Processed 13 resolution(s) in 32.59s! 
- - # Coarsen a single resolution - user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.cool --resolutions 50000 - - [2023-09-29 19:30:52.476] [info]: Running hictk v0.0.2 - [2023-09-29 19:30:52.482] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 2 times (1000 -> 1000 -> 50000) - [2023-09-29 19:30:52.482] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 - [2023-09-29 19:30:52.668] [info]: generating 50000 resolution from 1000 (50x) - [2023-09-29 19:30:53.789] [info]: [1000 -> 50000] processing chr2L:23000000-23050000 at 896057 pixels/s... - [2023-09-29 19:30:55.005] [info]: [1000 -> 50000] processing chr3L:4600000-4650000 at 822368 pixels/s... - [2023-09-29 19:30:56.440] [info]: [1000 -> 50000] processing chr3R:32050000-32079331 at 696864 pixels/s... - [2023-09-29 19:30:56.863] [info]: DONE! Processed 2 resolution(s) in 4.39s! diff --git a/docs/creating_multires_files.rst b/docs/creating_multires_files.rst new file mode 100644 index 00000000..3122f24b --- /dev/null +++ b/docs/creating_multires_files.rst @@ -0,0 +1,79 @@ +.. + Copyright (C) 2023 Roberto Rossini + SPDX-License-Identifier: MIT + +Creating multi-resolution files (.hic and .mcool) +################################################# + +Converting .cool to .mcool +__________________________ + +Interactions from a single-resolution Cooler file (.cool) can be used to generate a multi-resolution Cooler (.mcool) by iterative coarsening using ``hictk zoomify`` + +.. code-block:: console + + user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.mcool + + [2023-09-29 19:28:39.926] [info]: Running hictk v0.0.2 + [2023-09-29 19:28:39.929] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 13 times (1000 -> 1000 -> 2000 -> 5000 -> 10000 -> 20000 -> 50000 -> 100000 -> 200000 -> 500000 -> 1000000 -> 2000000 -> 5000000 -> 10000000) + [2023-09-29 19:28:39.929] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2023-09-29 19:28:40.119] [info]: generating 2000 resolution from 1000 (2x) + [2023-09-29 19:28:40.343] [info]: [1000 -> 2000] processing chr2L:1996000-1998000 at 4484305 pixels/s... + [2023-09-29 19:28:40.663] [info]: [1000 -> 2000] processing chr2L:4932000-4934000 at 3125000 pixels/s... + [2023-09-29 19:28:40.973] [info]: [1000 -> 2000] processing chr2L:7986000-7988000 at 3236246 pixels/s... + ... + [2023-09-29 19:29:12.513] [info]: generating 10000000 resolution from 5000000 (2x) + [2023-09-29 19:29:12.519] [info]: DONE! Processed 13 resolution(s) in 32.59s! + + # Coarsen a single resolution + user@dev:/tmp$ hictk zoomify data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 out.cool --resolutions 50000 + + [2023-09-29 19:30:52.476] [info]: Running hictk v0.0.2 + [2023-09-29 19:30:52.482] [info]: coarsening cooler at data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 2 times (1000 -> 1000 -> 50000) + [2023-09-29 19:30:52.482] [info]: copying 1000 resolution from data/4DNFIZ1ZVXC8.mcool::/resolutions/1000 + [2023-09-29 19:30:52.668] [info]: generating 50000 resolution from 1000 (50x) + [2023-09-29 19:30:53.789] [info]: [1000 -> 50000] processing chr2L:23000000-23050000 at 896057 pixels/s... + [2023-09-29 19:30:55.005] [info]: [1000 -> 50000] processing chr3L:4600000-4650000 at 822368 pixels/s... + [2023-09-29 19:30:56.440] [info]: [1000 -> 50000] processing chr3R:32050000-32079331 at 696864 pixels/s... + [2023-09-29 19:30:56.863] [info]: DONE! Processed 2 resolution(s) in 4.39s! 
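The resolutions produced by ``hictk zoomify`` can also be inspected programmatically through libhictk. Below is a minimal sketch based only on entry points referenced elsewhere in this changeset (``cooler::MultiResFile::resolutions()`` and ``hic::utils::list_resolutions()``); the Cooler include path and the file names are assumptions/placeholders, not part of this changeset.

.. code-block:: cpp

   // Minimal sketch: list the resolutions stored in multi-resolution files
   // produced by `hictk zoomify`. File names are placeholders and the cooler
   // header path is an assumption.
   #include <fmt/format.h>

   #include "hictk/cooler/multires_cooler.hpp"  // assumed header for cooler::MultiResFile
   #include "hictk/hic/utils.hpp"

   int main() {
     // Multi-resolution Cooler, e.g. the out.mcool generated above
     const hictk::cooler::MultiResFile mclr("out.mcool");
     for (const auto& res : mclr.resolutions()) {
       fmt::print(FMT_STRING("out.mcool: {} bp\n"), res);
     }

     // Multi-resolution .hic file
     for (const auto& res : hictk::hic::utils::list_resolutions("out.hic")) {
       fmt::print(FMT_STRING("out.hic: {} bp\n"), res);
     }

     return 0;
   }

These are the same helpers this changeset uses in ``hictk balance`` to enumerate the resolutions of multi-resolution files.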
+ +Converting a single-resolution .hic to a multi-resolution .hic +______________________________________________________________ + +Interactions from a .hic file (like the one generated by ``hictk load``) can be used to generate a multi-resolution .hic file by iterative coarsening using ``hictk zoomify``. +hictk will copy interactions for resolutions that are available in the input file. +Interactions at resolutions missing from the input file will be generated by iterative coarsening. + +.. code-block:: console + + user@dev:/tmp$ hictk zoomify 4DNFIZ1ZVXC8.hic9 4DNFIZ1ZVXC8.zoomified.hic --threads 8 + + [2024-01-23 16:59:57.369] [info]: Running hictk v0.0.6-570037c-dirty + [2024-01-23 16:59:57.369] [info]: copying resolution 1000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: generating 2000 resolution from 1000 (2x) + [2024-01-23 16:59:57.369] [info]: copying resolution 5000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: copying resolution 10000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: generating 20000 resolution from 10000 (2x) + [2024-01-23 16:59:57.369] [info]: copying resolution 50000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: copying resolution 100000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: generating 200000 resolution from 100000 (2x) + [2024-01-23 16:59:57.369] [info]: copying resolution 500000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: copying resolution 1000000 from "4DNFIZ1ZVXC8.hic9" + [2024-01-23 16:59:57.369] [info]: generating 2000000 resolution from 1000000 (2x) + [2024-01-23 16:59:57.369] [info]: generating 5000000 resolution from 1000000 (5x) + [2024-01-23 16:59:57.369] [info]: generating 10000000 resolution from 5000000 (2x) + [2024-01-23 16:59:57.379] [info]: [1000 bp] ingesting interactions... + [2024-01-23 17:00:02.183] [info]: ingesting pixels at 2157032 pixels/s... + [2024-01-23 17:00:07.271] [info]: ingesting pixels at 1965795 pixels/s... + ... + [2024-01-23 17:02:04.842] [info]: [1000 bp] computing expected vector density + [2024-01-23 17:02:05.325] [info]: [2000 bp] computing expected vector density + [2024-01-23 17:02:06.291] [info]: [5000 bp] computing expected vector density + [2024-01-23 17:02:06.292] [info]: writing 13 expected value vectors at offset 193918320... + [2024-01-23 17:02:06.293] [info]: writing 0 normalized expected value vectors at offset 194161639... + [2024-01-23 17:02:06.318] [info]: DONE! Processed 13 resolution(s) in 128.95s! + + +**Tips:** + +* When zoomifying large .hic files, ``hictk`` may need to create large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. diff --git a/docs/format_conversion.rst b/docs/format_conversion.rst index 643eb120..0c5b0309 100644 --- a/docs/format_conversion.rst +++ b/docs/format_conversion.rst @@ -67,27 +67,39 @@ Converting from .[m]cool to .hic ``hictk convert`` can also be used to convert .[m]cool files to .hic format. -As ``libhictk`` is not yet capable of writing .hic files, ``hictk convert`` requires `JuicerTools `_ or `HiCTools `_ JARs and java to be available. - -You should use HiCTools JAR unless you need to open the resulting .hic file with applications that do not support the latest .hic format specification. +The conversion steps are similar to those carried out to convert .hic to .[m]cool .. 
code-block:: console - user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.mcool 4DNFIZ1ZVXC8.hic --juicer-tools-jar hic_tools.3.30.00.jar - - [2023-09-29 17:44:10.001] [info]: Running hictk v0.0.2-f83f93e - [2023-09-29 17:44:10.001] [info]: Converting data/4DNFIZ1ZVXC8.mcool to 4DNFIZ1ZVXC8.hic (mcool -> hic)... - [2023-09-29 17:44:10.004] [info]: writing chromosomes to file /tmp/hictk-tmp-XXXXjjxVhi/reference.chrom.sizes... - [2023-09-29 17:44:10.004] [info]: DONE! Wrote 8 chromosomes to file /tmp/hictk-tmp-XXXXjjxVhi/reference.chrom.sizes - [2023-09-29 17:44:10.004] [info]: writing pixels to file /tmp/hictk-tmp-XXXXjjxVhi/pixels.tsv.gz... - [2023-09-29 17:44:19.933] [info]: processing chr2R:19727000-19728000 chr2R:21162000-21163000 at 1007252 pixels/s... - [2023-09-29 17:44:31.012] [info]: processing chr3R:6457000-6458000 chr3R:21482000-21483000 at 902609 pixels/s... - [2023-09-29 17:44:37.397] [info]: wrote 26591454 pixels across 8 chromosomes to /tmp/hictk-tmp-XXXXjjxVhi/pixels.tsv.gz in 27.39s - [2023-09-29 17:44:37.398] [info]: running juicer_tools pre... + user@dev:/tmp$ hictk convert data/4DNFIZ1ZVXC8.mcool 4DNFIZ1ZVXC8.hic + + [2024-01-23 17:19:34.045] [info]: Running hictk v0.0.6-570037c-dirty + [2024-01-23 17:19:34.045] [info]: Converting 4DNFIZ1ZVXC8.mcool to 4DNFIZ1ZVXC8.hic (mcool -> hic)... + [2024-01-23 17:19:37.808] [info]: ingesting pixels at 2700513 pixels/s... + [2024-01-23 17:19:41.916] [info]: ingesting pixels at 2434275 pixels/s... + [2024-01-23 17:19:48.685] [info]: ingesting pixels at 2500000 pixels/s... + [2024-01-23 17:19:52.753] [info]: ingesting pixels at 2458815 pixels/s... + [2024-01-23 17:19:59.034] [info]: ingesting pixels at 2805049 pixels/s... + [2024-01-23 17:20:07.190] [info]: writing header at offset 0 + [2024-01-23 17:20:07.190] [info]: begin writing interaction blocks to file "4DNFIZ1ZVXC8.hic"... + [2024-01-23 17:20:07.190] [info]: [1000 bp] writing pixels for chr2L:chr2L matrix at offset 248... + [2024-01-23 17:20:07.595] [info]: [1000 bp] written 2676654 pixels for chr2L:chr2L matrix + [2024-01-23 17:20:07.651] [info]: [5000 bp] writing pixels for chr2L:chr2L matrix at offset 4303035... + [2024-01-23 17:20:08.257] [info]: [5000 bp] written 2676654 pixels for chr2L:chr2L matrix + [2024-01-23 17:20:08.366] [info]: [10000 bp] writing pixels for chr2L:chr2L matrix at offset 9144982... + [2024-01-23 17:20:08.821] [info]: [10000 bp] written 1433133 pixels for chr2L:chr2L matrix ... + [2024-01-23 17:21:30.092] [info]: [5000 bp] computing expected vector density + [2024-01-23 17:21:30.240] [info]: [5000 bp] computing expected vector density + [2024-01-23 17:21:30.297] [info]: [1000 bp] computing expected vector density + [2024-01-23 17:21:30.784] [info]: [5000 bp] computing expected vector density + [2024-01-23 17:21:30.784] [info]: writing 50 normalized expected value vectors at offset 142822186... + [2024-01-23 17:21:30.785] [info]: writing 400 normalization vectors at offset 143709792... + [2024-01-23 17:21:30.839] [info]: DONE! Processed 10 resolution(s) in 116.79s! + [2024-01-23 17:21:30.839] [info]: 4DNFIZ1ZVXC8.mcool size: 139.38 MB + [2024-01-23 17:21:30.839] [info]: 4DNFIZ1ZVXC8.hic size: 147.52 MB **Tips:** -* Use JuicerTools instead of HiCTools if the output .hic file needs to be opened by applications that do not support the latest .hic format specification. -* When converting large .[m]cool files to .hic, ``hictk`` needs to create large temporary files. 
When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space -* When converting .[m]cool files to .hic, ``hictk`` tries to use ``pigz`` instead of plain ``gzip`` to compress temporary files. This can dramatically reduce conversion time. Please make sure ``pigz`` is installed and increase the number of processing threads with option ``--thread``. +* When converting large .[m]cool files to .hic, ``hictk`` may need to create large temporary files. When this is the case, use option ``--tmpdir`` to set the temporary folder to a path with sufficient space. +* When converting .[m]cool files to .hic certain conversion steps can be performed in parallel. To improve performance, please make sure to increase the number of processing threads with option ``--thread``. diff --git a/docs/index.rst b/docs/index.rst index d7176e01..dc574209 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -77,8 +77,8 @@ If you use ``hictk convert`` to convert .[m]cool files to .hic format you should file_validation format_conversion reading_interactions - creating_coolers - creating_multires_coolers + creating_cool_and_hic_files + creating_multires_files balancing_matrices diff --git a/docs/installation_src.rst b/docs/installation_src.rst index c4fefd27..0595c96c 100644 --- a/docs/installation_src.rst +++ b/docs/installation_src.rst @@ -178,7 +178,7 @@ Example: cd /tmp/hictk # Make sure this is the URL listed in file cmake/FetchTestDataset.cmake - curl -L 'https://zenodo.org/record/8143316/files/hictk_test_data.tar.xz?download=1' | tar -xJf - + curl -L 'https://zenodo.org/records/10522583/files/hictk_test_data.tar.xz?download=1' | tar -xJf - # This should print "OK" if the check is successful (cd test/data && sha256sum --quiet -c checksums.sha256 && 2>&1 echo OK) diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp index 9ef3a7db..445f5a79 100644 --- a/src/hictk/balance/balance.cpp +++ b/src/hictk/balance/balance.cpp @@ -8,18 +8,14 @@ #include #include -#include -#include #include #include #include #include #include -#include #include #include #include -#include #include #include @@ -31,72 +27,23 @@ #include "hictk/cooler/uri.hpp" #include "hictk/file.hpp" #include "hictk/hic.hpp" +#include "hictk/hic/file_writer.hpp" #include "hictk/hic/utils.hpp" #include "hictk/hic/validation.hpp" -#include "hictk/tools/common.hpp" #include "hictk/tools/config.hpp" -#include "hictk/tools/juicer_tools.hpp" namespace hictk::tools { -static void write_weights_hic(const hic::File& hf, const BalanceConfig& c, - const std::vector& weights) { - auto tmpfile = c.tmp_dir / std::filesystem::path{hf.name()}.filename(); - for (std::size_t i = 0; i < 1024; ++i) { - if (!std::filesystem::exists(tmpfile)) { - break; - } - - tmpfile.replace_extension(".tmp" + std::to_string(i)); - } - - if (std::filesystem::exists(tmpfile)) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile)); - } - - try { - { - const std::unique_ptr f(std::fopen(tmpfile.string().c_str(), "ae")); - if (!bool(f)) { - throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), tmpfile); - } - - std::ptrdiff_t i0 = 0; - - for (const auto& chrom : hf.chromosomes()) { - if (chrom.is_all()) { - continue; - } - fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), c.name, chrom.name(), - hf.bin_size()); - - const auto num_bins = (chrom.size() + hf.bin_size() - 1) / hf.bin_size(); - const auto i1 = i0 + static_cast(num_bins); - std::for_each(weights.begin() + 
i0, weights.begin() + i1, [&](const double w) { - std::isnan(w) ? fmt::print(f.get(), FMT_COMPILE(".\n")) - : fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w); - if (!bool(f)) { // NOLINT - throw fmt::system_error( - errno, FMT_STRING("an error occurred while writing weights to file {}"), tmpfile); - } - }); - - i0 = i1; - } - } - - auto jt = run_juicer_tools_add_norm(c.juicer_tools_jar, tmpfile, hf.url(), c.juicer_tools_xmx); - jt->wait(); - if (jt->exit_code() != 0) { - throw std::runtime_error( - fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), jt->exit_code())); - } - } catch (...) { - std::error_code ec{}; - std::filesystem::remove(tmpfile, ec); +static void write_weights_hic( + hic::internal::HiCFileWriter& hfw, const BalanceConfig& c, + const phmap::flat_hash_map>& weights, bool force_overwrite) { + for (const auto& [resolution, weights_] : weights) { + std::vector weights_f(weights_.size()); + std::transform(weights_.begin(), weights_.end(), weights_f.begin(), + [](const auto w) { return static_cast(1.0 / w); }); + hfw.add_norm_vector(c.name, "BP", resolution, weights_f, force_overwrite); } - std::filesystem::remove(tmpfile); + hfw.write_norm_vectors_and_norm_expected_values(); } static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, @@ -105,10 +52,16 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, const std::vector& scale) { const auto& [file, grp] = cooler::parse_cooler_uri(uri); const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name); - SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), file, path); + const auto link_path = fmt::format(FMT_STRING("{}/bins/weight"), grp); + SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), file, path); const HighFive::File clr(file, HighFive::File::ReadWrite); + if (c.symlink_to_weight && clr.exist(link_path) && !c.force) { + throw std::runtime_error(fmt::format( + FMT_STRING("unable to create link to {}::{}: object already exists"), file, link_path)); + } + if (clr.exist(path)) { assert(c.force); clr.unlink(path); @@ -138,12 +91,18 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, dset.write_attribute("scale", scale); dset.write_attribute("var", variance); } + + if (c.symlink_to_weight) { + SPDLOG_INFO(FMT_STRING("Linking weights to {}::{}..."), file, link_path); + if (clr.exist(link_path)) { + clr.unlink(link_path); + } + clr.getGroup(grp).createSoftLink(link_path, dset()); + } } // NOLINTNEXTLINE(*-rvalue-reference-param-not-moved) -static int balance_singleres_file(File&& f, const BalanceConfig& c) { - std::filesystem::path tmpfile{}; - +static int balance_cooler(cooler::File&& f, const BalanceConfig& c) { if (!c.force && !c.stdout_ && f.has_normalization(c.name)) { throw std::runtime_error( fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. 
Pass " @@ -151,21 +110,7 @@ static int balance_singleres_file(File&& f, const BalanceConfig& c) { c.name, f.path())); } - if (!c.in_memory) { - tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename(); - for (std::size_t i = 0; i < 1024; ++i) { - if (!std::filesystem::exists(tmpfile)) { - break; - } - - tmpfile.replace_extension(".tmp" + std::to_string(i)); - } - - if (std::filesystem::exists(tmpfile)) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile)); - } - } + const auto tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename(); const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags, c.min_nnz, c.min_count, c.mad_max, @@ -179,8 +124,7 @@ static int balance_singleres_file(File&& f, const BalanceConfig& c) { mode = balancing::ICE::Type::trans; } - const auto balancer = - std::visit([&](const auto& ff) { return balancing::ICE(ff, mode, params); }, f.get()); + const balancing::ICE balancer(f, mode, params); const auto weights = balancer.get_weights(c.rescale_marginals); if (c.stdout_) { @@ -189,44 +133,81 @@ static int balance_singleres_file(File&& f, const BalanceConfig& c) { return 0; } - if (f.is_cooler()) { - const auto uri = f.uri(); - f.get().close(); - write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale()); - return 0; + const auto uri = f.uri(); + f.close(); + write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale()); + return 0; +} + +// NOLINTNEXTLINE(*-rvalue-reference-param-not-moved) +static int balance_hic(const BalanceConfig& c) { + const auto resolutions = hic::utils::list_resolutions(c.path_to_input); + for (const auto& res : resolutions) { + const hic::File f(c.path_to_input.string(), res); + if (!c.force && !c.stdout_ && f.has_normalization(c.name)) { + throw std::runtime_error( + fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. 
Pass " + "--force to overwrite existing weights."), + c.name, f.path())); + } + } + + const auto tmpfile = c.tmp_dir / std::filesystem::path{c.path_to_input}.filename(); + + const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags, + c.min_nnz, c.min_count, c.mad_max, + tmpfile, c.chunk_size, c.threads}; + balancing::ICE::Type mode{}; + if (c.mode == "gw") { + mode = balancing::ICE::Type::gw; + } else if (c.mode == "cis") { + mode = balancing::ICE::Type::cis; + } else { + mode = balancing::ICE::Type::trans; } - write_weights_hic(f.get(), c, weights); + phmap::flat_hash_map> weights{resolutions.size()}; + for (const auto& res : resolutions) { + SPDLOG_INFO(FMT_STRING("balancing resolution {}..."), res); + const hic::File f(c.path_to_input.string(), res); + const balancing::ICE balancer(f, mode, params); + + if (c.stdout_) { + std::for_each(weights.begin(), weights.end(), + [&](const auto w) { fmt::print(FMT_COMPILE("{}\n"), w); }); + } else { + weights.emplace(res, balancer.get_weights(c.rescale_marginals)); + } + } + hic::internal::HiCFileWriter hfw(c.path_to_input.string(), c.threads); + write_weights_hic(hfw, c, weights, c.force); return 0; } -static int balance_multires(const BalanceConfig& c) { - const auto resolutions = cooler::MultiResFile(c.path_to_input.string()).resolutions(); +static int balance_multires_cooler(const BalanceConfig& c) { + const cooler::MultiResFile mclr(c.path_to_input.string()); - for (const auto& res : resolutions) { - balance_singleres_file( - File(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res)), c); + for (const auto& res : mclr.resolutions()) { + SPDLOG_INFO(FMT_STRING("balancing resolution {}..."), res); + balance_cooler(mclr.open(res), c); } return 0; } int balance_subcmd(const BalanceConfig& c) { - if (cooler::utils::is_multires_file(c.path_to_input.string())) { - return balance_multires(c); - } + [[maybe_unused]] const internal::TmpDir tmp_dir{c.tmp_dir}; - std::vector resolutions{}; - if (hic::utils::is_hic_file(c.path_to_input)) { - resolutions = hic::utils::list_resolutions(c.path_to_input); - } else { - resolutions.push_back(File(c.path_to_input.string()).bin_size()); + if (hic::utils::is_hic_file(c.path_to_input.string())) { + return balance_hic(c); } - for (const auto& res : resolutions) { - balance_singleres_file(File(c.path_to_input.string(), res), c); + if (cooler::utils::is_multires_file(c.path_to_input.string())) { + return balance_multires_cooler(c); } + balance_cooler(cooler::File(c.path_to_input.string()), c); + return 0; } } // namespace hictk::tools diff --git a/src/hictk/cli/cli_balance.cpp b/src/hictk/cli/cli_balance.cpp index ccedf4da..7efc1e75 100644 --- a/src/hictk/cli/cli_balance.cpp +++ b/src/hictk/cli/cli_balance.cpp @@ -16,6 +16,7 @@ #include #include +#include "hictk/file.hpp" #include "hictk/hic/validation.hpp" #include "hictk/tools/cli.hpp" #include "hictk/tools/config.hpp" @@ -99,7 +100,14 @@ void Cli::make_balance_subcommand() { sc.add_option( "--name", c.name, - "Name to use when writing weights to file.") + "Name to use when writing weights to file.\n" + "Defaults to ICE, INTER_ICE and GW_ICE when --mode is cis, trans and gw, respectively.") + ->capture_default_str(); + sc.add_flag( + "--create-weight-link" , + c.symlink_to_weight, + "Create a symbolic link to the balancing weights at clr::/bins/weight.\n" + "Ignored when balancing .hic files") ->capture_default_str(); sc.add_flag( "--in-memory", @@ -130,23 +138,11 @@ void Cli::make_balance_subcommand() { 
->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( - "-l,--compression-level", + "-l,--compression-lvl", c.zstd_compression_lvl, "Compression level used to compress temporary files using ZSTD.") ->check(CLI::Range(0, 19)) ->capture_default_str(); - sc.add_option( - "--juicer-tools-jar", - c.juicer_tools_jar, - "Path to juicer_tools or hic_tools JAR.") - ->check(CLI::ExistingFile); - sc.add_option( - "--juicer-tools-memory", - c.juicer_tools_xmx, - "Max heap size used by juicer_tools.") - ->default_str(fmt::format(FMT_STRING("{:.0f}MB"), double(c.juicer_tools_xmx) / 1.0e6)) - ->check(CLI::PositiveNumber) - ->transform(CLI::AsSizeValue(true)); sc.add_flag( "-f,--force", c.force, @@ -158,14 +154,16 @@ void Cli::make_balance_subcommand() { } void Cli::validate_balance_subcommand() const { - const auto& c = std::get(_config); + [[maybe_unused]] const auto& c = std::get(_config); std::vector errors; - const auto juicer_tools_jar_parsed = - !_cli.get_subcommand("balance")->get_option("--juicer-tools-jar")->empty(); - if (hic::utils::is_hic_file(c.path_to_input) && !c.stdout_ && !juicer_tools_jar_parsed) { - errors.emplace_back( - "option --juicer-tools-jar is required when balancing files in .hic format."); + const auto input_format = infer_input_format(c.path_to_input); + if (input_format == "hic") { + const auto avail_resolutions = hic::utils::list_resolutions(c.path_to_input); + const hic::File f(c.path_to_input.string(), avail_resolutions.back()); + if (f.version() < 9) { + errors.emplace_back("balancing .hic files v8 and older is not currently supported."); + } } if (!errors.empty()) { @@ -179,6 +177,25 @@ void Cli::validate_balance_subcommand() const { void Cli::transform_args_balance_subcommand() { auto& c = std::get(_config); + if (c.name.empty()) { + if (c.mode == "cis") { + c.name = "ICE"; + } else if (c.mode == "trans") { + c.name = "INTER_ICE"; + } else { + assert(c.mode == "gw"); + c.name = "GW_ICE"; + } + } + + const auto input_format = infer_input_format(c.path_to_input); + auto input_path = c.path_to_input; + if (input_format == "cool") { + input_path = cooler::File(c.path_to_input.string()).path(); + } + + c.tmp_dir /= input_path.filename().string() + ".tmp"; + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; diff --git a/src/hictk/cli/cli_convert.cpp b/src/hictk/cli/cli_convert.cpp index 18ed2fb3..8d3ad61a 100644 --- a/src/hictk/cli/cli_convert.cpp +++ b/src/hictk/cli/cli_convert.cpp @@ -21,6 +21,7 @@ #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/validation.hpp" +#include "hictk/file.hpp" #include "hictk/hic.hpp" #include "hictk/hic/utils.hpp" #include "hictk/hic/validation.hpp" @@ -62,11 +63,6 @@ void Cli::make_convert_subcommand() { "- hic\n") ->check(CLI::IsMember({"cool", "mcool", "hic"})) ->default_str("auto"); - sc.add_option( - "-j,--juicer-tools-jar", - c.juicer_tools_jar, - "Path to juicer_tools or hic_tools JAR.") - ->check(CLI::ExistingFile); sc.add_option( "-r,--resolutions", c.resolutions, @@ -88,17 +84,16 @@ void Cli::make_convert_subcommand() { "-g,--genome", c.genome, "Genome assembly name. By default this is copied from the .hic file metadata."); - sc.add_option( - "--juicer-tools-memory", - c.juicer_tools_xmx, - "Max heap size used by juicer_tools. 
Only used when converting from cool to hic") - ->default_str(fmt::format(FMT_STRING("{:.0f}GB"), double(c.juicer_tools_xmx) / 1.0e9)) - ->check(CLI::PositiveNumber) - ->transform(CLI::AsSizeValue(true)); sc.add_option( "--tmpdir", c.tmp_dir, "Path where to store temporary files."); + sc.add_option( + "--chunk-size", + c.chunk_size, + "Batch size to use when converting .[m]cool to .hic.") + ->check(CLI::PositiveNumber) + ->capture_default_str(); sc.add_option( "-v,--verbosity", c.verbosity, @@ -113,11 +108,11 @@ void Cli::make_convert_subcommand() { ->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( - "-l,--compression-level", - c.gzip_compression_lvl, - "Compression level used to compress temporary files.\n" - "Pass 0 to disable compression.") - ->check(CLI::Range(0, 9)) + "-l,--compression-lvl", + c.compression_lvl, + "Compression level used to compress interactions.\n" + "Defaults to 6 and 10 for .cool and .hic files, respectively.") + ->check(CLI::Range(1, 12)) ->capture_default_str(); sc.add_flag( "-f,--force", @@ -174,11 +169,6 @@ void Cli::validate_convert_subcommand() const { fmt::format(FMT_STRING("{} is not in .hic, .cool or .mcool format"), c.path_to_input)); } - if ((is_cool || is_mcool) && c.juicer_tools_jar.empty()) { - errors.emplace_back( - fmt::format(FMT_STRING("--juicer-tools-jar is required when converting to .hic."))); - } - if (!c.output_format.empty()) { if ((is_hic && c.output_format == "hic") || (is_cool && c.output_format == "cool") || (is_mcool && c.output_format == "mcool")) { @@ -220,6 +210,7 @@ void Cli::validate_convert_subcommand() const { void Cli::transform_args_convert_subcommand() { auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("convert"); c.input_format = infer_input_format(c.path_to_input); if (c.output_format.empty()) { @@ -234,16 +225,26 @@ void Cli::transform_args_convert_subcommand() { c.genome = infer_assembly(c.path_to_input, c.resolutions.back(), c.input_format); } + if (c.normalization_methods.empty()) { + if (c.input_format == "mcool") { + c.normalization_methods = cooler::MultiResFile(c.path_to_input.string()) + .open(c.resolutions.back()) + .avail_normalizations(); + } else { + c.normalization_methods = + File(c.path_to_input.string(), c.resolutions.back()).avail_normalizations(); + } + } + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; - if (c.tmp_dir.empty()) { - c.tmp_dir = c.path_to_output.parent_path(); - } + c.tmp_dir /= c.path_to_output.filename().string() + ".tmp"; - c.tmp_dir /= c.path_to_output.filename(); - c.tmp_dir.replace_extension(".tmp"); + if (sc.get_option("--compression-lvl")->empty()) { + c.compression_lvl = c.output_format == "hic" ? 
10 : 6; + } } } // namespace hictk::tools diff --git a/src/hictk/cli/cli_dump.cpp b/src/hictk/cli/cli_dump.cpp index e648b8d1..8c2b185b 100644 --- a/src/hictk/cli/cli_dump.cpp +++ b/src/hictk/cli/cli_dump.cpp @@ -47,7 +47,7 @@ void Cli::make_dump_subcommand() { sc.add_option( "--resolution", c.resolution, - "HiC matrix resolution (ignored when file is not in .hic format).") + "HiC matrix resolution (ignored when file is in .cool format).") ->check(CLI::NonNegativeNumber); sc.add_option( diff --git a/src/hictk/cli/cli_fix_mcool.cpp b/src/hictk/cli/cli_fix_mcool.cpp index 0b2c012c..21f9795d 100644 --- a/src/hictk/cli/cli_fix_mcool.cpp +++ b/src/hictk/cli/cli_fix_mcool.cpp @@ -83,7 +83,7 @@ void Cli::make_fix_mcool_subcommand() { ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( - "-l,--compression-level", + "-l,--compression-lvl", c.zstd_compression_lvl, "Compression level used to compress temporary files using ZSTD (only applies to the balancing stage).") ->check(CLI::Range(0, 19)) @@ -116,9 +116,9 @@ void Cli::validate_fix_mcool_subcommand() const { if (!sc->get_option("--in-memory")->empty()) { warnings.emplace_back("option --in-memory is ignored when --skip-balancing is provided."); } - if (!sc->get_option("--compression-level")->empty()) { + if (!sc->get_option("--compression-lvl")->empty()) { warnings.emplace_back( - "option --compression-level is ignored when --skip-balancing is provided."); + "option --compression-lvl is ignored when --skip-balancing is provided."); } if (!sc->get_option("--chunk-size")->empty()) { warnings.emplace_back("option --chunk-size is ignored when --skip-balancing is provided."); @@ -143,6 +143,8 @@ void Cli::validate_fix_mcool_subcommand() const { void Cli::transform_args_fix_mcool_subcommand() { auto& c = std::get(_config); + c.tmp_dir /= (c.path_to_input.filename().string() + ".tmp"); + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index e274acd9..fd214dab 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -23,7 +23,8 @@ namespace hictk::tools { void Cli::make_load_subcommand() { auto& sc = - *_cli.add_subcommand("load", "Build .cool files from interactions in various text formats.") + *_cli.add_subcommand("load", + "Build .cool and .hic files from interactions in various text formats.") ->fallthrough() ->preparse_callback([this]([[maybe_unused]] std::size_t i) { assert(_config.index() == 0); @@ -42,9 +43,9 @@ void Cli::make_load_subcommand() { ->required(); sc.add_option( - "output-uri", - c.uri, - "Path to output Cooler (URI syntax supported).") + "output-path", + c.output_path, + "Path to output file.") ->required(); sc.add_option( @@ -55,7 +56,7 @@ void Cli::make_load_subcommand() { ->check(CLI::PositiveNumber); sc.add_option( - "-t,--bin-table", + "--bin-table", c.path_to_bin_table, "Path to a BED3+ file with the bin table.") ->check(CLI::ExistingFile); @@ -98,19 +99,39 @@ void Cli::make_load_subcommand() { "Assume input files are already sorted.") ->capture_default_str(); + sc.add_option( + "--chunk-size", + c.batch_size, + "Number of pixels to buffer in memory.") + ->capture_default_str(); + + sc.add_option( + "-l,--compression-lvl", + c.compression_lvl, + "Compression level used to compress interactions.\n" + "Defaults to 6 and 10 for .cool and .hic files, respectively.") + 
->check(CLI::Bound(1, 12)); + + sc.add_option( + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn.\n" + "When loading interactions in a .cool file, only a single thread will be used.") + ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) + ->capture_default_str(); + + sc.add_option( + "--tmpdir", + c.tmp_dir, + "Path to a folder where to store temporary data.") + ->capture_default_str(); + sc.add_option( "-v,--verbosity", c.verbosity, "Set verbosity of output to the console.") ->check(CLI::Range(1, 4)) ->capture_default_str(); - - sc.add_option( - "--batch-size", - c.batch_size, - "Number of pixels to buffer in memory.\n" - "Only used when processing unsorted interactions or pairs.") - ->capture_default_str(); // clang-format on sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table")); @@ -125,16 +146,21 @@ void Cli::validate_load_subcommand() const { const auto& c = std::get(_config); const auto& sc = *_cli.get_subcommand("load"); - if (!c.force && std::filesystem::exists(c.uri)) { + if (!c.force && std::filesystem::exists(c.output_path)) { errors.emplace_back(fmt::format( - FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri)); + FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.output_path)); } - if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) { + if (c.path_to_bin_table.empty() && c.bin_size == 0) { assert(c.bin_size == 0); errors.emplace_back("--bin-size is required when --bin-table is not specified."); } + const auto output_format = infer_output_format(c.output_path); + if (!c.path_to_bin_table.empty() && output_format == "hic") { + errors.emplace_back("--bin-table is not supported when generating .hic files."); + } + if ((c.format == "bg2" || c.format == "coo") && !sc.get_option("--bin-table")->empty()) { errors.emplace_back( "specifying bins through the --bin-table is not supported when ingesting pre-binned " @@ -162,6 +188,8 @@ void Cli::transform_args_load_subcommand() { auto& c = std::get(_config); const auto& sc = *_cli.get_subcommand("load"); + c.output_format = infer_output_format(c.output_path); + if (sc.get_option("--one-based")->empty()) { if (c.format == "4dn" || c.format == "validpairs") { c.offset = -1; @@ -170,6 +198,12 @@ void Cli::transform_args_load_subcommand() { c.offset = c.one_based ? -1 : 0; } + c.tmp_dir /= (std::filesystem::path(c.output_path).filename().string() + ".tmp"); + + if (sc.get_option("--compression-lvl")->empty()) { + c.compression_lvl = c.output_format == "hic" ? 
10 : 6; + } + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; diff --git a/src/hictk/cli/cli_merge.cpp b/src/hictk/cli/cli_merge.cpp index 277a6b06..204e6cf5 100644 --- a/src/hictk/cli/cli_merge.cpp +++ b/src/hictk/cli/cli_merge.cpp @@ -23,35 +23,42 @@ namespace hictk::tools { void Cli::make_merge_subcommand() { - auto& sc = *_cli.add_subcommand("merge", "Merge coolers.") - ->fallthrough() - ->preparse_callback([this]([[maybe_unused]] std::size_t i) { - assert(_config.index() == 0); - _config = MergeConfig{}; - }); + auto& sc = + *_cli.add_subcommand("merge", "Merge multiple Cooler or .hic files into a single file.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = MergeConfig{}; + }); _config = MergeConfig{}; auto& c = std::get(_config); // clang-format off sc.add_option( - "input-coolers", - c.input_uris, - "Path to two or more Cooler files to be merged (URI syntax supported).") - ->check(IsValidCoolerFile) + "input-files", + c.input_files, + "Path to two or more Cooler or .hic files to be merged (Cooler URI syntax supported).") + ->check(IsValidCoolerFile | IsValidHiCFile) ->expected(2, std::numeric_limits::max()) ->required(); sc.add_option( - "-o,--output-cooler", - c.output_uri, - "Output Cooler (URI syntax supported).\n" - "When not specified, merged interactions will be printed to stdout."); + "-o,--output-file", + c.output_file, + "Output Cooler or .hic file (Cooler URI syntax supported).") + ->required(); + + sc.add_option( + "--resolution", + c.resolution, + "HiC matrix resolution (ignored when input files are in .cool format).") + ->check(CLI::NonNegativeNumber); sc.add_flag( "-f,--force", c.force, - "Force overwrite output cooler.") + "Force overwrite output file.") ->capture_default_str(); sc.add_option( @@ -60,6 +67,27 @@ void Cli::make_merge_subcommand() { "Number of pixels to store in memory before writing to disk.") ->capture_default_str(); + sc.add_option( + "-l,--compression-lvl", + c.compression_lvl, + "Compression level used to compress interactions.\n" + "Defaults to 6 and 10 for .cool and .hic files, respectively.") + ->check(CLI::Bound(1, 12)); + + sc.add_option( + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn.\n" + "When merging interactions in Cooler format, only a single thread will be used.") + ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) + ->capture_default_str(); + + sc.add_option( + "--tmpdir", + c.tmp_dir, + "Path to a folder where to store temporary data.") + ->capture_default_str(); + sc.add_option( "-v,--verbosity", c.verbosity, @@ -72,15 +100,70 @@ void Cli::make_merge_subcommand() { _config = std::monostate{}; } +static bool check_all_files_are_hic(const std::vector& paths) { + return std::all_of(paths.begin(), paths.end(), + [](const auto& path) { return hic::utils::is_hic_file(path); }); +} + +static bool check_all_files_are_cooler(const std::vector& paths) { + return std::all_of(paths.begin(), paths.end(), + [](const auto& path) { return cooler::utils::is_cooler(path); }); +} + +static bool check_all_files_are_multires_cooler(const std::vector& paths) { + return std::all_of(paths.begin(), paths.end(), + [](const auto& path) { return cooler::utils::is_multires_file(path); }); +} + +static bool check_all_files_are_singlecell_cooler(const std::vector& paths) { + return std::all_of(paths.begin(), 
paths.end(), + [](const auto& path) { return cooler::utils::is_scool_file(path); }); +} + void Cli::validate_merge_subcommand() const { assert(_cli.get_subcommand("merge")->parsed()); std::vector errors; + std::vector warnings; const auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("merge"); - if (!c.force && std::filesystem::exists(c.output_uri)) { + if (!c.force && std::filesystem::exists(c.output_file)) { errors.emplace_back(fmt::format( - FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.output_uri)); + FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.output_file)); + } + + const auto is_hic = check_all_files_are_hic(c.input_files); + const auto is_cooler = check_all_files_are_cooler(c.input_files); + const auto is_mcooler = check_all_files_are_multires_cooler(c.input_files); + const auto is_scool = check_all_files_are_singlecell_cooler(c.input_files); + + if (is_scool) { + errors.emplace_back("merging files in .scool format is not supported."); + } + + const auto output_format = infer_output_format(c.output_file); + + auto input_output_format_mismatch = is_hic && output_format != "hic"; + input_output_format_mismatch |= ((is_cooler || is_mcooler) && output_format != "cool"); + + if (input_output_format_mismatch) { + errors.emplace_back( + "detected mismatch in input-output formats: merging files of different formats is not " + "supported."); + } + + if (c.resolution == 0 && (is_hic || is_mcooler)) { + errors.emplace_back("--resolution is mandatory when input files are in .hic or .mcool format."); + } + + const auto resolution_parsed = !sc.get_option("--resolution")->empty(); + if (is_cooler && resolution_parsed) { + warnings.emplace_back("--resolution is ignored when input files are in .[s]cool format."); + } + + for (const auto& w : warnings) { + SPDLOG_WARN(FMT_STRING("{}"), w); } if (!errors.empty()) { @@ -93,6 +176,15 @@ void Cli::validate_merge_subcommand() const { void Cli::transform_args_merge_subcommand() { auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("merge"); + + c.output_format = c.output_file.empty() ? "text" : infer_output_format(c.output_file); + + if (sc.get_option("--compression-lvl")->empty()) { + c.compression_lvl = c.output_format == "hic" ?
10 : 6; + } + + c.tmp_dir /= c.output_file.filename().string() + ".tmp"; // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); diff --git a/src/hictk/cli/cli_zoomify.cpp b/src/hictk/cli/cli_zoomify.cpp index 4a72abcd..e285b0cc 100644 --- a/src/hictk/cli/cli_zoomify.cpp +++ b/src/hictk/cli/cli_zoomify.cpp @@ -37,16 +37,17 @@ void Cli::make_zoomify_subcommand() { // clang-format off sc.add_option( - "cooler", - c.input_uri, - "Path to a .cool file (Cooler URI syntax supported).") - ->check(IsValidCoolerFile) + "cooler/hic", + c.path_to_input, + "Path to a .cool or .hic file (Cooler URI syntax supported).") + ->check(IsValidCoolerFile | IsValidHiCFile) ->required(); sc.add_option( - "mcool", - c.output_path, - "Output path."); + "mcool/hic", + c.path_to_output, + "Output path.") + ->required(); sc.add_flag( "--force", @@ -74,6 +75,35 @@ void Cli::make_zoomify_subcommand() { "Nice: 1000, 2000, 5000, 10000...\n") ->default_str("--nice-steps"); + sc.add_option( + "-l,--compression-lvl", + c.compression_lvl, + "Compression level used to compress interactions.\n" + "Defaults to 6 and 12 for .mcool and .hic files, respectively.") + ->check(CLI::Bound(1, 12)) + ->capture_default_str(); + + sc.add_option( + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn.\n" + "When zoomifying interactions from a .cool file, only a single thread will be used.") + ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) + ->capture_default_str(); + + sc.add_option( + "--chunk-size", + c.batch_size, + "Number of pixels to buffer in memory.\n" + "Only used when zoomifying .hic files.") + ->capture_default_str(); + + sc.add_option( + "--tmpdir", + c.tmp_dir, + "Path to a folder where to store temporary data.") + ->capture_default_str(); + sc.add_option( "-v,--verbosity", c.verbosity, @@ -109,8 +139,7 @@ void Cli::make_zoomify_subcommand() { } static std::vector detect_invalid_resolutions( - const cooler::File& clr, const std::vector& resolutions) { - const auto base_resolution = clr.bin_size(); + std::uint32_t base_resolution, const std::vector& resolutions) { std::vector invalid_resolutions{}; for (const auto& res : resolutions) { if (res % base_resolution != 0 || res < base_resolution) { @@ -120,6 +149,16 @@ static std::vector detect_invalid_resolutions( return invalid_resolutions; } +[[nodiscard]] static std::uint32_t detect_base_resolution(std::string_view path, + std::string_view format) { + if (format == "cool") { + return cooler::File(path).bin_size(); + } + + assert(format == "hic"); + return hic::utils::list_resolutions(std::string{path}, true).front(); +} + void Cli::validate_zoomify_subcommand() const { assert(_cli.get_subcommand("zoomify")->parsed()); @@ -127,13 +166,26 @@ void Cli::validate_zoomify_subcommand() const { std::vector errors; const auto& c = std::get(_config); - const cooler::File clr(c.input_uri); - const auto output_path = c.output_path.empty() - ? std::filesystem::path(clr.path()).replace_extension(".mcool") - : std::filesystem::path(c.output_path); - if (!c.force && std::filesystem::exists(output_path)) { + if (!c.force && std::filesystem::exists(c.path_to_output)) { errors.emplace_back(fmt::format( - FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.output_path)); + FMT_STRING("Refusing to overwrite file {}. 
Pass --force to overwrite."), c.path_to_output)); + } + + const auto input_format = infer_input_format(c.path_to_input); + const auto output_format = infer_output_format(c.path_to_output); + if ((input_format == "hic" && output_format != "hic") || + (input_format != "hic" && output_format == "hic")) { + errors.emplace_back( + fmt::format(FMT_STRING("Zoomifying a .{} file to produce a .{} file is not supported."), + input_format, output_format)); + } + + const auto base_resolution = detect_base_resolution(c.path_to_input.string(), input_format); + + if (base_resolution == 0) { // Variable bin size + errors.clear(); + warnings.clear(); + errors.emplace_back("zoomifying files with variable bin size is currently not supported."); } if (const auto dupl = detect_duplicate_resolutions(c.resolutions); !dupl.empty()) { @@ -141,11 +193,12 @@ void Cli::validate_zoomify_subcommand() const { fmt::format(FMT_STRING("Found duplicate resolution(s):\n - {}"), fmt::join(dupl, "\n - "))); } - if (const auto invalid = detect_invalid_resolutions(clr, c.resolutions); !invalid.empty()) { + if (const auto invalid = detect_invalid_resolutions(base_resolution, c.resolutions); + !invalid.empty()) { errors.emplace_back( fmt::format(FMT_STRING("Found the following invalid resolution(s):\n - {}\n" "Resolutions should be a multiple of the base resolution ({})."), - fmt::join(invalid, "\n - "), clr.bin_size())); + fmt::join(invalid, "\n - "), base_resolution)); } const auto* sc = _cli.get_subcommand("zoomify"); @@ -157,12 +210,6 @@ void Cli::validate_zoomify_subcommand() const { "--resolutions."); } - if (clr.bin_size() == 0) { // Variable bin size - errors.clear(); - warnings.clear(); - errors.emplace_back("zoomifying files with variable bin size is not currently supported."); - } - for (const auto& w : warnings) { SPDLOG_WARN(FMT_STRING("{}"), w); } @@ -216,26 +263,33 @@ static std::vector generate_resolutions_nice( void Cli::transform_args_zoomify_subcommand() { auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("zoomify"); // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; - const cooler::File clr(c.input_uri); - if (c.output_path.empty()) { - c.output_path = std::filesystem::path(clr.path()).replace_extension(".mcool").string(); - } + c.input_format = infer_input_format(c.path_to_input); + c.output_format = infer_output_format(c.path_to_output); + + const auto base_resolution = detect_base_resolution(c.path_to_input.string(), c.input_format); if (c.resolutions.empty()) { - c.resolutions = c.nice_resolution_steps ? generate_resolutions_nice(clr.bin_size()) - : generate_resolutions_pow2(clr.bin_size()); + c.resolutions = c.nice_resolution_steps ? generate_resolutions_nice(base_resolution) + : generate_resolutions_pow2(base_resolution); } else { std::sort(c.resolutions.begin(), c.resolutions.end()); } - if (c.resolutions.front() != clr.bin_size()) { - c.resolutions.insert(c.resolutions.begin(), clr.bin_size()); + if (c.output_format == "cool" && c.resolutions.front() != base_resolution) { + c.resolutions.insert(c.resolutions.begin(), base_resolution); + } + + if (sc.get_option("--compression-lvl")->empty()) { + c.compression_lvl = c.output_format == "hic" ?
10 : 6; } + + c.tmp_dir /= c.path_to_output.filename().string() + ".tmp"; } } // namespace hictk::tools diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index a8dccb3c..98d86bd1 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -2,394 +2,134 @@ // // SPDX-License-Identifier: MIT -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include #include #include -#include #include #include #include #include #include -#include -#include "hictk/balancing/methods.hpp" -#include "hictk/balancing/weights.hpp" #include "hictk/cooler/cooler.hpp" -#include "hictk/pixel.hpp" -#include "hictk/tmpdir.hpp" -#include "hictk/tools/common.hpp" +#include "hictk/cooler/multires_cooler.hpp" +#include "hictk/hic/file_writer.hpp" #include "hictk/tools/config.hpp" -#include "hictk/tools/juicer_tools.hpp" namespace hictk::tools { -[[maybe_unused]] [[nodiscard]] static std::filesystem::path find_pigz() { - return boost::process::search_path("pigz").string(); -} - -static void dump_chrom_sizes(const cooler::File& clr, const std::filesystem::path& dest) { - SPDLOG_INFO(FMT_STRING("writing chromosomes to file {}..."), dest); - const std::unique_ptr f(std::fopen(dest.string().c_str(), "we")); - - if (!bool(f)) { - throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), dest); +static void copy_pixels(hic::internal::HiCFileWriter& w, const cooler::File& base_clr, + const ConvertConfig& c) { + if (c.input_format == "cool") { + w.add_pixels(base_clr.bin_size(), base_clr.begin(), base_clr.end()); + return; } - fmt::print(f.get(), FMT_STRING("{:tsv}\n"), fmt::join(clr.chromosomes(), "\n")); - SPDLOG_INFO(FMT_STRING("DONE! 
Wrote {} chromosomes to file {}"), clr.chromosomes().size(), dest); -} - -static std::size_t dump_pixels_plain(const cooler::File& clr, const std::filesystem::path& dest, - std::size_t update_frequency = 10'000'000) { - const std::unique_ptr f(std::fopen(dest.string().c_str(), "we")); - - if (!bool(f)) { - throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), dest); - } - - std::size_t i = 0; - auto t0 = std::chrono::steady_clock::now(); - for (std::uint32_t chrom1_id = 0; chrom1_id < clr.chromosomes().size(); ++chrom1_id) { - for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < clr.chromosomes().size(); ++chrom2_id) { - auto sel = - clr.fetch(clr.chromosomes().at(chrom1_id).name(), clr.chromosomes().at(chrom2_id).name()); - std::for_each( - sel.begin(), sel.end(), - [&](const ThinPixel& p) { - const auto bin1 = clr.bins().at(p.bin1_id); - const auto bin2 = clr.bins().at(p.bin2_id); - // https://github.com/aidenlab/juicer/wiki/Pre#short-with-score-format - // - fmt::print(f.get(), FMT_COMPILE("0\t{}\t{}\t0\t1\t{}\t{}\t1\t{}\n"), - bin1.chrom().name(), bin1.start(), bin2.chrom().name(), bin2.start(), - p.count); - if (!bool(f)) { // NOLINT - throw fmt::system_error( - errno, FMT_STRING("an error occurred while pixels to file {}"), dest); - } - - if (++i == update_frequency) { - const auto t1 = std::chrono::steady_clock::now(); - const auto delta = - static_cast( - std::chrono::duration_cast(t1 - t0).count()) / - 1000.0; - SPDLOG_INFO(FMT_STRING("processing {:ucsc} {:ucsc} at {:.0f} pixels/s..."), bin1, - bin2, double(update_frequency) / delta); - t0 = t1; - i = 0; - } - }); - } - } - - assert(clr.attributes().nnz); // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - return static_cast(*clr.attributes().nnz); -} - -template -[[nodiscard]] static std::unique_ptr run_pigz( - Pipe& pipe, const std::filesystem::path& dest, std::uint8_t compression_lvl, - std::size_t threads) { - assert(compression_lvl != 0); - assert(threads != 0); - // clang-format off - return std::make_unique( - find_pigz().string(), - fmt::format(FMT_STRING("-{}"), compression_lvl), - "--processes", fmt::to_string(threads), - boost::process::std_in < pipe, - boost::process::std_out > dest.string() - ); - // clang-format on -} - -static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesystem::path& dest, - std::uint8_t compression_lvl, std::size_t threads, - std::size_t update_frequency = 10'000'000) { - assert(compression_lvl != 0); - assert(threads > 1); - - boost::asio::io_context ioc; - boost::process::async_pipe pipe{ioc}; - const auto pigz = run_pigz(pipe, dest, compression_lvl, threads - 1); + assert(c.input_format == "mcool"); + const cooler::MultiResFile mclr(c.path_to_input.string()); - auto t0 = std::chrono::steady_clock::now(); - std::string buffer; - std::size_t i = 0; - for (std::uint32_t chrom1_id = 0; chrom1_id < clr.chromosomes().size(); ++chrom1_id) { - for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < clr.chromosomes().size(); ++chrom2_id) { - auto sel = - clr.fetch(clr.chromosomes().at(chrom1_id).name(), clr.chromosomes().at(chrom2_id).name()); - std::for_each( - sel.begin(), sel.end(), - [&](const ThinPixel& p) { - const auto bin1 = clr.bins().at(p.bin1_id); - const auto bin2 = clr.bins().at(p.bin2_id); - // https://github.com/aidenlab/juicer/wiki/Pre#short-with-score-format - // - buffer += - fmt::format(FMT_COMPILE("0\t{}\t{}\t0\t1\t{}\t{}\t1\t{}\n"), bin1.chrom().name(), - bin1.start(), bin2.chrom().name(), bin2.start(), p.count); - - if (buffer.size() > 65'000) 
{ - if (!pigz->running()) { - throw std::runtime_error(fmt::format( - FMT_STRING("pigz returned prematurely with code {} while writing pixels to {}"), - pigz->exit_code(), dest)); - } - boost::asio::write(pipe, boost::asio::buffer(buffer.data(), buffer.size())); - buffer.clear(); - } - - if (++i == update_frequency) { - const auto t1 = std::chrono::steady_clock::now(); - const auto delta = - static_cast( - std::chrono::duration_cast(t1 - t0).count()) / - 1000.0; - SPDLOG_INFO(FMT_STRING("processing {:ucsc} {:ucsc} at {:.0f} pixels/s..."), bin1, - bin2, double(update_frequency) / delta); - t0 = t1; - i = 0; - } - }); + for (const auto& res : c.resolutions) { + try { + const auto clr = mclr.open(res); + w.add_pixels(res, clr.begin(), clr.end()); + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + const auto pos = msg.find("does not have interactions for resolution"); + if (pos == std::string_view::npos) { + throw; + } } } - - if (!pigz->running()) { - throw std::runtime_error( - fmt::format(FMT_STRING("pigz returned prematurely with code {} while writing pixels to {}"), - pigz->exit_code(), dest)); - } - if (!buffer.empty()) { - boost::asio::write(pipe, boost::asio::buffer(buffer.data(), buffer.size())); - } - - pipe.close(); - ioc.run(); - pigz->wait(); - if (pigz->exit_code() != 0) { - throw std::runtime_error( - fmt::format(FMT_STRING("pigz failed with exit code {}"), pigz->exit_code())); - } - - assert(clr.attributes().nnz); // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - return static_cast(*clr.attributes().nnz); } -static void dump_pixels(const cooler::File& clr, const std::filesystem::path& dest, - std::uint8_t compression_lvl, std::size_t threads) { - const auto t0 = std::chrono::steady_clock::now(); - - SPDLOG_INFO(FMT_STRING("writing pixels to file {}..."), dest); - - std::size_t pixels_processed{}; - if (dest.extension() == ".gz") { - assert(compression_lvl != 0); - pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, threads); - } else { - pixels_processed = dump_pixels_plain(clr, dest); +static void copy_normalization_vector(hic::internal::HiCFileWriter& w, const cooler::File& clr, + const balancing::Method& norm, bool throw_if_missing) { + if (norm == balancing::Method::NONE()) { + return; } - const auto t1 = std::chrono::steady_clock::now(); - const auto delta = - static_cast(std::chrono::duration_cast(t1 - t0).count()) / - 1.0e6; - SPDLOG_INFO(FMT_STRING("wrote {} pixels across {} chromosomes to {} in {:.2f}s"), - pixels_processed, clr.chromosomes().size(), dest, delta); -} -[[nodiscard]] static std::shared_ptr try_read_weights( - const cooler::File& clr, const balancing::Method& method) { try { - return clr.read_weights(method); - } catch (const std::exception& e) { - return clr.read_weights(method, balancing::Weights::Type::DIVISIVE); - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -static bool dump_weights(std::uint32_t resolution, std::string_view cooler_uri, - const std::filesystem::path& weight_file, - std::vector normalizations, bool fail_if_norm_missing) { - if (normalizations.size() == 1 && normalizations.front() == balancing::Method::NONE()) { - return false; - } - - if (normalizations.empty()) { - normalizations = cooler::File(cooler_uri).avail_normalizations(); - if (normalizations.empty()) { - return false; - } - } - - SPDLOG_INFO(FMT_STRING("[{}] writing balancing weights to file {}..."), resolution, weight_file); - const cooler::File clr(cooler_uri); - assert(clr.bin_size() == resolution); + 
const auto& weights = *clr.read_weights(norm); + std::vector<float> weights_f(weights().size()); + std::transform(weights().begin(), weights().end(), weights_f.begin(), [&](const double n) { + if (weights.type() == balancing::Weights::Type::MULTIPLICATIVE) { + return static_cast<float>(1.0 / n); + } + return static_cast<float>(n); + }); - const std::unique_ptr f(std::fopen(weight_file.string().c_str(), "ae")); - if (!bool(f)) { - throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), weight_file); - } + const auto norm_name = norm.to_string() == "weight" ? "ICE" : norm.to_string(); + SPDLOG_INFO(FMT_STRING("[{}] adding {} normalization vector"), clr.bin_size(), norm_name); + w.add_norm_vector(norm_name, "BP", clr.bin_size(), weights_f, true); - for (const auto& norm : normalizations) { - if (!clr.has_normalization(norm) && !fail_if_norm_missing) { - SPDLOG_WARN(FMT_STRING("[{}] unable to read weights from \"{}\"..."), resolution, cooler_uri); - continue; + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + const auto match = msg.find(fmt::format(FMT_STRING("unable to read \"{}\" weights"), norm)); + if (match == std::string_view::npos) { + throw; } - - const auto weights = try_read_weights(clr, norm); - const auto weight_name = norm == "weight" ? "ICE" : norm.to_string(); - const auto weight_is_divisive = weights->type() == balancing::Weights::Type::INFER || - weights->type() == balancing::Weights::Type::UNKNOWN || - weights->type() == balancing::Weights::Type::DIVISIVE; - auto weight_vector = (*weights)(); - if (weight_is_divisive) { - std::transform(weight_vector.begin(), weight_vector.end(), weight_vector.begin(), - [](const double w) { return 1.0 / w; }); + if (throw_if_missing) { + throw std::runtime_error( + fmt::format(FMT_STRING("Unable to find {} normalization vector for resolution {}"), norm, + clr.bin_size())); } + SPDLOG_WARN(FMT_STRING("[{}] {} normalization vector is missing. SKIPPING!"), clr.bin_size(), + norm); + } +} - std::ptrdiff_t i0 = 0; - for (const auto& chrom : clr.chromosomes()) { - // TODO add GW/INTRA/INTER prefix as appropriate - fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), weight_name, chrom.name(), - resolution); - - const auto num_bins = (chrom.size() + resolution - 1) / resolution; - const auto i1 = i0 + static_cast(num_bins); - std::for_each(weight_vector.begin() + i0, weight_vector.begin() + i1, [&](double w) { - !std::isfinite(w) ?
fmt::print(f.get(), FMT_COMPILE(".\n")) - : fmt::print(f.get(), FMT_COMPILE("{}\n"), w); - if (!bool(f)) { // NOLINT - throw fmt::system_error( - errno, FMT_STRING("an error occurred while writing weights to file {}"), weight_file); - } - }); +static void copy_normalization_vectors(hic::internal::HiCFileWriter& w, + const cooler::File& base_clr, const ConvertConfig& c) { + const auto avail_normalizations = base_clr.avail_normalizations(); - i0 = i1; + if (c.input_format == "cool") { + for (const auto& norm : c.normalization_methods) { + copy_normalization_vector(w, base_clr, norm, c.fail_if_normalization_method_is_not_avaliable); } - SPDLOG_INFO(FMT_STRING("[{}] wrote \"{}\" weights to file {}..."), resolution, weight_name, - weight_file); + w.write_norm_vectors_and_norm_expected_values(); + return; } - return std::ftell(f.get()) != 0; -} + assert(c.input_format == "mcool"); + const cooler::MultiResFile mclr(c.path_to_input.string()); -static bool dump_weights(const ConvertConfig& c, const std::filesystem::path& weight_file) { - bool cooler_has_weights = false; for (const auto& res : c.resolutions) { - cooler_has_weights |= dump_weights( - res, fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res), - weight_file, c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable); + const auto clr = mclr.open(res); + for (const auto& norm : c.normalization_methods) { + copy_normalization_vector(w, clr, norm, c.fail_if_normalization_method_is_not_avaliable); + } } - - return cooler_has_weights; + w.write_norm_vectors_and_norm_expected_values(); } void cool_to_hic(const ConvertConfig& c) { - std::ignore = find_java(); - - const internal::TmpDir tmpdir{c.tmp_dir}; - - const auto chrom_sizes = tmpdir() / "reference.chrom.sizes"; - const auto pixels = [&]() { - if (c.gzip_compression_lvl == 0 || find_pigz().empty()) { - return tmpdir() / "pixels.tsv"; - } - return tmpdir() / "pixels.tsv.gz"; - }(); - const auto weights = tmpdir() / "weights.txt"; - if (c.force && std::filesystem::exists(c.path_to_output)) { [[maybe_unused]] std::error_code ec{}; std::filesystem::remove(c.path_to_output, ec); } - std::unique_ptr process{}; - - try { - { - const auto uri = c.input_format == "cool" - ? c.path_to_input.string() - : fmt::format(FMT_STRING("{}::/resolutions/{}"), - c.path_to_input.string(), c.resolutions.front()); + const auto base_uri = c.input_format == "cool" + ? c.path_to_input.string() + : fmt::format(FMT_STRING("{}::/resolutions/{}"), + c.path_to_input.string(), c.resolutions.front()); - const cooler::File clr(uri); - - if (clr.bin_size() == 0) { - throw std::runtime_error( - "converting cooler files with variable bin size is not supported."); - } - - dump_chrom_sizes(clr, chrom_sizes); - dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads); - } - - auto t1 = std::chrono::steady_clock::now(); - SPDLOG_INFO(FMT_STRING("running juicer_tools pre...")); - process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.threads); - process->wait(); - if (process->exit_code() != 0) { - throw std::runtime_error(fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), - process->exit_code())); - } - process = nullptr; - auto t2 = std::chrono::steady_clock::now(); - auto delta = static_cast( - std::chrono::duration_cast(t2 - t1).count()) / - 1.0e6; - SPDLOG_INFO(FMT_STRING("DONE! 
Running juicer_tools pre took {:.2f}s"), delta); + const cooler::File base_clr{base_uri}; + if (base_clr.bin_size() == 0) { + throw std::runtime_error("converting cooler files with variable bin size is not supported."); + } - std::filesystem::remove(chrom_sizes); - std::filesystem::remove(pixels); + const auto chromosomes = cooler::File(base_uri).chromosomes(); + const auto& resolutions = c.resolutions; - const auto weight_file_has_data = - c.input_format == "cool" - ? dump_weights(c.resolutions.front(), c.path_to_input.string(), weights, - c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable) - : dump_weights(c, weights); + const internal::TmpDir tmpdir{c.tmp_dir}; + hictk::hic::internal::HiCFileWriter w(c.path_to_output.string(), chromosomes, resolutions, + c.genome, c.threads, c.chunk_size, c.tmp_dir, + c.compression_lvl); + copy_pixels(w, base_clr, c); + w.serialize(); - if (weight_file_has_data) { - t1 = std::chrono::steady_clock::now(); - SPDLOG_INFO(FMT_STRING("running juicer_tools addNorm...")); - process = run_juicer_tools_add_norm(c.juicer_tools_jar, weights, c.path_to_output, - c.juicer_tools_xmx); - process->wait(); - if (process->exit_code() != 0) { - throw std::runtime_error(fmt::format( - FMT_STRING("juicer_tools pre failed with exit code {}"), process->exit_code())); - } - t2 = std::chrono::steady_clock::now(); - delta = static_cast( - std::chrono::duration_cast(t2 - t1).count()) / - 1.0e6; - SPDLOG_INFO(FMT_STRING("DONE! Running juicer_tools addNorm took {:.2f}s"), delta); - } - } catch (const std::exception&) { - if (process) { - process->terminate(); - } - throw; - } + copy_normalization_vectors(w, base_clr, c); } } // namespace hictk::tools diff --git a/src/hictk/convert/hic_to_cool.cpp b/src/hictk/convert/hic_to_cool.cpp index 91802c99..19139850 100644 --- a/src/hictk/convert/hic_to_cool.cpp +++ b/src/hictk/convert/hic_to_cool.cpp @@ -55,7 +55,7 @@ static bool missing_norm_or_interactions(const std::exception& e, balancing::Met return missing_interactions || missing_norm_vect; } -bool check_if_norm_exists(hic::File& f, balancing::Method norm) { +static bool check_if_norm_exists(hic::File& f, balancing::Method norm) { return std::any_of(f.chromosomes().begin(), f.chromosomes().end(), [&](const Chromosome& chrom) { try { if (!chrom.is_all()) { @@ -71,53 +71,6 @@ bool check_if_norm_exists(hic::File& f, balancing::Method norm) { }); } -static std::vector read_weights_or_throw(hic::File& f, balancing::Method norm, - const Chromosome& chrom, - std::size_t expected_length) { - std::vector weights_{}; - try { - auto weights = f.fetch(chrom.name(), norm).weights1(); - if (!!weights && weights().size() != expected_length) { - throw std::runtime_error( - fmt::format(FMT_STRING("{} normalization vector for {} appears to be corrupted: " - "expected {} values, found {}"), - norm, chrom.name(), expected_length, weights().size())); - } - weights_ = weights(); - } catch (const std::exception& e) { - if (!missing_norm_or_interactions(e, norm)) { - throw; - } - } - return weights_; -} - -static std::vector read_weights(hic::File& f, const BinTable& bins, - balancing::Method norm) { - std::vector weights{}; - weights.reserve(bins.size()); - std::size_t missing_norms = 0; - for (const auto& chrom : bins.chromosomes()) { - if (chrom.is_all()) { - continue; - } - const auto expected_length = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - auto chrom_weights = read_weights_or_throw(f, norm, chrom, expected_length); - if (chrom_weights.empty()) { - 
chrom_weights.resize(expected_length, std::numeric_limits::quiet_NaN()); - ++missing_norms; - } - weights.insert(weights.end(), chrom_weights.begin(), chrom_weights.end()); - } - if (missing_norms == f.chromosomes().size() - 1) { - SPDLOG_WARN(FMT_STRING("[{}] {} normalization vector is missing. SKIPPING!"), bins.bin_size(), - norm); - } - - assert(weights.size() == bins.size()); - return weights; -} - template static void copy_weights(hic::File& hf, CoolerFile& cf, balancing::Method norm, bool fail_if_missing) { @@ -142,30 +95,34 @@ static void copy_weights(hic::File& hf, CoolerFile& cf, balancing::Method norm, SPDLOG_INFO(FMT_STRING("[{}] processing {} normalization vector..."), hf.bins().bin_size(), norm); - const auto weights = read_weights(hf, hf.bins(), norm); + const auto weights = hf.normalization(norm); using T = std::remove_reference_t; if constexpr (std::is_same_v) { - cf.write_weights(dset_name, weights.begin(), weights.end(), false, true); + cf.write_weights(dset_name, weights().begin(), weights().end(), false, true); } else { - cooler::File::write_weights(cf, dset_name, weights.begin(), weights.end(), false, true); + cooler::File::write_weights(cf, dset_name, weights().begin(), weights().end(), false, true); } } [[nodiscard]] static cooler::File init_cooler(cooler::RootGroup entrypoint, std::uint32_t resolution, std::string_view genome, - const Reference& chroms) { + const Reference& chroms, + std::uint32_t compression_lvl) { auto attrs = cooler::Attributes::init(resolution); attrs.assembly = genome.empty() ? "unknown" : std::string{genome}; - return cooler::File::create(std::move(entrypoint), chroms, resolution, attrs); + return cooler::File::create(std::move(entrypoint), chroms, resolution, attrs, + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl); } [[nodiscard]] static cooler::File init_cooler(std::string_view uri, std::uint32_t resolution, - std::string_view genome, const Reference& chroms) { + std::string_view genome, const Reference& chroms, + std::uint32_t compression_lvl) { auto attrs = cooler::Attributes::init(resolution); attrs.assembly = genome.empty() ? "unknown" : std::string{genome}; - return cooler::File::create(uri, chroms, resolution, true, attrs); + return cooler::File::create(uri, chroms, resolution, true, attrs, + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl); } static Reference generate_reference(const std::filesystem::path& p, std::uint32_t res) { @@ -363,7 +320,9 @@ void hic_to_cool(const ConvertConfig& c) { if (c.resolutions.size() == 1) { convert_resolution_multi_threaded( - hf, init_cooler(c.path_to_output.string(), c.resolutions.front(), c.genome, chroms), + hf, + init_cooler(c.path_to_output.string(), c.resolutions.front(), c.genome, chroms, + c.compression_lvl), c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable); return; } @@ -375,8 +334,8 @@ void hic_to_cool(const ConvertConfig& c) { auto attrs = cooler::Attributes::init(res); attrs.assembly = c.genome.empty() ? 
"unknown" : std::string{c.genome}; convert_resolution_multi_threaded( - hf, init_cooler(mclr.init_resolution(res), res, c.genome, chroms), c.normalization_methods, - c.fail_if_normalization_method_is_not_avaliable); + hf, init_cooler(mclr.init_resolution(res), res, c.genome, chroms, c.compression_lvl), + c.normalization_methods, c.fail_if_normalization_method_is_not_avaliable); hf.clear_cache(); }); } diff --git a/src/hictk/fix_mcool/fix_mcool.cpp b/src/hictk/fix_mcool/fix_mcool.cpp index b3d9f3bf..5b0353d8 100644 --- a/src/hictk/fix_mcool/fix_mcool.cpp +++ b/src/hictk/fix_mcool/fix_mcool.cpp @@ -39,8 +39,10 @@ static void run_hictk_zoomify(const FixMcoolConfig& c, const std::vector& resolutions, std::string_view base_uri) { ZoomifyConfig zc{}; - zc.input_uri = std::string{base_uri}; - zc.output_path = c.path_to_output.string(); + zc.path_to_input = std::string{base_uri}; + zc.path_to_output = c.path_to_output.string(); + zc.input_format = "cool"; + zc.output_format = "cool"; zc.resolutions = resolutions; zc.copy_base_resolution = true; zc.force = c.force; @@ -74,12 +76,17 @@ static std::optional detect_balancing_params(std::string_view fil if (cis_only) { c.mode = "cis"; + c.name = "ICE"; } else if (trans_only) { c.mode = "trans"; + c.name = "INTER_ICE"; } else { c.mode = "gw"; + c.name = "GW_ICE"; } + c.symlink_to_weight = true; + c.masked_diags = dset.read_attribute("ignore_diags"); c.mad_max = dset.read_attribute("mad_max"); c.min_count = dset.read_attribute("min_count"); diff --git a/src/hictk/include/hictk/tools/common.hpp b/src/hictk/include/hictk/tools/common.hpp deleted file mode 100644 index e6d7b83e..00000000 --- a/src/hictk/include/hictk/tools/common.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#include - -namespace std { -template <> -struct default_delete { - void operator()(FILE* file) const { std::fclose(file); } // NOLINT -}; -} // namespace std diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index e0cd3851..2d189a83 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -19,7 +19,6 @@ namespace hictk::tools { struct BalanceConfig { std::filesystem::path path_to_input{}; std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; - std::filesystem::path juicer_tools_jar{}; std::string mode{"gw"}; std::size_t masked_diags{2}; @@ -29,13 +28,13 @@ struct BalanceConfig { double tolerance{1.0e-5}; std::size_t max_iters{500}; bool rescale_marginals{true}; - std::string name{"weight"}; + std::string name{}; bool in_memory{false}; + bool symlink_to_weight{true}; bool stdout_{false}; std::uint8_t zstd_compression_lvl{3}; std::size_t threads{1}; std::size_t chunk_size{10'000'000}; - std::size_t juicer_tools_xmx{256'000'000}; std::uint8_t verbosity{4}; bool force{false}; @@ -44,8 +43,7 @@ struct BalanceConfig { struct ConvertConfig { std::filesystem::path path_to_input{}; std::filesystem::path path_to_output{}; - std::filesystem::path tmp_dir{}; - std::filesystem::path juicer_tools_jar{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; std::string input_format{}; std::string output_format{}; @@ -55,10 +53,10 @@ struct ConvertConfig { std::vector normalization_methods{}; bool fail_if_normalization_method_is_not_avaliable{false}; - std::uint8_t gzip_compression_lvl{6}; + std::uint32_t compression_lvl{6}; std::size_t threads{2}; + std::size_t chunk_size{10'000'000}; - std::size_t 
juicer_tools_xmx{32'000'000'000}; std::uint8_t verbosity{4}; bool force{false}; }; @@ -105,10 +103,11 @@ struct FixMcoolConfig { }; struct LoadConfig { - std::string uri{}; + std::string output_path{}; std::filesystem::path path_to_chrom_sizes{}; std::filesystem::path path_to_bin_table{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; std::uint32_t bin_size{}; std::string format{}; @@ -120,16 +119,26 @@ struct LoadConfig { bool force{false}; bool validate_pixels{true}; + std::string output_format{}; + + std::size_t threads{1}; + std::uint32_t compression_lvl{9}; + std::uint8_t verbosity{4}; - std::size_t batch_size{20'000'000}; + std::size_t batch_size{10'000'000}; }; struct MergeConfig { - std::vector input_uris{}; - std::filesystem::path output_uri{}; - std::filesystem::path tmp_dir{}; + std::vector input_files{}; + std::filesystem::path output_file{}; + std::string output_format{}; + std::uint32_t resolution{}; - std::size_t chunk_size{5'000'000}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; + + std::size_t chunk_size{10'000'000}; + std::uint32_t compression_lvl{9}; + std::size_t threads{1}; bool force{false}; std::uint8_t verbosity{4}; @@ -151,13 +160,20 @@ struct ValidateConfig { }; struct ZoomifyConfig { - std::string input_uri{}; - std::string output_path{}; + std::filesystem::path path_to_input{}; + std::filesystem::path path_to_output{}; + std::string input_format{}; + std::string output_format{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; std::vector resolutions{}; bool copy_base_resolution{true}; bool nice_resolution_steps{true}; + std::uint32_t compression_lvl{6}; + std::uint32_t threads{1}; + std::size_t batch_size{10'000'000}; + bool force{false}; std::uint8_t verbosity{4}; }; diff --git a/src/hictk/include/hictk/tools/juicer_tools.hpp b/src/hictk/include/hictk/tools/juicer_tools.hpp deleted file mode 100644 index 5c61d2a8..00000000 --- a/src/hictk/include/hictk/tools/juicer_tools.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hictk/tools/config.hpp" - -namespace hictk::tools { - -[[nodiscard]] inline std::filesystem::path find_java() { - auto java = boost::process::search_path("java"); - if (java.empty()) { - throw std::runtime_error("unable to find java in your PATH"); - } - return java.string(); -} - -[[nodiscard]] inline std::vector generate_juicer_tools_pre_args( - const ConvertConfig& c, const std::filesystem::path& path_to_pixels, - const std::filesystem::path& path_to_chrom_sizes, std::size_t processes) { - assert(processes != 0); - return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), - "-jar", - c.juicer_tools_jar.string(), - "pre", - "-j", - fmt::to_string(processes), - "-t", - c.tmp_dir.string(), - "-n", - "-r", - fmt::format(FMT_STRING("{}"), fmt::join(c.resolutions, ",")), - path_to_pixels.string(), - c.path_to_output.string(), - path_to_chrom_sizes.string()}; -} - -[[nodiscard]] inline std::vector generate_juicer_tools_add_norm_args( - const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, - const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { - return {fmt::format(FMT_STRING("-Xmx{}M"), juicer_tools_xmx / 1'000'000), - "-jar", - juicer_tools_jar.string(), - "addNorm", - "-j", - "1", - 
path_to_output.string(), - path_to_weights.string()}; -} - -[[nodiscard]] inline std::unique_ptr run_juicer_tools_pre( - const ConvertConfig& c, const std::filesystem::path& chrom_sizes, - const std::filesystem::path& pixels, std::size_t processes) { - const auto cmd = generate_juicer_tools_pre_args(c, pixels, chrom_sizes, processes); - return std::make_unique(find_java().string(), cmd); -} - -[[nodiscard]] inline std::unique_ptr run_juicer_tools_add_norm( - const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, - const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { - const auto cmd = generate_juicer_tools_add_norm_args(juicer_tools_jar, path_to_weights, - path_to_output, juicer_tools_xmx); - return std::make_unique(find_java().string(), cmd); -} - -} // namespace hictk::tools diff --git a/src/hictk/load/common.hpp b/src/hictk/load/common.hpp index be973ef7..9a2b4985 100644 --- a/src/hictk/load/common.hpp +++ b/src/hictk/load/common.hpp @@ -5,13 +5,21 @@ #pragma once #include +#include #include +#include +#include #include "hictk/bin_table.hpp" #include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" namespace hictk::tools { +using IntBuff = std::vector>; +using FPBuff = std::vector>; +using PixelBuffer = std::variant; + enum class Format { COO, BG2, VP, _4DN }; [[nodiscard]] inline Format format_from_string(std::string_view s) { if (s == "coo") { @@ -55,4 +63,31 @@ template return !line.empty() && line.front() == '#'; } +struct Stats { + std::variant sum{0.0}; + std::uint64_t nnz{}; + + inline Stats& operator+=(const Stats& other) { + std::visit( + [&](auto& sum_) { + using T = remove_cvref_t; + + sum_ += std::get(other.sum); + }, + sum); + nnz += other.nnz; + + return *this; + } + + template + inline Stats(N sum_, std::uint64_t nnz_) : nnz(nnz_) { + if constexpr (std::is_floating_point_v) { + sum = static_cast(sum_); + } else { + sum = static_cast(sum_); + } + } +}; + } // namespace hictk::tools diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index f75f55a1..e2720d97 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -14,12 +14,14 @@ #include #include "./common.hpp" -#include "./load_pairs.hpp" -#include "./load_pixels.hpp" +#include "./load_cooler.hpp" +#include "./load_hic.hpp" #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/singlecell_cooler.hpp" +#include "hictk/hic/file_writer.hpp" #include "hictk/pixel.hpp" #include "hictk/reference.hpp" +#include "hictk/tmpdir.hpp" #include "hictk/tools/config.hpp" #include "hictk/tools/tools.hpp" @@ -67,135 +69,88 @@ namespace hictk::tools { return {chroms, start_pos, end_pos}; } -static void ingest_pixels_sorted(const LoadConfig& c) { - assert(c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); +static Stats ingest_pixels_hic(const LoadConfig& c) { + const auto format = format_from_string(c.format); + const auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + + [[maybe_unused]] const internal::TmpDir tmpdir{c.tmp_dir}; + return ingest_pixels_hic(c.output_path, c.tmp_dir, chroms, c.bin_size, c.assembly, c.offset, + format, c.threads, c.batch_size, c.compression_lvl, c.force); +} + +static Stats ingest_pixels_cooler(const LoadConfig& c) { + assert(c.output_format == "cool"); const auto format = format_from_string(c.format); + auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); - c.count_as_float ? 
ingest_pixels_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.offset, c.batch_size, c.validate_pixels) - : ingest_pixels_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.offset, c.batch_size, c.validate_pixels); + const internal::TmpDir tmpdir{c.tmp_dir}; + const auto tmp_cooler_path = + (c.tmp_dir / (std::filesystem::path{c.output_path}.filename().string() + ".tmp")).string(); + + return c.assume_sorted + ? ingest_pixels_sorted_cooler(c.output_path, chroms, c.bin_size, c.offset, format, + c.batch_size, c.compression_lvl, c.force, + c.count_as_float, c.validate_pixels) + : ingest_pixels_unsorted_cooler(c.output_path, tmp_cooler_path, chroms, c.bin_size, + c.offset, format, c.batch_size, c.compression_lvl, + c.force, c.count_as_float, c.validate_pixels); } -static void ingest_pixels_unsorted(const LoadConfig& c) { - assert(!c.assume_sorted); +static Stats ingest_pairs_cooler(const LoadConfig& c) { auto bins = c.path_to_bin_table.empty() ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); - const auto tmp_cooler_path = c.uri + ".tmp"; + const internal::TmpDir tmpdir{c.tmp_dir}; + const auto tmp_cooler_path = + (tmpdir() / (std::filesystem::path{c.output_path}.filename().string() + ".tmp")).string(); - using IntBuff = std::vector>; - using FPBuff = std::vector>; - std::variant write_buffer{}; - if (c.count_as_float) { - write_buffer = FPBuff(c.batch_size); - } else { - write_buffer = IntBuff(c.batch_size); - } - - std::visit( - [&](auto& buffer) { - using N = decltype(buffer.front().count); - { - auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); - for (std::size_t i = 0; true; ++i) { - SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, - tmp_cooler_path); - const auto nnz = ingest_pixels_unsorted(tmp_clr.create_cell(fmt::to_string(i)), - buffer, format, c.offset, c.validate_pixels); - SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, - tmp_cooler_path); - if (nnz == 0) { - break; - } - } - } - const cooler::SingleCellFile tmp_clr(tmp_cooler_path); - SPDLOG_INFO(FMT_STRING("merging {} chunks into \"{}\"..."), tmp_clr.cells().size(), c.uri); - tmp_clr.aggregate(c.uri, c.force); - }, - write_buffer); - std::filesystem::remove(tmp_cooler_path); + return ingest_pairs_cooler(c.output_path, tmp_cooler_path, bins, c.offset, format, c.batch_size, + c.compression_lvl, c.force, c.count_as_float, c.validate_pixels); } -static void ingest_pairs(const LoadConfig& c) { - auto bins = c.path_to_bin_table.empty() - ? 
init_bin_table(c.path_to_chrom_sizes, c.bin_size) - : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); +static Stats ingest_pairs_hic(const LoadConfig& c) { + const auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); const auto format = format_from_string(c.format); - const auto tmp_cooler_path = c.uri + ".tmp"; + [[maybe_unused]] const internal::TmpDir tmpdir{c.tmp_dir}; + return ingest_pairs_hic(c.output_path, c.tmp_dir, chroms, c.bin_size, c.assembly, c.offset, + format, c.threads, c.batch_size, c.compression_lvl, c.force); +} - using IntBuff = std::vector>; - using FPBuff = std::vector>; - std::variant write_buffer{}; - if (c.count_as_float) { - write_buffer = FPBuff{}; - } else { - write_buffer = IntBuff{}; +static Stats ingest_pixels(const LoadConfig& c) { + if (c.output_format == "hic") { + return ingest_pixels_hic(c); } - std::visit( - [&](auto& buffer) { - using N = decltype(buffer.begin()->count); - { - auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); - - for (std::size_t i = 0; true; ++i) { - SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, - tmp_cooler_path); - const auto nnz = ingest_pairs(tmp_clr.create_cell(fmt::to_string(i)), buffer, - c.batch_size, format, c.offset, c.validate_pixels); - - SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, - tmp_cooler_path); - if (nnz == 0) { - break; - } - } - } - - const cooler::SingleCellFile tmp_clr(tmp_cooler_path); - SPDLOG_INFO(FMT_STRING("merging {} chunks into \"{}\"..."), tmp_clr.cells().size(), c.uri); - tmp_clr.aggregate(c.uri, c.force); - }, - write_buffer); + return ingest_pixels_cooler(c); +} + +static Stats ingest_pairs(const LoadConfig& c) { + if (c.output_format == "hic") { + return ingest_pairs_hic(c); + } - std::filesystem::remove(tmp_cooler_path); + return ingest_pairs_cooler(c); } int load_subcmd(const LoadConfig& c) { const auto format = format_from_string(c.format); const auto pixel_has_count = format == Format::COO || format == Format::BG2; - const auto t0 = std::chrono::system_clock::now(); - - if (c.assume_sorted && pixel_has_count) { - SPDLOG_INFO(FMT_STRING("begin loading presorted pixels...")); - ingest_pixels_sorted(c); - } else if (!c.assume_sorted && pixel_has_count) { - SPDLOG_INFO(FMT_STRING("begin loading un-sorted pixels...")); - ingest_pixels_unsorted(c); - } else if (!pixel_has_count) { - SPDLOG_INFO(FMT_STRING("begin loading pairs...")); - ingest_pairs(c); - } - const cooler::File clr(c.uri); + const auto t0 = std::chrono::system_clock::now(); + const auto stats = pixel_has_count ? 
ingest_pixels(c) : ingest_pairs(c); const auto t1 = std::chrono::system_clock::now(); const auto delta = std::chrono::duration_cast(t1 - t0).count(); std::visit( [&](const auto& sum) { - SPDLOG_INFO(FMT_STRING("ingested {} interactions ({} nnz) in {}s!"), sum, clr.nnz(), + SPDLOG_INFO(FMT_STRING("ingested {} interactions ({} nnz) in {}s!"), sum, stats.nnz, static_cast(delta) / 1.0e9); - }, // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - *clr.attributes().sum); + }, + stats.sum); return 0; } diff --git a/src/hictk/load/load_cooler.hpp b/src/hictk/load/load_cooler.hpp new file mode 100644 index 00000000..ce7fa26e --- /dev/null +++ b/src/hictk/load/load_cooler.hpp @@ -0,0 +1,143 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "./common.hpp" +#include "./load_pairs.hpp" +#include "./load_pixels.hpp" +#include "hictk/bin_table.hpp" +#include "hictk/cooler/cooler.hpp" +#include "hictk/cooler/singlecell_cooler.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::tools { + +inline Stats ingest_pixels_unsorted_cooler(std::string_view uri, std::string_view tmp_cooler_path, + const Reference& chromosomes, std::uint32_t bin_size, + std::int64_t offset, Format format, + std::size_t batch_size, std::uint32_t compression_lvl, + bool force, bool count_as_float, bool validate_pixels) { + SPDLOG_INFO(FMT_STRING("begin loading unsorted pixels into a .cool file...")); + const BinTable bins(chromosomes, bin_size); + PixelBuffer write_buffer{}; + if (count_as_float) { + write_buffer = FPBuff(batch_size); + } else { + write_buffer = IntBuff(batch_size); + } + + const auto stats = std::visit( + [&](auto& buffer) { + using N = decltype(buffer.front().count); + Stats local_stats{N{}, 0}; + { + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, force); + for (std::size_t i = 0; true; ++i) { + SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, + tmp_cooler_path); + const auto partial_stats = ingest_pixels_unsorted( + tmp_clr.create_cell(fmt::to_string(i), cooler::Attributes::init(bins.bin_size()), + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl), + buffer, format, offset, validate_pixels); + local_stats += partial_stats; + SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, + tmp_cooler_path); + if (partial_stats.nnz == 0) { + break; + } + } + } + const cooler::SingleCellFile tmp_clr(tmp_cooler_path); + SPDLOG_INFO(FMT_STRING("merging {} chunks into \"{}\"..."), tmp_clr.cells().size(), uri); + tmp_clr.aggregate(uri, force, compression_lvl); + + return local_stats; + }, + write_buffer); + std::filesystem::remove(tmp_cooler_path); + + return stats; +} + +inline Stats ingest_pixels_sorted_cooler(std::string_view uri, const Reference& chromosomes, + std::uint32_t bin_size, std::int64_t offset, Format format, + std::size_t batch_size, std::uint32_t compression_lvl, + bool force, bool count_as_float, bool validate_pixels) { + SPDLOG_INFO(FMT_STRING("begin loading pre-sorted pixels into a .cool file...")); + if (count_as_float) { + return ingest_pixels_sorted( + cooler::File::create(uri, chromosomes, bin_size, force, + cooler::Attributes::init(bin_size), + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl), + format, offset, batch_size, validate_pixels); + } + return ingest_pixels_sorted( + cooler::File::create(uri, chromosomes, bin_size, force, + cooler::Attributes::init(bin_size), + 
cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl), + format, offset, batch_size, validate_pixels); +} + +inline Stats ingest_pairs_cooler(std::string_view uri, std::string_view tmp_cooler_path, + const BinTable& bins, std::int64_t offset, Format format, + std::size_t batch_size, std::uint32_t compression_lvl, bool force, + bool count_as_float, bool validate_pixels) { + PixelBuffer write_buffer{}; + if (count_as_float) { + write_buffer = FPBuff{}; + } else { + write_buffer = IntBuff{}; + } + + std::visit( + [&](auto& buffer) { + using N = decltype(buffer.begin()->count); + { + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, force); + + for (std::size_t i = 0; true; ++i) { + SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, + tmp_cooler_path); + const auto partial_stats = ingest_pairs( + tmp_clr.create_cell(fmt::to_string(i), cooler::Attributes::init(bins.bin_size()), + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl), + buffer, batch_size, format, offset, validate_pixels); + + SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, + tmp_cooler_path); + if (partial_stats.nnz == 0) { + break; + } + } + } + + const cooler::SingleCellFile tmp_clr(tmp_cooler_path); + SPDLOG_INFO(FMT_STRING("merging {} chunks into \"{}\"..."), tmp_clr.cells().size(), uri); + tmp_clr.aggregate(uri, force); + }, + write_buffer); + + std::filesystem::remove(tmp_cooler_path); + + const cooler::File clr(uri); + const auto nnz = clr.nnz(); + const auto sum = clr.attributes().sum.value(); + + if (clr.has_float_pixels()) { + return {std::get(sum), nnz}; + } + return {std::get(sum), nnz}; +} + +} // namespace hictk::tools diff --git a/src/hictk/load/load_hic.hpp b/src/hictk/load/load_hic.hpp new file mode 100644 index 00000000..bc64a6f2 --- /dev/null +++ b/src/hictk/load/load_hic.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include "./common.hpp" +#include "./load_pairs.hpp" +#include "./load_pixels.hpp" +#include "hictk/hic/file_writer.hpp" + +namespace hictk::tools { + +static Stats ingest_pixels_hic(std::string_view uri, const std::filesystem::path& tmp_dir, + const Reference& chromosomes, std::uint32_t bin_size, + const std::string& assembly, std::int64_t offset, Format format, + std::size_t threads, std::size_t batch_size, + std::uint32_t compression_lvl, bool force) { + SPDLOG_INFO(FMT_STRING("begin loading pixels into a .hic file...")); + + if (force) { + std::filesystem::remove(uri); + } + + hic::internal::HiCFileWriter hf(uri, chromosomes, {bin_size}, assembly, threads, batch_size, + tmp_dir, compression_lvl); + + std::vector> write_buffer(batch_size); + return ingest_pixels(std::move(hf), write_buffer, format, offset); +} + +inline Stats ingest_pairs_hic(std::string_view uri, const std::filesystem::path& tmp_dir, + const Reference& chromosomes, std::uint32_t bin_size, + const std::string& assembly, std::int64_t offset, Format format, + std::size_t threads, std::size_t batch_size, + std::uint32_t compression_lvl, bool force) { + if (force) { + std::filesystem::remove(uri); + } + + hic::internal::HiCFileWriter hf(uri, chromosomes, {bin_size}, assembly, threads, batch_size, + tmp_dir, compression_lvl); + + std::vector> buffer(batch_size); + return ingest_pairs(std::move(hf), buffer, format, offset); +} + +} // namespace hictk::tools diff --git a/src/hictk/load/load_pairs.hpp b/src/hictk/load/load_pairs.hpp index 22611d65..c2329fc1 100644 --- 
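// Sketch: the new load_hic.hpp above funnels both pixel and pairs ingestion through
// hic::internal::HiCFileWriter. The snippet below illustrates that flow in isolation. The file
// names, resolution, assembly and example interactions are hypothetical; the constructor
// arguments and the add_pixels()/serialize() calls mirror the signatures used by
// ingest_pixels_hic()/ingest_pairs_hic() above, and float counts are assumed for the pixel type.
#include <cstdint>
#include <filesystem>
#include <vector>

#include "hictk/hic/file_writer.hpp"
#include "hictk/pixel.hpp"
#include "hictk/reference.hpp"

static void write_toy_hic_file(const std::filesystem::path& chrom_sizes,
                               const std::filesystem::path& tmp_dir) {
  const auto chroms = hictk::Reference::from_chrom_sizes(chrom_sizes);
  const std::uint32_t resolution = 10'000;

  hictk::hic::internal::HiCFileWriter hf("toy.hic", chroms, {resolution}, "hg38",
                                         /*threads=*/2, /*batch_size=*/1'000'000, tmp_dir,
                                         /*compression_lvl=*/9);

  // Two made-up interactions addressed by bin IDs at the target resolution.
  const std::vector<hictk::ThinPixel<float>> pixels{{0, 0, 10.0F}, {0, 5, 3.0F}};
  hf.add_pixels(resolution, pixels.begin(), pixels.end());

  // Interactions are buffered and partitioned until serialize() writes body, index and footer.
  hf.serialize();
}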
a/src/hictk/load/load_pairs.hpp +++ b/src/hictk/load/load_pairs.hpp @@ -21,6 +21,7 @@ #include "./common.hpp" #include "hictk/bin_table.hpp" #include "hictk/cooler/cooler.hpp" +#include "hictk/hic/file_writer.hpp" #include "hictk/pixel.hpp" namespace hictk::tools { @@ -120,23 +121,71 @@ class PairsAggregator { }; template -[[nodiscard]] inline std::uint64_t ingest_pairs(cooler::File&& clr, - std::vector>& buffer, - std::size_t batch_size, Format format, - std::int64_t offset, bool validate_pixels) { +[[nodiscard]] inline Stats ingest_pairs( + cooler::File&& clr, // NOLINT(*-rvalue-reference-param-not-moved) + std::vector>& buffer, std::size_t batch_size, Format format, std::int64_t offset, + bool validate_pixels) { buffer.reserve(batch_size); PairsAggregator{clr.bins(), format, offset}.read_next_chunk(buffer); if (buffer.empty()) { assert(std::cin.eof()); - return {}; + return {N{}, 0}; } clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); buffer.clear(); clr.flush(); - return clr.nnz(); + const auto nnz = clr.nnz(); + const auto sum = clr.attributes().sum.value(); + + if (clr.has_float_pixels()) { + return {std::get(sum), nnz}; + } + return {std::get(sum), nnz}; +} + +[[nodiscard]] inline Stats ingest_pairs( + hic::internal::HiCFileWriter&& hf, // NOLINT(*-rvalue-reference-param-not-moved) + std::vector>& buffer, Format format, std::int64_t offset) { + const auto resolution = hf.resolutions().front(); + assert(buffer.capacity() != 0); + buffer.reserve(buffer.capacity()); + std::size_t i = 0; + + try { + auto t0 = std::chrono::steady_clock::now(); + for (; !std::cin.eof(); ++i) { + PairsAggregator{hf.bins(resolution), format, offset}.read_next_chunk(buffer); + + if (buffer.empty()) { + assert(std::cin.eof()); + break; + } + const auto t1 = std::chrono::steady_clock::now(); + const auto delta = + static_cast( + std::chrono::duration_cast(t1 - t0).count()) / + 1000.0; + t0 = t1; + + SPDLOG_INFO(FMT_STRING("preprocessing chunk #{} at {:.0f} pixels/s..."), i + 1, + double(buffer.size()) / delta); + hf.add_pixels(resolution, buffer.begin(), buffer.end()); + buffer.clear(); + } + buffer.shrink_to_fit(); + + hf.serialize(); + const auto stats = hf.stats(resolution); + return {stats.sum, stats.nnz}; + } catch (const std::exception& e) { + const auto i0 = i * buffer.capacity(); + const auto i1 = i0 + buffer.size(); + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while processing chunk {}-{}: {}"), i0, i1, e.what())); + } } } // namespace hictk::tools diff --git a/src/hictk/load/load_pixels.hpp b/src/hictk/load/load_pixels.hpp index 69a4439e..12b2d05a 100644 --- a/src/hictk/load/load_pixels.hpp +++ b/src/hictk/load/load_pixels.hpp @@ -19,23 +19,31 @@ #include "./common.hpp" #include "hictk/bin_table.hpp" #include "hictk/cooler/cooler.hpp" +#include "hictk/hic/file_writer.hpp" #include "hictk/pixel.hpp" namespace hictk::tools { template -inline void read_batch(const BinTable& bins, std::vector>& buffer, Format format, - std::int64_t offset) { +inline Stats read_batch(const BinTable& bins, std::vector>& buffer, Format format, + std::int64_t offset) { buffer.clear(); + Stats stats{N{}, 0}; std::string line{}; try { while (std::getline(std::cin, line)) { if (line_is_header(line)) { continue; } - buffer.emplace_back(parse_pixel(bins, line, format, offset)); + const auto& p = buffer.emplace_back(parse_pixel(bins, line, format, offset)); + stats.nnz++; + if constexpr (std::is_floating_point_v) { + std::get(stats.sum) += conditional_static_cast(p.count); + } else { + 
std::get(stats.sum) += conditional_static_cast(p.count); + } if (buffer.size() == buffer.capacity()) { - return; + return stats; } } } catch (const std::exception& e) { @@ -45,44 +53,47 @@ inline void read_batch(const BinTable& bins, std::vector>& buffer, "Cause: {}"), line, e.what())); } + + return stats; } template -inline void ingest_pixels_sorted(cooler::File&& clr, // NOLINT(*-rvalue-reference-param-not-moved) - Format format, std::int64_t offset, std::size_t batch_size, - bool validate_pixels) { +[[nodiscard]] inline Stats ingest_pixels_sorted( + cooler::File&& clr, // NOLINT(*-rvalue-reference-param-not-moved) + Format format, std::int64_t offset, std::size_t batch_size, bool validate_pixels) { std::vector> buffer(batch_size); std::size_t i = 0; + Stats stats{N{}, 0}; try { for (; !std::cin.eof(); ++i) { SPDLOG_INFO(FMT_STRING("processing chunk #{}..."), i + 1); - read_batch(clr.bins(), buffer, format, offset); + stats += read_batch(clr.bins(), buffer, format, offset); clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); buffer.clear(); } - if (!buffer.empty()) { - clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); - } + assert(buffer.empty()); } catch (const std::exception& e) { const auto i0 = i * buffer.capacity(); const auto i1 = i0 + buffer.size(); throw std::runtime_error(fmt::format( FMT_STRING("an error occurred while processing chunk {}-{}: {}"), i0, i1, e.what())); } + + return stats; } template -[[nodiscard]] inline std::size_t ingest_pixels_unsorted( +[[nodiscard]] inline Stats ingest_pixels_unsorted( cooler::File&& clr, // NOLINT(*-rvalue-reference-param-not-moved) std::vector>& buffer, Format format, std::int64_t offset, bool validate_pixels) { assert(buffer.capacity() != 0); - read_batch(clr.bins(), buffer, format, offset); + auto stats = read_batch(clr.bins(), buffer, format, offset); if (buffer.empty()) { assert(std::cin.eof()); - return {}; + return {N{}, 0}; } std::sort(buffer.begin(), buffer.end()); @@ -90,6 +101,47 @@ template buffer.clear(); clr.flush(); - return clr.nnz(); + return stats; } + +[[nodiscard]] inline Stats ingest_pixels( + hic::internal::HiCFileWriter&& hf, // NOLINT(*-rvalue-reference-param-not-moved) + std::vector>& buffer, Format format, std::int64_t offset) { + assert(buffer.capacity() != 0); + + std::size_t i = 0; + Stats stats{0.0, 0}; + try { + auto t0 = std::chrono::steady_clock::now(); + const auto& bins = hf.bins(hf.resolutions().front()); + for (; !std::cin.eof(); ++i) { + stats += read_batch(bins, buffer, format, offset); + + if (buffer.empty()) { + assert(std::cin.eof()); + break; + } + + const auto t1 = std::chrono::steady_clock::now(); + const auto delta = + static_cast( + std::chrono::duration_cast(t1 - t0).count()) / + 1000.0; + t0 = t1; + SPDLOG_INFO(FMT_STRING("preprocessing chunk #{} at {:.0f} pixels/s..."), i + 1, + double(buffer.size()) / delta); + hf.add_pixels(bins.bin_size(), buffer.begin(), buffer.end()); + buffer.clear(); + } + hf.serialize(); + assert(buffer.empty()); + return stats; + } catch (const std::exception& e) { + const auto i0 = i * buffer.capacity(); + const auto i1 = i0 + buffer.size(); + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while processing chunk {}-{}: {}"), i0, i1, e.what())); + } +} + } // namespace hictk::tools diff --git a/src/hictk/merge/merge.cpp b/src/hictk/merge/merge.cpp index 116f06fa..ccff02e7 100644 --- a/src/hictk/merge/merge.cpp +++ b/src/hictk/merge/merge.cpp @@ -10,22 +10,42 @@ #include #include "hictk/cooler/utils.hpp" +#include 
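// The Stats value threaded through read_batch() and the ingest_* helpers above is defined in
// src/hictk/load/common.hpp, which is not part of this changeset. Judging from its usage
// (brace-init from {N{}, 0}, an .nnz counter, a .sum variant visited in load_subcmd(), and
// operator+=), a compatible shape could look roughly like this sketch; the real definition may
// well differ.
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <variant>

struct StatsSketch {
  std::variant<std::int64_t, double> sum{std::int64_t(0)};  // total number of interactions
  std::size_t nnz{};                                        // number of non-zero pixels written

  StatsSketch& operator+=(const StatsSketch& other) {
    // Assumes both operands hold the same alternative (all-integer or all-float counts).
    std::visit([&](auto& s) { s += std::get<std::decay_t<decltype(s)>>(other.sum); }, sum);
    nnz += other.nnz;
    return *this;
  }
};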
"hictk/hic/utils.hpp" +#include "hictk/tmpdir.hpp" #include "hictk/tools/config.hpp" namespace hictk::tools { +static void merge_coolers(const MergeConfig& c) { + SPDLOG_INFO(FMT_STRING("begin merging {} coolers..."), c.input_files.size()); + cooler::utils::merge(c.input_files.begin(), c.input_files.end(), + c.output_file.string(), c.force, c.chunk_size, 10'000'000, + c.compression_lvl); +} + +static void merge_hics(const MergeConfig& c) { + SPDLOG_INFO(FMT_STRING("begin merging {} .hic files..."), c.input_files.size()); + const internal::TmpDir tmpdir{c.tmp_dir}; + hic::utils::merge(c.input_files.begin(), c.input_files.end(), c.output_file.string(), + c.resolution, tmpdir(), c.force, c.chunk_size, c.threads, c.compression_lvl); +} + int merge_subcmd(const MergeConfig& c) { - SPDLOG_INFO(FMT_STRING("begin merging {} coolers..."), c.input_uris.size()); const auto t0 = std::chrono::system_clock::now(); - cooler::utils::merge(c.input_uris.begin(), c.input_uris.end(), - c.output_uri.string(), c.force, c.chunk_size, 10'000'000); + if (c.output_format == "cool") { + merge_coolers(c); + } else { + merge_hics(c); + } + const auto t1 = std::chrono::system_clock::now(); const auto delta = static_cast(std::chrono::duration_cast(t1 - t0).count()) / 1000.0; - SPDLOG_INFO(FMT_STRING("DONE! Merging {} coolers took {:.2f}s!"), c.input_uris.size(), delta); - SPDLOG_INFO(FMT_STRING("{} size: {:.2f} MB"), c.output_uri, - static_cast(std::filesystem::file_size(c.output_uri)) / 1.0e6); + + SPDLOG_INFO(FMT_STRING("DONE! Merging {} files took {:.2f}s!"), c.input_files.size(), delta); + SPDLOG_INFO(FMT_STRING("{} size: {:.2f} MB"), c.output_file, + static_cast(std::filesystem::file_size(c.output_file)) / 1.0e6); return 0; } diff --git a/src/hictk/validate/validate.cpp b/src/hictk/validate/validate.cpp index 0ef82732..e18ccb4a 100644 --- a/src/hictk/validate/validate.cpp +++ b/src/hictk/validate/validate.cpp @@ -42,7 +42,7 @@ static void validate_hic(const hic::File& hf, const Chromosome& chrom1, const Ch fmt::format(FMT_STRING("### FAILURE: \"{}\" is not a valid .hic file:\n" "Validation failed for {}:{} map at {} resolution:\n" "{}"), - hf.url(), chrom1.name(), chrom2.name(), hf.resolution(), e.what())); + hf.path(), chrom1.name(), chrom2.name(), hf.resolution(), e.what())); } } diff --git a/src/hictk/zoomify/zoomify.cpp b/src/hictk/zoomify/zoomify.cpp index 5c58e521..c6fb7fa1 100644 --- a/src/hictk/zoomify/zoomify.cpp +++ b/src/hictk/zoomify/zoomify.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: MIT #include -#include #include #include @@ -19,26 +18,26 @@ #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/group.hpp" #include "hictk/cooler/multires_cooler.hpp" -#include "hictk/cooler/utils.hpp" +#include "hictk/hic/file_zoomify.hpp" #include "hictk/pixel.hpp" #include "hictk/tools/config.hpp" -#include "hictk/transformers/coarsen.hpp" namespace hictk::tools { -void zoomify_once(const cooler::File& clr1, cooler::RootGroup entrypoint2, - std::uint32_t resolution) { +void zoomify_once_cooler(const cooler::File& clr1, cooler::RootGroup entrypoint2, + std::uint32_t resolution, std::uint32_t compression_lvl) { auto attrs = cooler::Attributes::init(clr1.bin_size()); attrs.assembly = clr1.attributes().assembly; - auto clr2 = cooler::File::create(std::move(entrypoint2), clr1.chromosomes(), resolution, attrs); + auto clr2 = cooler::File::create(std::move(entrypoint2), clr1.chromosomes(), resolution, attrs, + cooler::DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl); std::vector> buffer{500'000}; 
cooler::MultiResFile::coarsen(clr1, clr2, buffer); } -void zoomify_once(std::string_view uri1, std::string_view uri2, std::uint32_t resolution, - bool force) { +void zoomify_once_cooler(std::string_view uri1, std::string_view uri2, std::uint32_t resolution, + bool force, std::uint32_t compression_lvl) { const cooler::File clr1(uri1); SPDLOG_INFO(FMT_STRING("coarsening cooler at {} once ({} -> {})"), clr1.uri(), clr1.bin_size(), @@ -47,12 +46,12 @@ void zoomify_once(std::string_view uri1, std::string_view uri2, std::uint32_t re auto mode = force ? HighFive::File::Overwrite : HighFive::File::Create; cooler::RootGroup entrypoint2{HighFive::File(std::string{uri2}, mode).getGroup("/")}; - return zoomify_once(clr1, std::move(entrypoint2), resolution); -} + return zoomify_once_cooler(clr1, std::move(entrypoint2), resolution, compression_lvl); +} // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) -void zoomify_many(std::string_view in_uri, std::string_view out_path, - const std::vector& resolutions, bool copy_base_resolution, - bool force) { +void zoomify_many_cooler(std::string_view in_uri, std::string_view out_path, + const std::vector& resolutions, bool copy_base_resolution, + bool force, std::uint32_t compression_lvl) { const cooler::File clr(in_uri); auto mclr = cooler::MultiResFile::create(out_path, cooler::File(in_uri).chromosomes(), force); @@ -64,7 +63,8 @@ void zoomify_many(std::string_view in_uri, std::string_view out_path, mclr.copy_resolution(clr); } else { assert(resolutions.size() > 1); - zoomify_once(cooler::File(in_uri), mclr.init_resolution(resolutions[1]), resolutions[1]); + zoomify_once_cooler(cooler::File(in_uri), mclr.init_resolution(resolutions[1]), resolutions[1], + compression_lvl); } for (std::size_t i = 1; i < resolutions.size(); ++i) { @@ -72,14 +72,64 @@ void zoomify_many(std::string_view in_uri, std::string_view out_path, } } -int zoomify_subcmd(const ZoomifyConfig& c) { - const auto t0 = std::chrono::system_clock::now(); - const auto output_is_multires = c.copy_base_resolution || c.resolutions.size() > 2; +void print_zooming_plan_hic(std::string_view path_to_input, + const std::vector& resolutions) { + const auto avail_resolutions = hic::utils::list_resolutions(path_to_input); + for (const auto& res : resolutions) { + const auto match = std::find(avail_resolutions.begin(), avail_resolutions.end(), res); + if (match != avail_resolutions.end()) { + SPDLOG_INFO(FMT_STRING("copying resolution {} from \"{}\""), res, path_to_input); + } else { + auto base_resolution = resolutions.front(); + for (const auto& avail_res : resolutions) { + if (avail_res >= res) { + break; + } + if (res % avail_res == 0) { + base_resolution = avail_res; + } + } + SPDLOG_INFO(FMT_STRING("generating {} resolution from {} ({}x)"), res, base_resolution, + res / base_resolution); + } + } +} + +void zoomify_hic(const ZoomifyConfig& c) { + if (c.force) { + std::filesystem::remove(c.path_to_output); + } + print_zooming_plan_hic(c.path_to_input.string(), c.resolutions); + + const internal::TmpDir tmpdir{c.tmp_dir}; + hic::internal::HiCFileZoomify{c.path_to_input.string(), + c.path_to_output.string(), + c.resolutions, + c.threads, + c.batch_size, + tmpdir(), + c.compression_lvl} + .zoomify(); +} + +void zoomify_cooler(const ZoomifyConfig& c, bool output_is_multires) { if (output_is_multires) { - zoomify_many(c.input_uri, c.output_path, c.resolutions, c.copy_base_resolution, c.force); + zoomify_many_cooler(c.path_to_input.string(), c.path_to_output.string(), c.resolutions, + c.copy_base_resolution, 
c.force, c.compression_lvl); + return; + } + zoomify_once_cooler(c.path_to_input.string(), c.path_to_output.string(), c.resolutions.back(), + c.force, c.compression_lvl); +} + +int zoomify_subcmd(const ZoomifyConfig& c) { + const auto output_is_multires = c.copy_base_resolution || c.resolutions.size() > 2; + const auto t0 = std::chrono::system_clock::now(); + if (c.output_format == "hic") { + zoomify_hic(c); } else { - zoomify_once(c.input_uri, c.output_path, c.resolutions.back(), c.force); + zoomify_cooler(c, output_is_multires); } const auto t1 = std::chrono::system_clock::now(); const auto delta = diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index e01f2a88..e5f21c84 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -20,18 +20,7 @@ #include #include "hictk/common.hpp" - -namespace std { -template <> -struct default_delete { - void operator()(ZSTD_CCtx_s* ctx) const { ZSTD_freeCCtx(ctx); } // NOLINT -}; - -template <> -struct default_delete { - void operator()(ZSTD_DCtx_s* ctx) const { ZSTD_freeDCtx(ctx); } // NOLINT -}; -} // namespace std +#include "hictk/default_delete.hpp" namespace hictk::balancing { @@ -136,8 +125,13 @@ class SparseMatrixChunked { ~SparseMatrixChunked() noexcept; SparseMatrixChunked& operator=(const SparseMatrixChunked& other) = delete; - SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept( - noexcept_move_assignment_op()) = default; +#if defined(__GNUC__) && defined(__clang__) && __clang_major__ > 8 + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept = default; +#elif defined(__GNUC__) && __GNUC__ > 9 + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept = default; +#else + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) = default; +#endif [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t size() const noexcept; diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 6d3c4c1f..4c0076c8 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -184,7 +184,7 @@ inline bool BinTable::operator==(const BinTable &other) const { _table); } -inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); }; +inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); } template constexpr const BinTableT &BinTable::get() const { diff --git a/src/libhictk/common/CMakeLists.txt b/src/libhictk/common/CMakeLists.txt index 24a29892..8d3393c2 100644 --- a/src/libhictk/common/CMakeLists.txt +++ b/src/libhictk/common/CMakeLists.txt @@ -3,6 +3,8 @@ # SPDX-License-Identifier: MIT find_package(FMT REQUIRED) +find_package(libdeflate REQUIRED) +find_package(zstd REQUIRED) add_library(common INTERFACE) add_library(hictk::common ALIAS common) @@ -17,4 +19,6 @@ target_sources( target_include_directories(common INTERFACE "$" "$") -target_link_libraries(common INTERFACE fmt::fmt-header-only) +target_link_libraries( + common INTERFACE fmt::fmt-header-only "libdeflate::libdeflate_$,shared,static>" + "zstd::libzstd_$,shared,static>") diff --git a/src/libhictk/common/include/hictk/common.hpp b/src/libhictk/common/include/hictk/common.hpp index 9c77554d..30f800e4 100644 --- 
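// Standalone restatement of the base-resolution selection done by print_zooming_plan_hic() in
// zoomify.cpp above: for each target resolution, pick the coarsest smaller resolution in the
// request list that divides it evenly, falling back to the base (first) resolution. The values
// in the trailing comment are only an example.
#include <cstdint>
#include <vector>

static std::uint32_t pick_base_resolution(const std::vector<std::uint32_t>& resolutions,
                                          std::uint32_t target) {
  auto base = resolutions.front();  // resolutions are assumed to be sorted in ascending order
  for (const auto res : resolutions) {
    if (res >= target) {
      break;
    }
    if (target % res == 0) {
      base = res;  // keep the largest divisor encountered so far
    }
  }
  return base;
}

// pick_base_resolution({1000, 5000, 10000}, 50000) == 10000: 50 kbp is coarsened 5x from 10 kbp.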
a/src/libhictk/common/include/hictk/common.hpp +++ b/src/libhictk/common/include/hictk/common.hpp @@ -33,24 +33,6 @@ inline const std::string_view HICTK_VERSION_STRING_LONG{hictk::config::version:: #define HICTK_UNREACHABLE_CODE #endif -[[nodiscard]] constexpr bool noexcept_move_ctor() noexcept { -#if defined(__GNUC__) && !defined(__clang__) - return __GNUC__ > 7; -#else - return true; -#endif -} - -[[nodiscard]] constexpr bool noexcept_move_assignment_op() noexcept { -#if defined(__GNUC__) && defined(__clang__) - return __clang_major__ > 8; -#elif defined(__GNUC__) - return __GNUC__ > 9; -#else - return true; -#endif -} - [[noreturn]] inline void unreachable_code() { if constexpr (ndebug_not_defined()) { throw std::logic_error("Unreachable code"); diff --git a/src/libhictk/common/include/hictk/default_delete.hpp b/src/libhictk/common/include/hictk/default_delete.hpp new file mode 100644 index 00000000..a9b598b9 --- /dev/null +++ b/src/libhictk/common/include/hictk/default_delete.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include + +template <> +struct std::default_delete { + void operator()(ZSTD_CCtx_s* ctx) const { ZSTD_freeCCtx(ctx); } // NOLINT +}; + +template <> +struct std::default_delete { + void operator()(ZSTD_DCtx_s* ctx) const { ZSTD_freeDCtx(ctx); } // NOLINT +}; + +template <> +struct std::default_delete { + void operator()(libdeflate_compressor* compressor) const { + libdeflate_free_compressor(compressor); + } +}; + +template <> +struct std::default_delete { + void operator()(FILE* file) const { std::fclose(file); } // NOLINT +}; diff --git a/src/libhictk/cooler/include/hictk/cooler/common.hpp b/src/libhictk/cooler/include/hictk/cooler/common.hpp index 7a4412e1..bbe3ce5e 100644 --- a/src/libhictk/cooler/include/hictk/cooler/common.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/common.hpp @@ -38,7 +38,7 @@ inline constexpr std::array MANDATORY_DATASET_NAMES{ }; // clang-format on -inline constexpr std::uint_fast8_t DEFAULT_COMPRESSION_LEVEL = 6; +inline constexpr std::int32_t DEFAULT_COMPRESSION_LEVEL = 6; inline constexpr std::size_t DEFAULT_HDF5_CHUNK_SIZE = 64ULL << 10U; // 64KB inline constexpr double DEFAULT_HDF5_CACHE_W0 = 0.75; inline constexpr std::size_t DEFAULT_HDF5_DATASET_CACHE_SIZE = 1ULL << 20U; // 1MB diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index c10cb807..7b40dec8 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -106,7 +106,7 @@ class File { template File(RootGroup entrypoint, BinTable bins, PixelT pixel, Attributes attributes, - std::size_t cache_size_bytes, double w0); + std::size_t cache_size_bytes, std::uint32_t compression_lvl, double w0); // Ctor for SingleCellCooler template @@ -118,53 +118,70 @@ class File { File() = default; File(const File &other) = delete; - File(File &&other) noexcept(noexcept_move_ctor()) = default; // NOLINT + +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ > 7 + File(File &&other) noexcept = default; +#else + File(File &&other) = default; +#endif // Simple constructor. Open file in read-only mode. 
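// Why the std::default_delete specializations added in hictk/default_delete.hpp above are
// convenient: zstd and libdeflate handles can be owned by a plain std::unique_ptr without
// repeating a custom deleter at every call site. Minimal sketch; it relies only on the public
// ZSTD_createCCtx()/libdeflate_alloc_compressor() allocators and the deleters defined above.
#include <libdeflate.h>
#include <zstd.h>

#include <memory>

#include "hictk/default_delete.hpp"

int main() {
  // ZSTD_freeCCtx() and libdeflate_free_compressor() run automatically when the pointers go out
  // of scope, thanks to the std::default_delete specializations.
  const std::unique_ptr<ZSTD_CCtx_s> zstd_ctx{ZSTD_createCCtx()};
  const std::unique_ptr<libdeflate_compressor> compressor{libdeflate_alloc_compressor(9)};
  return (zstd_ctx && compressor) ? 0 : 1;
}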
Automatically detects pixel count type - explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); - explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); [[nodiscard]] static File open_random_access( - std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); [[nodiscard]] static File open_read_once(std::string_view uri, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * + 4, bool validate = true); template [[nodiscard]] static File create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists = false, Attributes attributes = Attributes::init(0), - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); template [[nodiscard]] static File create(std::string_view uri, BinTable bins, bool overwrite_if_exists = false, Attributes attributes = Attributes::init(0), - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); [[nodiscard]] static File open_random_access( - RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, bool validate = true); [[nodiscard]] static File open_read_once(RootGroup entrypoint, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * + 4, bool validate = true); template [[nodiscard]] static File create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes = Attributes::init(0), - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); template [[nodiscard]] static File create(RootGroup entrypoint, BinTable bins, Attributes attributes = Attributes::init(0), - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); ~File() noexcept; File &operator=(const File &other) = delete; - File &operator=(File &&other) noexcept(noexcept_move_assignment_op()) = default; // NOLINT +#if defined(__GNUC__) && defined(__clang__) && __clang_major__ > 8 + File &operator=(File &&other) noexcept = default; +#elif defined(__GNUC__) && __GNUC__ > 9 + File &operator=(File &&other) noexcept = default; +#else + File &operator=(File &&other) = default; +#endif [[nodiscard]] explicit operator bool() const noexcept; @@ -313,7 +330,8 @@ class File { -> GroupMap; template [[nodiscard]] static auto create_datasets(RootGroup &root_grp, const Reference &chroms, - std::size_t cache_size_bytes, double w0) -> DatasetMap; + std::size_t cache_size_bytes, + std::uint32_t compression_lvl, double w0) -> DatasetMap; static void write_standard_attributes(RootGroup 
&root_grp, const Attributes &attributes, bool skip_sentinel_attr = true); diff --git a/src/libhictk/cooler/include/hictk/cooler/dataset.hpp b/src/libhictk/cooler/include/hictk/cooler/dataset.hpp index 0eade655..5a6a616c 100644 --- a/src/libhictk/cooler/include/hictk/cooler/dataset.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/dataset.hpp @@ -73,8 +73,8 @@ class Dataset { template using const_iterator = iterator; - [[nodiscard]] static HighFive::DataSetCreateProps init_create_props( - std::uint_fast8_t compression_lvl, std::size_t chunk_size); + [[nodiscard]] static HighFive::DataSetCreateProps init_create_props(std::uint32_t compression_lvl, + std::size_t chunk_size); [[nodiscard]] static HighFive::DataSetAccessProps init_access_props(std::size_t chunk_size, std::size_t cache_size, double w0); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/dataset_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/dataset_impl.hpp index e8d7a951..4b36051e 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/dataset_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/dataset_impl.hpp @@ -54,12 +54,12 @@ template } // namespace internal -inline HighFive::DataSetCreateProps Dataset::init_create_props(std::uint_fast8_t compression_lvl, +inline HighFive::DataSetCreateProps Dataset::init_create_props(std::uint32_t compression_lvl, std::size_t chunk_size) { assert(chunk_size != 0); HighFive::DataSetCreateProps props{}; props.add(HighFive::Shuffle()); - props.add(HighFive::Deflate(conditional_static_cast(compression_lvl))); + props.add(HighFive::Deflate(compression_lvl)); props.add(HighFive::Chunking(chunk_size / sizeof(std::int32_t))); return props; } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index 4871135b..1e48481d 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -61,7 +61,7 @@ inline auto File::attributes() const noexcept -> const Attributes & { return _at inline HighFive::File File::file_handle() { return _root_group().getFile(); } -inline const HighFive::File& File::file_handle() const { return _root_group().getFile(); } +inline const HighFive::File &File::file_handle() const { return _root_group().getFile(); } inline auto File::group(std::string_view group_name) -> Group & { try { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index 1c922feb..7830f501 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -57,11 +57,13 @@ inline File::File(RootGroup entrypoint, unsigned int mode, std::size_t cache_siz template inline File::File(RootGroup entrypoint, BinTable bins, [[maybe_unused]] PixelT pixel, - Attributes attributes, std::size_t cache_size_bytes, double w0) + Attributes attributes, std::size_t cache_size_bytes, + std::uint32_t compression_lvl, double w0) : _mode(HighFive::File::ReadWrite), _root_group(std::move(entrypoint)), _groups(create_groups(_root_group)), - _datasets(create_datasets(_root_group, bins.chromosomes(), cache_size_bytes, w0)), + _datasets(create_datasets(_root_group, bins.chromosomes(), cache_size_bytes, + compression_lvl, w0)), _attrs(std::move(attributes)), _pixel_variant(PixelT(0)), _bins(std::make_shared(std::move(bins))), @@ -74,6 +76,14 @@ inline 
File::File(RootGroup entrypoint, BinTable bins, [[maybe_unused]] PixelT p write_chromosomes(); write_bin_table(); + if constexpr (std::is_floating_point_v) { + _attrs.sum = 0.0; + _attrs.cis = 0.0; + } else { + _attrs.sum = std::int64_t(0); + _attrs.cis = std::int64_t(0); + } + write_sentinel_attr(); } @@ -120,14 +130,15 @@ inline File File::open_read_once(std::string_view uri, std::size_t cache_size_by template inline File File::create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists, Attributes attributes, - std::size_t cache_size_bytes) { + std::size_t cache_size_bytes, std::uint32_t compression_lvl) { return File::create(uri, BinTable(chroms, bin_size), overwrite_if_exists, attributes, - cache_size_bytes); + cache_size_bytes, compression_lvl); } template inline File File::create(std::string_view uri, BinTable bins, bool overwrite_if_exists, - Attributes attributes, std::size_t cache_size_bytes) { + Attributes attributes, std::size_t cache_size_bytes, + std::uint32_t compression_lvl) { try { const auto [file_path, root_path] = parse_cooler_uri(uri); const auto uri_is_file_path = root_path.empty() || root_path == "/"; @@ -169,7 +180,7 @@ inline File File::create(std::string_view uri, BinTable bins, bool overwrite_if_ return create( open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, false), uri), bins, - attributes, cache_size_bytes); + attributes, cache_size_bytes, compression_lvl); } catch (const std::exception &e) { throw std::runtime_error( fmt::format(FMT_STRING("Cannot create cooler at the following URI: \"{}\". Reason: {}"), @@ -189,13 +200,15 @@ inline File File::open_read_once(RootGroup entrypoint, std::size_t cache_size_by template inline File File::create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, - Attributes attributes, std::size_t cache_size_bytes) { - return File::create(entrypoint, BinTable(chroms, bin_size), attributes, cache_size_bytes); + Attributes attributes, std::size_t cache_size_bytes, + std::uint32_t compression_lvl) { + return File::create(entrypoint, BinTable(chroms, bin_size), attributes, cache_size_bytes, + compression_lvl); } template inline File File::create(RootGroup entrypoint, BinTable bins, Attributes attributes, - std::size_t cache_size_bytes) { + std::size_t cache_size_bytes, std::uint32_t compression_lvl) { static_assert(std::is_arithmetic_v); if (std::holds_alternative>(bins.get())) { attributes.bin_type = "variable"; @@ -209,7 +222,8 @@ inline File File::create(RootGroup entrypoint, BinTable bins, Attributes attribu if (utils::is_cooler(entrypoint())) { throw std::runtime_error("URI points to an already existing cooler."); } - return File(entrypoint, bins, PixelT(0), attributes, cache_size_bytes, true); + return File(entrypoint, bins, PixelT(0), attributes, cache_size_bytes, compression_lvl, + DEFAULT_HDF5_CACHE_W0); } catch (const std::exception &e) { throw std::runtime_error( diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp index 066ec6a8..618b4d97 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp @@ -136,7 +136,7 @@ template inline void File::write_weights(std::string_view uri, std::string_view name, It first_weight, It last_weight, bool overwrite_if_exists, bool divisive) { File(open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, true), uri), - 
HighFive::File::ReadWrite, DEFAULT_HDF5_CACHE_SIZE, DEFAULT_HDF5_CACHE_W0, true) + HighFive::File::ReadWrite, DEFAULT_HDF5_CACHE_SIZE * 4, DEFAULT_HDF5_CACHE_W0, true) .write_weights(name, first_weight, last_weight, overwrite_if_exists, divisive); } @@ -222,7 +222,8 @@ inline auto File::create_groups(RootGroup &root_grp, Group chroms_grp, Group bin template inline auto File::create_datasets(RootGroup &root_grp, const Reference &chroms, - std::size_t cache_size_bytes, double w0) -> DatasetMap { + std::size_t cache_size_bytes, std::uint32_t compression_lvl, + double w0) -> DatasetMap { DatasetMap datasets(MANDATORY_DATASET_NAMES.size() + 1); const std::size_t num_pixel_datasets = 3; @@ -237,30 +238,33 @@ inline auto File::create_datasets(RootGroup &root_grp, const Reference &chroms, const auto pixels_aprop = Dataset::init_access_props( DEFAULT_HDF5_CHUNK_SIZE, ((std::max)(read_once_cache_size, pixel_dataset_cache_size)), w0); - auto create_dataset = [&](const auto &path, const auto &type, auto aprop) { + const auto default_cprop = Dataset::init_create_props(compression_lvl, DEFAULT_HDF5_CHUNK_SIZE); + + auto create_dataset = [&](const auto &path, const auto &type, auto aprop, auto cprop) { using T = remove_cvref_t; if constexpr (is_string_v) { const auto &chrom_with_longest_name = chroms.chromosome_with_longest_name(); datasets.emplace(path, Dataset{root_grp, path, chrom_with_longest_name.name(), - HighFive::DataSpace::UNLIMITED, aprop}); + HighFive::DataSpace::UNLIMITED, aprop, cprop}); } else { - datasets.emplace(path, Dataset{root_grp, path, type, HighFive::DataSpace::UNLIMITED, aprop}); + datasets.emplace(path, + Dataset{root_grp, path, type, HighFive::DataSpace::UNLIMITED, aprop, cprop}); } }; - create_dataset("chroms/name", std::string{}, default_aprop); - create_dataset("chroms/length", std::int32_t{}, default_aprop); + create_dataset("chroms/name", std::string{}, default_aprop, default_cprop); + create_dataset("chroms/length", std::int32_t{}, default_aprop, default_cprop); - create_dataset("bins/chrom", std::int32_t{}, default_aprop); - create_dataset("bins/start", std::int32_t{}, default_aprop); - create_dataset("bins/end", std::int32_t{}, default_aprop); + create_dataset("bins/chrom", std::int32_t{}, default_aprop, default_cprop); + create_dataset("bins/start", std::int32_t{}, default_aprop, default_cprop); + create_dataset("bins/end", std::int32_t{}, default_aprop, default_cprop); - create_dataset("pixels/bin1_id", std::int64_t{}, pixels_aprop); - create_dataset("pixels/bin2_id", std::int64_t{}, pixels_aprop); - create_dataset("pixels/count", PixelT{}, pixels_aprop); + create_dataset("pixels/bin1_id", std::int64_t{}, pixels_aprop, default_cprop); + create_dataset("pixels/bin2_id", std::int64_t{}, pixels_aprop, default_cprop); + create_dataset("pixels/count", PixelT{}, pixels_aprop, default_cprop); - create_dataset("indexes/bin1_offset", std::int64_t{}, default_aprop); - create_dataset("indexes/chrom_offset", std::int64_t{}, default_aprop); + create_dataset("indexes/bin1_offset", std::int64_t{}, default_aprop, default_cprop); + create_dataset("indexes/chrom_offset", std::int64_t{}, default_aprop, default_cprop); assert(datasets.size() == MANDATORY_DATASET_NAMES.size()); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp index 1f2e7829..8ddd2656 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp +++ 
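// User-facing effect of the compression_lvl plumbing added to cooler::File::create() above:
// callers can now choose the gzip level applied to every HDF5 dataset of a new cooler. A sketch:
// the URI, chromosomes and chosen level are hypothetical, the argument order follows the
// declarations in cooler.hpp from this changeset, the pixel count type is given explicitly as
// std::int32_t, and Attributes::init() is written as it appears in those (angle-bracket-stripped)
// declarations; in the real header it may take the pixel type as a template argument.
#include <cstdint>

#include "hictk/cooler/cooler.hpp"
#include "hictk/reference.hpp"

static hictk::cooler::File create_heavily_compressed_cooler(const hictk::Reference& chroms) {
  constexpr std::uint32_t bin_size = 10'000;
  return hictk::cooler::File::create<std::int32_t>(
      "example.cool", chroms, bin_size, /*overwrite_if_exists=*/true,
      hictk::cooler::Attributes::init(bin_size), hictk::cooler::DEFAULT_HDF5_CACHE_SIZE * 4,
      /*compression_lvl=*/9);
}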
b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp @@ -130,6 +130,13 @@ constexpr const std::vector& MultiResFile::resolutions() const no constexpr const MultiResAttributes& MultiResFile::attributes() const noexcept { return _attrs; } inline File MultiResFile::open(std::uint32_t resolution) const { + const auto match = std::find(resolutions().begin(), resolutions().end(), resolution); + + if (match == resolutions().end()) { + throw std::runtime_error( + fmt::format(FMT_STRING("file \"{}\" does not contain interactions for resolution {}"), + path(), resolution)); + } return File( RootGroup{(*_root_grp)().getGroup(fmt::format(FMT_STRING("/resolutions/{}"), resolution))}); } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 8406b243..0de57c59 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -148,7 +148,9 @@ inline File SingleCellFile::open(std::string_view cell) const { } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) template -inline File SingleCellFile::create_cell(std::string_view cell, Attributes attrs) { +inline File SingleCellFile::create_cell(std::string_view cell, Attributes attrs, + std::size_t cache_size_bytes, + std::uint32_t compression_lvl) { if (_cells.contains(cell)) { throw std::runtime_error(fmt::format( FMT_STRING("failed to create cell \"{}\" in file {}: cell already exists"), cell, path())); @@ -179,9 +181,9 @@ inline File SingleCellFile::create_cell(std::string_view cell, Attributes attrs) File::create_groups(entrypoint, Group{*_root_grp, _root_grp->group.getGroup("/chroms")}, Group{*_root_grp, _root_grp->group.getGroup("/bins")}); - create_cell_datasets(entrypoint, DEFAULT_HDF5_CACHE_SIZE, DEFAULT_HDF5_CACHE_W0); + create_cell_datasets(entrypoint, cache_size_bytes, compression_lvl, DEFAULT_HDF5_CACHE_W0); - return {entrypoint, N{}, std::move(attrs), DEFAULT_HDF5_CACHE_SIZE, DEFAULT_HDF5_CACHE_W0}; + return {entrypoint, N{}, std::move(attrs), cache_size_bytes, DEFAULT_HDF5_CACHE_W0}; } inline SingleCellFile::operator bool() const noexcept { return !!_root_grp; } @@ -201,7 +203,8 @@ inline const HighFive::File& SingleCellFile::file_handle() const { template inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_exists, - std::size_t chunk_size, std::size_t update_frequency) const { + std::uint32_t compression_lvl, std::size_t chunk_size, + std::size_t update_frequency) const { if (_cells.size() == 1) { utils::copy(open(*_cells.begin()).uri(), uri); return File(uri); @@ -219,7 +222,8 @@ inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_ex tails.emplace_back(std::move(last)); } }); - utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency); + utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency, + compression_lvl); return File(uri); } @@ -341,7 +345,7 @@ inline void SingleCellFile::create_datasets(RootGroup& root_grp, const BinTable& template inline void SingleCellFile::create_cell_datasets(RootGroup& root_grp, std::size_t cache_size_bytes, - double w0) { + std::uint32_t compression_lvl, double w0) { const std::size_t num_pixel_datasets = 3; const std::size_t num_read_once_dataset = MANDATORY_DATASET_NAMES.size() - num_pixel_datasets; @@ -354,16 
+358,18 @@ inline void SingleCellFile::create_cell_datasets(RootGroup& root_grp, std::size_ const auto pixels_aprop = Dataset::init_access_props( DEFAULT_HDF5_CHUNK_SIZE, ((std::max)(read_once_cache_size, pixel_dataset_cache_size)), w0); - auto create_dataset = [&](const auto& path, const auto& type, auto aprop) { - Dataset{root_grp, path, type, HighFive::DataSpace::UNLIMITED, aprop}; + const auto default_cprop = Dataset::init_create_props(compression_lvl, DEFAULT_HDF5_CHUNK_SIZE); + + auto create_dataset = [&](const auto& path, const auto& type, auto aprop, auto cprop) { + Dataset{root_grp, path, type, HighFive::DataSpace::UNLIMITED, aprop, cprop}; }; - create_dataset("pixels/bin1_id", std::int64_t{}, pixels_aprop); - create_dataset("pixels/bin2_id", std::int64_t{}, pixels_aprop); - create_dataset("pixels/count", PixelT{}, pixels_aprop); + create_dataset("pixels/bin1_id", std::int64_t{}, pixels_aprop, default_cprop); + create_dataset("pixels/bin2_id", std::int64_t{}, pixels_aprop, default_cprop); + create_dataset("pixels/count", PixelT{}, pixels_aprop, default_cprop); - create_dataset("indexes/bin1_offset", std::int64_t{}, default_aprop); - create_dataset("indexes/chrom_offset", std::int64_t{}, default_aprop); + create_dataset("indexes/bin1_offset", std::int64_t{}, default_aprop, default_cprop); + create_dataset("indexes/chrom_offset", std::int64_t{}, default_aprop, default_cprop); } } // namespace hictk::cooler diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp index da0ce393..f26fe48d 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp @@ -89,7 +89,8 @@ inline void validate_chromosomes(const std::vector>& coolers) { template inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool overwrite_if_exists, - std::size_t chunk_size, std::size_t update_frequency) { + std::size_t chunk_size, std::size_t update_frequency, + std::uint32_t compression_lvl) { static_assert(std::is_constructible_v); assert(chunk_size != 0); try { @@ -116,7 +117,7 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o } merge(heads, tails, cooler::File(clrs.front().uri).bins(), dest_uri, overwrite_if_exists, - chunk_size, update_frequency); + chunk_size, update_frequency, compression_lvl); } catch (const std::exception& e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), std::distance(first_uri, last_uri), e.what())); @@ -126,14 +127,17 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o template inline void merge(const std::vector& heads, const std::vector& tails, const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists, - std::size_t chunk_size, std::size_t update_frequency) { + std::size_t chunk_size, std::size_t update_frequency, + std::uint32_t compression_lvl) { using N = remove_cvref_tcount)>; hictk::transformers::PixelMerger merger{heads, tails}; std::vector> buffer(chunk_size); buffer.clear(); - auto dest = File::create(dest_uri, bins, overwrite_if_exists); + auto dest = + File::create(dest_uri, bins, overwrite_if_exists, Attributes::init(bins.bin_size()), + DEFAULT_HDF5_CACHE_SIZE * 4, compression_lvl); std::size_t pixels_processed{}; auto t0 = std::chrono::steady_clock::now(); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp 
b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp index 512a3034..18659ea0 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp @@ -338,7 +338,7 @@ inline ValidationStatusScool is_scool_file(const HighFive::File &fp, bool valida return false; } - File clr(uri, DEFAULT_HDF5_CACHE_SIZE, false); + File clr(uri, DEFAULT_HDF5_CACHE_SIZE * 4, false); const auto bin1_dset = clr.dataset("indexes/bin1_offset"); const auto bin2_dset = clr.dataset("pixels/bin2_id"); diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 3518a739..436b1747 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -76,7 +76,9 @@ class SingleCellFile { [[nodiscard]] constexpr const SingleCellAttributes& attributes() const noexcept; [[nodiscard]] File open(std::string_view cell) const; template - File create_cell(std::string_view cell, Attributes attrs = Attributes::init(0)); + File create_cell(std::string_view cell, Attributes attrs = Attributes::init(0), + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); [[nodiscard]] explicit operator bool() const noexcept; [[nodiscard]] std::string path() const; @@ -89,6 +91,7 @@ class SingleCellFile { template File aggregate(std::string_view uri, bool overwrite_if_exists = false, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL, std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000) const; private: @@ -102,7 +105,8 @@ class SingleCellFile { static void write_standard_attributes(RootGroup& root_grp, const SingleCellAttributes& attrs); template - static void create_cell_datasets(RootGroup& root_grp, std::size_t cache_size_bytes, double w0); + static void create_cell_datasets(RootGroup& root_grp, std::size_t cache_size_bytes, + std::uint32_t compression_lvl, double w0); }; } // namespace hictk::cooler diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index 3d7d8db4..369fef71 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -17,15 +17,17 @@ namespace hictk::cooler::utils { -/// Iterable of hictk::File or strings +/// Iterable of strings template void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool overwrite_if_exists = false, - std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000); + std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); template void merge(const std::vector& heads, const std::vector& tails, const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists = false, - std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000); + std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000, + std::uint32_t compression_lvl = DEFAULT_COMPRESSION_LEVEL); [[nodiscard]] bool equal(std::string_view uri1, std::string_view uri2, bool ignore_attributes = true); diff --git a/src/libhictk/cooler/include/hictk/cooler/validation.hpp b/src/libhictk/cooler/include/hictk/cooler/validation.hpp index a88644b0..afa96d8f 100644 --- a/src/libhictk/cooler/include/hictk/cooler/validation.hpp +++ 
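// Usage sketch for the extended cooler::utils::merge() declared above: the new trailing
// compression_lvl parameter (defaulting to DEFAULT_COMPRESSION_LEVEL) controls the gzip level of
// the merged cooler. Input and output names and the chosen values are hypothetical; the argument
// order matches the declaration in cooler/utils.hpp from this changeset.
#include <string>
#include <vector>

#include "hictk/cooler/utils.hpp"

static void merge_three_coolers() {
  const std::vector<std::string> uris{"sample1.cool", "sample2.cool", "sample3.cool"};
  hictk::cooler::utils::merge(uris.begin(), uris.end(), "merged.cool",
                              /*overwrite_if_exists=*/true, /*chunk_size=*/500'000,
                              /*update_frequency=*/10'000'000, /*compression_lvl=*/9);
}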
b/src/libhictk/cooler/include/hictk/cooler/validation.hpp @@ -82,15 +82,15 @@ struct ValidationStatusScool : public internal::ValidationStatusBase { namespace fmt { template <> struct formatter { - static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; + static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; - static auto format(const hictk::cooler::utils::ValidationStatusCooler& s, - format_context& ctx) -> format_context::iterator; + static auto format(const hictk::cooler::utils::ValidationStatusCooler& s, format_context& ctx) + -> format_context::iterator; }; template <> struct formatter { - static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; + static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; static auto format(const hictk::cooler::utils::ValidationStatusMultiresCooler& s, format_context& ctx) -> format_context::iterator; @@ -98,10 +98,10 @@ struct formatter { template <> struct formatter { - static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; + static constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator; - static auto format(const hictk::cooler::utils::ValidationStatusScool& s, - format_context& ctx) -> format_context::iterator; + static auto format(const hictk::cooler::utils::ValidationStatusScool& s, format_context& ctx) + -> format_context::iterator; }; } // namespace fmt diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index b43880c0..54ae0499 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -221,7 +221,7 @@ inline std::string File::uri() const { [&](auto& fp) { using T = std::decay_t; if constexpr (std::is_same_v) { - return fp.url(); + return fp.path(); } else { return fp.uri(); } @@ -234,7 +234,7 @@ inline std::string File::path() const { [&](auto& fp) { using T = std::decay_t; if constexpr (std::is_same_v) { - return fp.url(); + return fp.path(); } else { return fp.path(); } diff --git a/src/libhictk/hic/CMakeLists.txt b/src/libhictk/hic/CMakeLists.txt index efae5a13..9a925512 100644 --- a/src/libhictk/hic/CMakeLists.txt +++ b/src/libhictk/hic/CMakeLists.txt @@ -2,9 +2,13 @@ # # SPDX-License-Identifier: MIT +find_package(bshoshany-thread-pool REQUIRED) +find_package(concurrentqueue REQUIRED) find_package(FMT REQUIRED) find_package(libdeflate REQUIRED) find_package(phmap REQUIRED) +find_package(readerwriterqueue REQUIRED) +find_package(zstd REQUIRED) if(HICTK_WITH_EIGEN) find_package(Eigen3 QUIET REQUIRED) @@ -27,12 +31,17 @@ target_link_libraries( INTERFACE hictk::balancing hictk::common hictk::chromosome - hictk::reference) + hictk::reference + hictk::transformers) target_link_system_libraries( hic INTERFACE + bshoshany-thread-pool::bshoshany-thread-pool + concurrentqueue::concurrentqueue "$<$:Eigen3::Eigen>" fmt::fmt-header-only "libdeflate::libdeflate_$,shared,static>" - phmap) + phmap + readerwriterqueue::readerwriterqueue + "zstd::libzstd_$,shared,static>") diff --git a/src/libhictk/hic/include/hictk/hic.hpp b/src/libhictk/hic/include/hictk/hic.hpp index d5b123ba..8e78e51c 100644 --- a/src/libhictk/hic/include/hictk/hic.hpp +++ b/src/libhictk/hic/include/hictk/hic.hpp @@ -49,7 +49,7 @@ class File { MatrixUnit unit_ = MatrixUnit::BP, std::uint64_t block_cache_capacity = 0); [[nodiscard]] bool has_resolution(std::uint32_t 
resolution) const; - [[nodiscard]] const std::string &url() const noexcept; + [[nodiscard]] const std::string &path() const noexcept; [[nodiscard]] const std::string &name() const noexcept; [[nodiscard]] std::int32_t version() const noexcept; [[nodiscard]] const Reference &chromosomes() const noexcept; @@ -85,6 +85,13 @@ class File { std::uint64_t first_bin2, std::uint64_t last_bin2, balancing::Method norm = balancing::Method::NONE()) const; + [[nodiscard]] balancing::Weights normalization(balancing::Method norm, + const Chromosome &chrom) const; + [[nodiscard]] balancing::Weights normalization(std::string_view norm, + const Chromosome &chrom) const; + [[nodiscard]] balancing::Weights normalization(balancing::Method norm) const; + [[nodiscard]] balancing::Weights normalization(std::string_view norm) const; + [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); diff --git a/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp b/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp new file mode 100644 index 00000000..00a992b7 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/binary_buffer.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace hictk::hic::internal { + +class BinaryBuffer { + std::string _buffer{}; + std::size_t _i{}; + + public: + BinaryBuffer() = default; + + // NOLINTNEXTLINE + template ::value>::type* = nullptr> + T read(); + template ::value>::type* = nullptr> + void read(T& buff); + template ::value>::type* = nullptr> + void read(std::vector& data); + void read(std::string& buff, std::size_t n); + void read(char* buff, std::size_t n); + std::string getline(char delim = '\n'); + // NOLINTNEXTLINE + template ::value>::type* = nullptr> + void write(T data); + void write(const std::string& data, bool add_nullterm = true); + template ::value>::type* = nullptr> + void write(const std::vector& data); + + // Return the offset of the underlying buffer. Useful for error checking + [[nodiscard]] std::size_t operator()() const noexcept; + + // Reset and return ref to underlying buffer so that buff can be refilled + std::string& reset() noexcept; + + void clear() noexcept; + + [[nodiscard]] const std::string& get() const noexcept; +}; + +} // namespace hictk::hic::internal + +#include "./impl/binary_buffer_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/block_reader.hpp b/src/libhictk/hic/include/hictk/hic/block_reader.hpp index e46405c1..38a57202 100644 --- a/src/libhictk/hic/include/hictk/hic/block_reader.hpp +++ b/src/libhictk/hic/include/hictk/hic/block_reader.hpp @@ -9,12 +9,12 @@ #include #include #include -#include #include #include #include "hictk/bin_table.hpp" #include "hictk/chromosome.hpp" +#include "hictk/hic/binary_buffer.hpp" #include "hictk/hic/cache.hpp" #include "hictk/hic/file_reader.hpp" #include "hictk/hic/index.hpp" @@ -22,23 +22,6 @@ namespace hictk::hic::internal { -class BinaryBuffer { - std::string _buffer{}; - std::size_t _i{}; - - public: - BinaryBuffer() = default; - // NOLINTNEXTLINE - template ::value>::type* = nullptr> - T read(); - - // Return the offset of the underlying buffer. 
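// Small sketch of how the BinaryBuffer promoted to its own header above can be used to assemble
// a binary record before flushing it to disk. The record layout ("HIC" magic, version, offset)
// and the output path are hypothetical; only the write() and get() members from the declaration
// above are used, and read<T>()/reset() mirror this pattern when parsing.
#include <cstdint>
#include <fstream>
#include <string>

#include "hictk/hic/binary_buffer.hpp"

static void write_toy_record(const std::string& path) {
  hictk::hic::internal::BinaryBuffer buff{};

  buff.write(std::string{"HIC"});  // magic string; NULL-terminated since add_nullterm defaults to true
  buff.write(std::int32_t{9});     // format version
  buff.write(std::int64_t{0});     // placeholder for a footer offset

  std::ofstream ofs(path, std::ios::binary);
  ofs.write(buff.get().data(), static_cast<std::streamsize>(buff.get().size()));
}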
Useful for error checking - [[nodiscard]] std::size_t operator()() const noexcept; - - // Reset and return ref to underlying buffer so that buff can be refilled - std::string& reset() noexcept; -}; - class HiCBlockReader { std::shared_ptr _hfs{}; std::shared_ptr _blk_cache{}; diff --git a/src/libhictk/hic/include/hictk/hic/cache.hpp b/src/libhictk/hic/include/hictk/hic/cache.hpp index d3d19c4c..80c38934 100644 --- a/src/libhictk/hic/include/hictk/hic/cache.hpp +++ b/src/libhictk/hic/include/hictk/hic/cache.hpp @@ -14,13 +14,12 @@ #include #include #include -#include #include "hictk/balancing/methods.hpp" #include "hictk/balancing/weights.hpp" #include "hictk/chromosome.hpp" #include "hictk/hash.hpp" -#include "hictk/pixel.hpp" +#include "hictk/hic/interaction_block.hpp" namespace hictk::hic::internal { struct BlockID { @@ -40,48 +39,6 @@ struct std::hash { namespace hictk::hic::internal { -class InteractionBlock { - public: - using Row = std::vector>; - - private: - using BuffT = std::vector>; - std::size_t _id{}; - BuffT _interactions{}; - - public: - using iterator = BuffT::iterator; - using const_iterator = BuffT::const_iterator; - - InteractionBlock() = default; - InteractionBlock(std::size_t id_, std::size_t block_bin_count, - std::vector> pixels); - - friend constexpr bool operator<(const InteractionBlock& a, const InteractionBlock& b) noexcept; - friend constexpr bool operator==(const InteractionBlock& a, const InteractionBlock& b) noexcept; - friend constexpr bool operator!=(const InteractionBlock& a, const InteractionBlock& b) noexcept; - - friend constexpr bool operator<(const InteractionBlock& a, std::size_t b_id) noexcept; - friend constexpr bool operator==(const InteractionBlock& a, std::size_t b_id) noexcept; - friend constexpr bool operator!=(const InteractionBlock& a, std::size_t b_id) noexcept; - - friend constexpr bool operator<(std::size_t a_id, const InteractionBlock& b) noexcept; - friend constexpr bool operator==(std::size_t a_id, const InteractionBlock& b) noexcept; - friend constexpr bool operator!=(std::size_t a_id, const InteractionBlock& b) noexcept; - - [[nodiscard]] auto operator()() const noexcept -> const BuffT&; - - [[nodiscard]] auto begin() const noexcept -> const_iterator; - [[nodiscard]] auto end() const noexcept -> const_iterator; - - [[nodiscard]] auto cbegin() const noexcept -> const_iterator; - [[nodiscard]] auto cend() const noexcept -> const_iterator; - - [[nodiscard]] std::size_t id() const noexcept; - - [[nodiscard]] std::size_t size() const noexcept; -}; - class BlockCache { using Value = std::shared_ptr; std::queue _queue{}; diff --git a/src/libhictk/hic/include/hictk/hic/expected_values_aggregator.hpp b/src/libhictk/hic/include/hictk/hic/expected_values_aggregator.hpp new file mode 100644 index 00000000..8a9bbab0 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/expected_values_aggregator.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +// IWYU pragma: private, include "hictk/hic.hpp" + +#include +#include + +#include +#include +#include +#include + +#include "hictk/bin_table.hpp" +#include "hictk/chromosome.hpp" +#include "hictk/pixel.hpp" +#include "hictk/reference.hpp" + +namespace hictk::hic::internal { + +class ExpectedValuesAggregator { + std::shared_ptr _bins{}; + std::size_t _num_bins_gw{}; + + using CisKey = Chromosome; + using TransKey = std::pair; + phmap::flat_hash_map _cis_sum{}; + phmap::flat_hash_map _trans_sum{}; + + std::vector _possible_distances{}; + 
std::vector _actual_distances{}; + + std::vector _weights{}; + phmap::btree_map _scaling_factors{}; + + public: + ExpectedValuesAggregator() = default; + explicit ExpectedValuesAggregator(std::shared_ptr bins); + void add(const ThinPixel& p); + void add(const Pixel& p); + + void compute_density(); + + [[nodiscard]] const std::vector& weights() const noexcept; + + [[nodiscard]] double scaling_factor(const Chromosome& chrom) const; + [[nodiscard]] const phmap::btree_map& scaling_factors() const noexcept; + + private: + [[nodiscard]] const Reference& chromosomes() const noexcept; + + inline void init_possible_distances(); + void compute_density_cis(); + void compute_density_trans(); + + [[nodiscard]] double at(const Chromosome& chrom) const; + [[nodiscard]] double at(const Chromosome& chrom1, const Chromosome& chrom2) const; + + [[nodiscard]] double& at(const Chromosome& chrom); + [[nodiscard]] double& at(const Chromosome& chrom1, const Chromosome& chrom2); +}; + +} // namespace hictk::hic::internal + +#include "./impl/expected_values_aggregator_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/file_reader.hpp b/src/libhictk/hic/include/hictk/hic/file_reader.hpp index 6192d40c..14fb9126 100644 --- a/src/libhictk/hic/include/hictk/hic/file_reader.hpp +++ b/src/libhictk/hic/include/hictk/hic/file_reader.hpp @@ -36,7 +36,7 @@ class HiCFileReader { public: HiCFileReader() = default; explicit HiCFileReader(std::string url); - [[nodiscard]] inline const std::string &url() const noexcept; + [[nodiscard]] inline const std::string &path() const noexcept; [[nodiscard]] const HiCHeader &header() const noexcept; [[nodiscard]] std::int32_t version() const noexcept; @@ -64,6 +64,7 @@ class HiCFileReader { [[nodiscard]] std::vector list_avail_normalizations( MatrixType matrix_type, MatrixUnit wanted_unit, std::uint32_t wanted_resolution); + [[nodiscard]] std::vector list_avail_normalizations_v9(); [[nodiscard]] static MatrixType readMatrixType(filestream::FileStream &fs, std::string &buff); [[nodiscard]] static balancing::Method readNormalizationMethod(filestream::FileStream &fs, diff --git a/src/libhictk/hic/include/hictk/hic/file_writer.hpp b/src/libhictk/hic/include/hictk/hic/file_writer.hpp new file mode 100644 index 00000000..9f8a5c05 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/file_writer.hpp @@ -0,0 +1,276 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +// IWYU pragma: private, include "hictk/hic.hpp" + +#if __has_include() +#include +#else +#include +#endif +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/balancing/weights.hpp" +#include "hictk/bin_table.hpp" +#include "hictk/default_delete.hpp" +#include "hictk/hash.hpp" +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/expected_values_aggregator.hpp" +#include "hictk/hic/file_writer_data_structures.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/hic/footer.hpp" +#include "hictk/hic/header.hpp" +#include "hictk/hic/interaction_block.hpp" +#include "hictk/hic/interaction_to_block_mapper.hpp" +#include "hictk/tmpdir.hpp" + +namespace hictk::hic::internal { + +class HiCSectionOffsets { + std::streamoff _position{}; + std::size_t _size{}; + + public: + HiCSectionOffsets() = default; + template + HiCSectionOffsets(I1 start_, I2 size_); + + [[nodiscard]] std::streamoff start() const noexcept; + [[nodiscard]] std::streamoff end() const noexcept; + [[nodiscard]] std::size_t 
size() const noexcept; + [[nodiscard]] std::size_t& size() noexcept; +}; + +struct BlockIndexKey { + Chromosome chrom1; + Chromosome chrom2; + std::uint32_t resolution; + + [[nodiscard]] bool operator<(const BlockIndexKey& other) const noexcept; +}; + +class MatrixBodyMetadataTank { + public: + struct Key { + Chromosome chrom1{}; + Chromosome chrom2{}; + + bool operator==(const Key& other) const noexcept; + }; + + private: + phmap::flat_hash_map _tank{}; + phmap::flat_hash_map _offsets{}; + + public: + MatrixBodyMetadataTank() = default; + + [[nodiscard]] bool contains(const Chromosome& chrom1, const Chromosome& chrom2) const noexcept; + [[nodiscard]] auto at(const Chromosome& chrom1, const Chromosome& chrom2) const + -> const MatrixBodyMetadata&; + [[nodiscard]] HiCSectionOffsets offset(const Chromosome& chrom1, const Chromosome& chrom2) const; + + void insert(const Chromosome& chrom1, const Chromosome& chrom2, MatrixMetadata matrix_metadata, + MatrixResolutionMetadata matrix_resolution_metadata); + void update_offsets(const Chromosome& chrom1, const Chromosome& chrom2, std::streamoff position, + std::size_t size); + + void remove(const Chromosome& chrom1, const Chromosome& chrom2); + + auto operator()() const noexcept -> const phmap::flat_hash_map&; +}; + +class HiCFileWriter { + struct Stats { + double sum{}; + std::uint64_t nnz{}; + }; + + filestream::FileStream _fs{}; + std::filesystem::path _tmpdir{}; + + using BinTables = phmap::flat_hash_map>; + using BlockIndex = phmap::btree_map>; + using BlockMappers = phmap::flat_hash_map; + + HiCHeader _header{}; + BinTables _bin_tables{}; + BlockIndex _block_index{}; + BlockMappers _block_mappers{}; + + using StatsTank = phmap::flat_hash_map; + using FooterTank = phmap::btree_map, FooterMasterIndex>; + + MatrixBodyMetadataTank _matrix_metadata{}; + FooterTank _footers{}; + StatsTank _stats{}; + + std::uint32_t _compression_lvl{}; + BinaryBuffer _bbuffer{}; + std::unique_ptr _compressor{}; + std::string _compression_buffer{}; + + phmap::btree_set _normalized_expected_values{}; + phmap::btree_map> _normalization_vectors{}; + + HiCSectionOffsets _header_section{}; + HiCSectionOffsets _data_block_section{}; + HiCSectionOffsets _body_metadata_section{}; + HiCSectionOffsets _footer_section{}; + HiCSectionOffsets _expected_values_section{}; + HiCSectionOffsets _expected_values_norm_section{}; + HiCSectionOffsets _norm_vector_index_section{}; + HiCSectionOffsets _norm_vectors_section{}; + + BS::thread_pool _tpool{}; + + static constexpr std::uint32_t DEFAULT_CHROM_ALL_SCALE_FACTOR{1000}; + + public: + HiCFileWriter() = default; + explicit HiCFileWriter(std::string_view path_, std::size_t n_threads = 1); + HiCFileWriter(std::string_view path_, Reference chromosomes_, + std::vector resolutions_, std::string_view assembly_ = "unknown", + std::size_t n_threads = 1, std::size_t chunk_size = 10'000'000, + const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(), + std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000); + + [[nodiscard]] std::string_view path() const noexcept; + [[nodiscard]] const Reference& chromosomes() const noexcept; + [[nodiscard]] const BinTable& bins(std::uint32_t resolution) const; + [[nodiscard]] const std::vector& resolutions() const noexcept; + [[nodiscard]] auto stats(std::uint32_t resolution) const noexcept -> Stats; + + template >> + void add_pixels(std::uint32_t resolution, PixelIt first_pixel, PixelIt last_pixel); + + // Write normalization vectors + void add_norm_vector(const 
NormalizationVectorIndexBlock& blk, const std::vector& weights, + bool force_overwrite = false); + void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit, + std::uint32_t bin_size, const std::vector& weights, + bool force_overwrite = false, + std::size_t position = std::numeric_limits::max(), + std::size_t n_bytes = std::numeric_limits::max()); + void add_norm_vector(const NormalizationVectorIndexBlock& blk, const balancing::Weights& weights, + bool force_overwrite = false); + void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit, + std::uint32_t bin_size, const balancing::Weights& weights, + bool force_overwrite = false, + std::size_t position = std::numeric_limits::max(), + std::size_t n_bytes = std::numeric_limits::max()); + void add_norm_vector(std::string_view type, std::string_view unit, std::uint32_t bin_size, + const std::vector& weights, bool force_overwrite = false); + + void write_norm_vectors_and_norm_expected_values(); + + void serialize(); + + private: + [[nodiscard]] static HiCHeader read_header(filestream::FileStream& fs); + [[nodiscard]] static HiCHeader init_header(std::string_view path, Reference chromosomes, + std::vector resolutions, + std::string_view assembly); + [[nodiscard]] static auto init_bin_tables(const Reference& chromosomes, + const std::vector& resolutions) + -> BinTables; + [[nodiscard]] static auto init_interaction_block_mappers(const std::filesystem::path& root_folder, + const BinTables& bin_tables, + std::size_t chunk_size, + int compression_lvl) -> BlockMappers; + [[nodiscard]] BS::thread_pool init_tpool(std::size_t n_threads); + + // Write header + void write_header(); + void write_footer_offset(); + void write_norm_vector_index(); + + // Write pixels + void write_pixels(); + auto write_pixels(const Chromosome& chrom1, const Chromosome& chrom2) -> HiCSectionOffsets; + auto write_pixels(const Chromosome& chrom1, const Chromosome& chrom2, std::uint32_t resolution) + -> HiCSectionOffsets; + void write_all_matrix(std::uint32_t target_num_bins = 500); + + auto write_interaction_block(std::uint64_t block_id, const Chromosome& chrom1, + const Chromosome& chrom2, std::uint32_t resolution, + const MatrixInteractionBlock& blk) -> HiCSectionOffsets; + auto write_interaction_blocks(const Chromosome& chrom1, const Chromosome& chrom2, + std::uint32_t resolution) -> Stats; + + // Write body + void write_body_metadata(); + void add_body_metadata(std::uint32_t resolution, const Chromosome& chrom1, + const Chromosome& chrom2, const std::string& unit = "BP"); + + // Write footer + void write_footers(); + void add_footer(const Chromosome& chrom1, const Chromosome& chrom2); + void write_footer_size(); + + void write_empty_expected_values(); + void write_empty_normalized_expected_values(); + void compute_and_write_expected_values(); + void compute_and_write_normalized_expected_values(); + void write_norm_vectors(); + + void finalize(bool compute_expected_values = false); + + [[nodiscard]] std::size_t compute_block_column_count(const Chromosome& chrom1, + const Chromosome& chrom2, + std::uint32_t resolution); + [[nodiscard]] std::size_t compute_num_bins(const Chromosome& chrom1, const Chromosome& chrom2, + std::uint32_t resolution); + + [[nodiscard]] ExpectedValuesBlock compute_expected_values(std::uint32_t resolution); + [[nodiscard]] NormalizedExpectedValuesBlock compute_normalized_expected_values( + std::uint32_t resolution, const balancing::Method& norm); + + void add_norm_expected_values(const 
NormalizedExpectedValuesBlock& blk, + bool force_overwrite = false); + void read_norm_expected_values(); + void read_norm_vectors(); + [[nodiscard]] std::vector read_norm_vector(const NormalizationVectorIndexBlock& blk); + + void read_offsets(); + + // Methods to be called from worker threads + auto merge_and_compress_blocks_thr( + HiCInteractionToBlockMapper& mapper, std::mutex& mapper_mtx, + std::queue& block_id_queue, std::mutex& block_id_queue_mtx, + moodycamel::BlockingConcurrentQueue& block_queue, + phmap::flat_hash_map& serialized_block_tank, + std::mutex& serialized_block_tank_mtx, std::atomic& early_return, + std::uint64_t stop_token) -> Stats; + void write_compressed_blocks_thr( + const Chromosome& chrom1, const Chromosome& chrom2, std::uint32_t resolution, + std::queue& block_id_queue, std::mutex& block_id_queue_mtx, + phmap::flat_hash_map& serialized_block_tank, + std::mutex& serialized_block_tank_mtx, std::atomic& early_return, + std::uint64_t stop_token); +}; +} // namespace hictk::hic::internal + +template <> +struct std::hash { + inline std::size_t operator()( + hictk::hic::internal::MatrixBodyMetadataTank::Key const& k) const noexcept { + return hictk::internal::hash_combine(0, k.chrom1, k.chrom2); + } +}; + +#include "./impl/file_writer_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp b/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp new file mode 100644 index 00000000..a42016a0 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp @@ -0,0 +1,239 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +// IWYU pragma: private, include "hictk/hic.hpp" +#include +#include + +#include +#include +#include +#include +#include + +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#matrix-metadata +struct MatrixMetadata { + std::int32_t chr1Idx{}; + std::int32_t chr2Idx{}; + std::int32_t nResolutions{}; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; +}; + +struct MatrixBlockMetadata { + std::int32_t blockNumber{}; + std::int64_t blockPosition{}; + std::int32_t blockSizeBytes{}; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] bool operator<(const MatrixBlockMetadata& other) const noexcept; +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#resolution-zoom-level-metadata +struct MatrixResolutionMetadata { + std::string unit{}; + std::int32_t resIdx{}; + float sumCounts{}; + std::int32_t occupiedCellCount = 0; // Not used + float percent5 = 0; // Not used + float percent95 = 0; // Not used + std::int32_t binSize{}; + std::int32_t blockSize{}; + std::int32_t blockColumnCount{}; + std::int32_t blockCount{}; + + [[nodiscard]] bool operator<(const MatrixResolutionMetadata& other) const noexcept; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + + template + void set_block_metadata(It first_block, It last_block); + + private: + std::vector _block_metadata{}; +}; + +struct MatrixBodyMetadata { + MatrixMetadata matrixMetadata; + phmap::btree_set resolutionMetadata; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#blocks 
+template +struct MatrixInteractionBlock { + private: + using RowID = std::int32_t; + + struct Pixel { + std::int32_t column; + N count; + [[nodiscard]] bool operator<(const Pixel& other) const noexcept; + }; + using Row = phmap::btree_set; + + public: + std::int32_t nRecords{}; + std::int32_t binColumnOffset{std::numeric_limits::max()}; + std::int32_t binRowOffset{std::numeric_limits::max()}; + std::uint8_t useFloatContact{}; + std::uint8_t useIntXPos{}; + std::uint8_t useIntYPos{}; + std::uint8_t matrixRepresentation{}; + + std::int16_t w{}; + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] double sum() const noexcept; + + void emplace_back(hictk::Pixel&& p, std::uint32_t bin_id_offset = 0); + void finalize(); + + [[nodiscard]] auto operator()() const noexcept -> const phmap::btree_map&; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, libdeflate_compressor& compressor, + std::string& compression_buffer, bool clear = true) const; + + private: + double _sum{}; + phmap::btree_map _interactions; + + std::int32_t _min_col{std::numeric_limits::max()}; + std::int32_t _max_col{}; + + [[nodiscard]] std::size_t compute_size_lor_repr() const noexcept; + [[nodiscard]] std::size_t compute_size_dense_repr() const noexcept; + + [[nodiscard]] std::size_t compute_dense_width() const noexcept; + + [[nodiscard]] std::string serialize_lor(BinaryBuffer& buffer, libdeflate_compressor& compressor, + std::string& compression_buffer, bool clear = true) const; + [[nodiscard]] std::string serialize_dense(BinaryBuffer& buffer, libdeflate_compressor& compressor, + std::string& compression_buffer, + bool clear = true) const; + + static void compress(const std::string& buffer_in, std::string& buffer_out, + libdeflate_compressor& compressor); +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#master-index +struct FooterMasterIndex { + std::string key; + std::int64_t position; + std::int32_t size; + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; +}; + +struct ExpectedValuesBlock { + std::string unit{}; + std::int32_t binSize{}; + [[nodiscard]] std::int64_t nValues() const noexcept; + std::vector value{}; + [[nodiscard]] std::int32_t nChrScaleFactors() const noexcept; + std::vector chrIndex{}; + std::vector chrScaleFactor{}; + + ExpectedValuesBlock() = default; + ExpectedValuesBlock(std::string_view unit_, std::uint32_t bin_size, + const std::vector& weights, + const std::vector& chrom_ids, + const std::vector& scale_factors); + + [[nodiscard]] bool operator<(const ExpectedValuesBlock& other) const noexcept; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static ExpectedValuesBlock deserialize(filestream::FileStream& fs); +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#expected-value-vectors +class ExpectedValues { + phmap::btree_set _expected_values; + + public: + [[nodiscard]] std::int32_t nExpectedValueVectors() const noexcept; + [[nodiscard]] const phmap::btree_set& expectedValues() const noexcept; + void emplace(const ExpectedValuesBlock& evb, bool force_overwrite = false); + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static ExpectedValues deserialize(filestream::FileStream& fs); +}; + +struct NormalizedExpectedValuesBlock { + std::string type{}; + std::string unit{}; + std::int32_t binSize{}; + [[nodiscard]] std::int64_t nValues() const noexcept; + std::vector value{}; + [[nodiscard]] 
std::int32_t nChrScaleFactors() const noexcept; + std::vector chrIndex{}; + std::vector chrScaleFactor{}; + + NormalizedExpectedValuesBlock() = default; + NormalizedExpectedValuesBlock(std::string_view type_, std::string_view unit_, + std::uint32_t bin_size, const std::vector& weights, + const std::vector& chrom_ids, + const std::vector& scale_factors); + + [[nodiscard]] bool operator<(const NormalizedExpectedValuesBlock& other) const noexcept; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizedExpectedValuesBlock deserialize(filestream::FileStream& fs); +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalized-expected-value-vectors +class NormalizedExpectedValues { + phmap::btree_set _normalized_expected_values; + + public: + [[nodiscard]] std::int32_t nNormExpectedValueVectors() const noexcept; + [[nodiscard]] const phmap::btree_set& normExpectedValues() + const noexcept; + void emplace(const NormalizedExpectedValuesBlock& evb, bool force_overwrite = false); + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizedExpectedValues deserialize(filestream::FileStream& fs); +}; + +struct NormalizationVectorIndexBlock { + std::string type{}; + std::int32_t chrIdx{}; + std::string unit{}; + std::int32_t binSize{}; + std::int64_t position{}; + std::int64_t nBytes{}; + + private: + public: + NormalizationVectorIndexBlock() = default; + NormalizationVectorIndexBlock(std::string type_, std::uint32_t chrom_idx, std::string unit_, + std::uint32_t bin_size, std::size_t position_, std::size_t n_bytes); + + [[nodiscard]] bool operator<(const NormalizationVectorIndexBlock& other) const noexcept; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizationVectorIndexBlock deserialize(filestream::FileStream& fs); +}; + +// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-index +class NormalizationVectorIndex { + std::vector _norm_vect_idx{}; + + public: + [[nodiscard]] std::int32_t nNormVectors() const noexcept; + [[nodiscard]] const std::vector normalizationVectorIndex() + const noexcept; + void emplace_back(NormalizationVectorIndexBlock blk); + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static NormalizationVectorIndex deserialize(filestream::FileStream& fs); +}; + +} // namespace hictk::hic::internal + +#include "./impl/file_writer_data_structures_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/file_zoomify.hpp b/src/libhictk/hic/include/hictk/hic/file_zoomify.hpp new file mode 100644 index 00000000..718dd303 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/file_zoomify.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/hic/file_writer.hpp" +#include "hictk/hic/filestream.hpp" + +namespace hictk::hic::internal { + +class HiCFileZoomify { + std::string _path_to_input_hic{}; + + HiCFileWriter _hfw{}; + + public: + HiCFileZoomify(std::string_view input_hic, std::string_view output_hic, + const std::vector& resolutions, std::size_t n_threads = 1, + std::size_t chunk_size = 10'000'000, + const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(), + std::uint32_t compression_lvl = 9); + void zoomify(); + + private: + [[nodiscard]] static 
HiCFileWriter init_writer(std::string_view input_hic, + std::string_view output_hic, + const std::vector& resolution, + std::size_t n_threads, std::size_t chunk_size, + const std::filesystem::path& tmpdir, + std::uint32_t compression_lvl); + void init(); +}; + +} // namespace hictk::hic::internal +#include "./impl/file_zoomify_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/filestream.hpp b/src/libhictk/hic/include/hictk/hic/filestream.hpp index 93c3d27d..32eee07a 100644 --- a/src/libhictk/hic/include/hictk/hic/filestream.hpp +++ b/src/libhictk/hic/include/hictk/hic/filestream.hpp @@ -11,30 +11,45 @@ #include #include #include +#include #include #include namespace hictk::hic::internal::filestream { class FileStream { - std::string path_{}; - mutable std::ifstream handle_{}; - std::size_t file_size_{}; + std::string _path{}; + mutable std::ifstream _ifs{}; + mutable std::ofstream _ofs{}; + std::size_t _file_size{}; public: FileStream() = default; - explicit FileStream(std::string path); + explicit FileStream(std::string path, std::ios::openmode mode = std::ios::in); + static FileStream create(std::string path); [[nodiscard]] const std::string &path() const noexcept; - [[nodiscard]] const std::string &url() const noexcept; [[nodiscard]] std::size_t size() const; void seekg(std::streamoff offset, std::ios::seekdir way = std::ios::beg); [[nodiscard]] std::size_t tellg() const noexcept; + + void seekp(std::streamoff offset, std::ios::seekdir way = std::ios::beg); + [[nodiscard]] std::size_t tellp() const noexcept; + [[nodiscard]] bool eof() const noexcept; + void flush(); + void read(std::string &buffer, std::size_t count); void read(char *buffer, std::size_t count); + void read_append(std::string &buffer, std::size_t count); + + bool getline(std::string &buffer, char delim = '\n'); + [[nodiscard]] std::string getline(char delim = '\n'); + + void write(std::string_view buffer); + void write(const char *buffer, std::size_t count); // NOLINTNEXTLINE(modernize-type-traits) template ::value>::type * = nullptr> @@ -43,6 +58,10 @@ class FileStream { template ::value>::type * = nullptr> void read(T &buffer); + // NOLINTNEXTLINE(modernize-type-traits) + template ::value>::type * = nullptr> + void write(T buffer); + template , // NOLINTNEXTLINE(modernize-type-traits) typename std::enable_if::value>::type * = nullptr> @@ -61,17 +80,20 @@ class FileStream { // NOLINTNEXTLINE(modernize-type-traits) template ::value>::type * = nullptr> - [[nodiscard]] std::vector read(std::size_t size); + void write(const std::vector &buffer); - void append(std::string &buffer, std::size_t count); - - bool getline(std::string &buffer, char delim = '\n'); - [[nodiscard]] std::string getline(char delim = '\n'); + // NOLINTNEXTLINE(modernize-type-traits) + template ::value>::type * = nullptr> + [[nodiscard]] std::vector read(std::size_t size); private: - [[nodiscard]] std::streampos new_pos(std::streamoff offset, std::ios::seekdir way); - [[nodiscard]] static std::ifstream open_file(const std::string &path, - std::ifstream::openmode mode); + [[nodiscard]] std::streampos new_posg(std::streamoff offset, std::ios::seekdir way); + [[nodiscard]] std::streampos new_posp(std::streamoff offset, std::ios::seekdir way); + void update_file_size(); + [[nodiscard]] static std::ifstream open_file_read(const std::string &path, + std::ifstream::openmode mode); + [[nodiscard]] static std::ofstream open_file_write(const std::string &path, + std::ofstream::openmode mode); }; } // namespace hictk::hic::internal::filestream 
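The headers above only declare the new native .hic writer API; the sketch below shows how the pieces are intended to fit together. It is a minimal, hypothetical usage example based solely on the signatures declared in file_writer.hpp (the HiCFileWriter constructor, add_pixels() and serialize()) and on the Reference(names_first, names_last, sizes_first) constructor used elsewhere in this diff. It assumes add_pixels() accepts iterators over hictk::ThinPixel<float>; the file name, assembly label and chromosome sizes are made up for illustration.

#include <cstdint>
#include <string>
#include <vector>

#include "hictk/hic.hpp"    // per the IWYU pragma, exposes hictk::hic::internal::HiCFileWriter
#include "hictk/pixel.hpp"  // hictk::ThinPixel

int main() {
  // Hypothetical reference genome: two chromosomes with made-up sizes.
  const std::vector<std::string> names{"chr1", "chr2"};
  const std::vector<std::uint32_t> sizes{10'000'000, 5'000'000};
  const hictk::Reference chroms(names.begin(), names.end(), sizes.begin());

  // Resolutions to generate; the first one is used as the base resolution below.
  const std::vector<std::uint32_t> resolutions{10'000, 100'000};

  hictk::hic::internal::HiCFileWriter w{"example.hic", chroms, resolutions,
                                        "hg38", /*n_threads=*/4};

  // Interactions are added as (bin1_id, bin2_id, count) triplets at a given
  // resolution; a handful of pixels stand in for a real pixel stream here.
  const std::vector<hictk::ThinPixel<float>> pixels{
      {0, 0, 10.0F}, {0, 1, 5.0F}, {1, 100, 1.0F}};
  w.add_pixels(resolutions.front(), pixels.begin(), pixels.end());

  // serialize() finalizes the file: judging from the private write_*/finalize
  // helpers declared above, this writes the interaction blocks, body metadata,
  // footers, expected values and normalization vector sections.
  w.serialize();
  return 0;
}

The same writer underlies HiCFileZoomify, whose init_writer() helper (declared above) constructs a HiCFileWriter for the output path and resolution list before copying interactions over from the input file.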
diff --git a/src/libhictk/hic/include/hictk/hic/footer.hpp b/src/libhictk/hic/include/hictk/hic/footer.hpp index a9d13084..650c0d55 100644 --- a/src/libhictk/hic/include/hictk/hic/footer.hpp +++ b/src/libhictk/hic/include/hictk/hic/footer.hpp @@ -29,7 +29,7 @@ struct HiCFooterMetadata { std::uint32_t resolution{std::numeric_limits::max()}; Chromosome chrom1{}; Chromosome chrom2{}; - std::int64_t fileOffset{-1}; + std::int64_t matrixMetadataOffset{-1}; constexpr explicit operator bool() const noexcept; bool operator==(const HiCFooterMetadata &other) const noexcept; @@ -57,7 +57,7 @@ class HiCFooter { [[nodiscard]] constexpr HiCFooterMetadata &metadata() noexcept; [[nodiscard]] const Index &index() const noexcept; - [[nodiscard]] constexpr const std::string &url() const noexcept; + [[nodiscard]] constexpr const std::string &path() const noexcept; [[nodiscard]] constexpr MatrixType matrix_type() const noexcept; [[nodiscard]] balancing::Method normalization() const noexcept; [[nodiscard]] constexpr MatrixUnit unit() const noexcept; diff --git a/src/libhictk/hic/include/hictk/hic/header.hpp b/src/libhictk/hic/include/hictk/hic/header.hpp index 4493a980..33af3ae3 100644 --- a/src/libhictk/hic/include/hictk/hic/header.hpp +++ b/src/libhictk/hic/include/hictk/hic/header.hpp @@ -6,10 +6,14 @@ // IWYU pragma: private, include "hictk/hic.hpp" +#include + #include #include #include +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/filestream.hpp" #include "hictk/reference.hpp" namespace hictk::hic::internal { @@ -17,16 +21,20 @@ namespace hictk::hic::internal { struct HiCHeader { std::string url{}; std::int32_t version{-1}; - std::int64_t masterIndexOffset{-1}; + std::int64_t footerPosition{-1}; std::string genomeID{}; - std::int64_t nviPosition{-1}; - std::int64_t nviLength{-1}; + std::int64_t normVectorIndexPosition{-1}; + std::int64_t normVectorIndexLength{-1}; Reference chromosomes{}; std::vector resolutions{}; + phmap::flat_hash_map attributes{}; constexpr explicit operator bool() const noexcept; - bool operator==(const HiCHeader &other) const noexcept; - bool operator!=(const HiCHeader &other) const noexcept; + bool operator==(const HiCHeader& other) const noexcept; + bool operator!=(const HiCHeader& other) const noexcept; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const; + [[nodiscard]] static HiCHeader deserialize(filestream::FileStream& fs); }; } // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp new file mode 100644 index 00000000..02bc9421 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp @@ -0,0 +1,79 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace hictk::hic::internal { + +template ::value>::type *> +inline T BinaryBuffer::read() { + static_assert(sizeof(char) == 1); + assert(_i < _buffer.size()); + T x{}; + + std::memcpy(static_cast(&x), _buffer.data() + _i, sizeof(T)); + _i += sizeof(T); + return x; +} + +template ::value>::type *> +inline void BinaryBuffer::read(T &buff) { + buff = read(); +} + +template ::value>::type *> +inline void BinaryBuffer::read(std::vector &buff) { + read(reinterpret_cast(buff.data()), sizeof(T) * buff.size()); +} + +inline void BinaryBuffer::read(std::string &buff, std::size_t n) { + buff.resize(n); + read(buff.data(), n); +} + +inline void BinaryBuffer::read(char 
*buff, std::size_t n) { + static_assert(sizeof(char) == 1); + assert(_i < _buffer.size()); + std::memcpy(static_cast(buff), _buffer.data() + _i, n * sizeof(char)); + _i += sizeof(char); +} + +inline std::string BinaryBuffer::getline(char delim) { + std::string_view view{_buffer}; + const auto pos = view.substr(_i).find(delim); + return std::string{view.substr(0, pos)}; +} + +template ::value>::type *> +inline void BinaryBuffer::write(T data) { + static_assert(sizeof(char) == 1); + _buffer.append(reinterpret_cast(&data), sizeof(T)); +} + +inline void BinaryBuffer::write(const std::string &data, bool add_nullterm) { + _buffer.append(data.c_str(), data.size() + add_nullterm); +} + +template ::value>::type *> +void BinaryBuffer::write(const std::vector &data) { + _buffer.append(reinterpret_cast(data.data()), data.size() * sizeof(T)); +} + +inline std::size_t BinaryBuffer::operator()() const noexcept { return _i; } + +inline std::string &BinaryBuffer::reset() noexcept { + _buffer.clear(); + _i = 0; + return _buffer; +} + +inline void BinaryBuffer::clear() noexcept { std::ignore = reset(); } + +inline const std::string &BinaryBuffer::get() const noexcept { return _buffer; } + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/block_cache_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/block_cache_impl.hpp index 7ab77ecd..58218a9d 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/block_cache_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/block_cache_impl.hpp @@ -13,54 +13,6 @@ namespace hictk::hic::internal { -constexpr bool operator<(const InteractionBlock &a, const InteractionBlock &b) noexcept { - return a < b._id; -} -constexpr bool operator==(const InteractionBlock &a, const InteractionBlock &b) noexcept { - return a == b._id; -} -constexpr bool operator!=(const InteractionBlock &a, const InteractionBlock &b) noexcept { - return !(a == b); -} - -constexpr bool operator<(const InteractionBlock &a, std::size_t b_id) noexcept { - return a._id < b_id; -} -constexpr bool operator==(const InteractionBlock &a, std::size_t b_id) noexcept { - return a._id == b_id; -} -constexpr bool operator!=(const InteractionBlock &a, std::size_t b_id) noexcept { - return !(a == b_id); -} - -constexpr bool operator<(std::size_t a_id, const InteractionBlock &b) noexcept { - return a_id < b._id; -} -constexpr bool operator==(std::size_t a_id, const InteractionBlock &b) noexcept { - return a_id == b._id; -} -constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept { - return !(a_id == b); -} - -inline InteractionBlock::InteractionBlock(std::size_t id_, - [[maybe_unused]] std::size_t block_bin_count, - std::vector> pixels) - : _id(id_), _interactions(std::move(pixels)) {} - -inline auto InteractionBlock::operator()() const noexcept -> const BuffT & { return _interactions; } - -inline auto InteractionBlock::begin() const noexcept -> const_iterator { - return _interactions.begin(); -} -inline auto InteractionBlock::end() const noexcept -> const_iterator { return _interactions.end(); } -inline auto InteractionBlock::cbegin() const noexcept -> const_iterator { return begin(); } -inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } - -inline std::size_t InteractionBlock::id() const noexcept { return _id; } - -inline std::size_t InteractionBlock::size() const noexcept { return _interactions.size(); } - constexpr bool BlockID::operator==(const BlockID &other) const noexcept { return chrom1_id == other.chrom1_id && 
chrom2_id == other.chrom2_id && id == other.id; } diff --git a/src/libhictk/hic/include/hictk/hic/impl/block_reader_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/block_reader_impl.hpp index 8478134e..673c172d 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/block_reader_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/block_reader_impl.hpp @@ -24,25 +24,6 @@ namespace hictk::hic::internal { -template ::value>::type *> -inline T BinaryBuffer::read() { - static_assert(sizeof(char) == 1, ""); - assert(_i < _buffer.size()); - T x{}; - - std::memcpy(static_cast(&x), _buffer.data() + _i, sizeof(T)); - _i += sizeof(T); - return x; -} - -inline std::size_t BinaryBuffer::operator()() const noexcept { return _i; } - -inline std::string &BinaryBuffer::reset() noexcept { - _buffer.clear(); - _i = 0; - return _buffer; -} - inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, std::shared_ptr bins_, std::shared_ptr block_cache_) diff --git a/src/libhictk/hic/include/hictk/hic/impl/expected_values_aggregator_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/expected_values_aggregator_impl.hpp new file mode 100644 index 00000000..13eee1c7 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/expected_values_aggregator_impl.hpp @@ -0,0 +1,199 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/bin_table.hpp" +#include "hictk/chromosome.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +inline ExpectedValuesAggregator::ExpectedValuesAggregator(std::shared_ptr bins) + : _bins(std::move(bins)) { + SPDLOG_INFO(FMT_STRING("[{} bp] initializing expected value vector"), _bins->bin_size()); + std::uint32_t max_length = 0; + for (const auto &chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + + max_length = std::max(max_length, chrom.size()); + + _num_bins_gw += chrom.size(); + } + + const auto bin_size = _bins->bin_size(); + // round down to mimick HiCTools' behavior + const auto max_n_bins = max_length / bin_size; + _possible_distances.resize(max_n_bins, 0.0); + _actual_distances.resize(max_n_bins, 0.0); +} + +inline void ExpectedValuesAggregator::add(const ThinPixel &p) { + add(Pixel{*_bins, p}); +} + +inline void ExpectedValuesAggregator::add(const Pixel &p) { + if (std::isnan(p.count)) { + return; + } + + const auto &chrom1 = p.coords.bin1.chrom(); + const auto &chrom2 = p.coords.bin2.chrom(); + + if (p.coords.is_intra()) { + at(chrom1) += static_cast(p.count); + const auto i = p.coords.bin2.id() - p.coords.bin1.id(); + // skip last bin in chromosome if chromosome size is not a multiple of bin size + // this is done to mimick HiCTools' behavior + if (i < _actual_distances.size()) { + _actual_distances[i] += static_cast(p.count); + } + } else { + at(chrom1, chrom2) += static_cast(p.count); + } +} + +inline void ExpectedValuesAggregator::compute_density() { + SPDLOG_INFO(FMT_STRING("[{} bp] computing expected vector density"), _bins->bin_size()); + init_possible_distances(); + compute_density_cis(); + compute_density_trans(); +} + +inline const std::vector &ExpectedValuesAggregator::weights() const noexcept { + return _weights; +} + +inline double ExpectedValuesAggregator::scaling_factor(const Chromosome &chrom) const { + return _scaling_factors.at(chrom); +} + +inline const phmap::btree_map &ExpectedValuesAggregator::scaling_factors() + const noexcept { + return _scaling_factors; 
+} + +inline void ExpectedValuesAggregator::init_possible_distances() { + const auto bin_size = _bins->bin_size(); + + for (const auto &[chrom, _] : _cis_sum) { + if (chrom.is_all()) { + continue; + } + const auto n_bins = chrom.size() / bin_size; + for (std::uint32_t i = 0; i < n_bins; ++i) { + _possible_distances[i] += n_bins - i; + } + } +} + +inline void ExpectedValuesAggregator::compute_density_cis() { + // Re-implementation of the algorithm used by HiCTools: + // https://github.com/aidenlab/HiCTools/blob/6b2fab8e78685deae199c33bbb167dcab1dbfbb3/src/hic/tools/utils/original/ExpectedValueCalculation.java#L184 + + auto num_sum = _actual_distances.front(); + auto den_sum = _possible_distances.front(); + std::size_t bound1 = 0; + std::size_t bound2 = 0; + + const auto shot_noise_minimum = 400.0; + const auto max_num_bins = _actual_distances.size(); + + _weights.resize(max_num_bins); + std::fill(_weights.begin(), _weights.end(), 0.0); + + for (std::size_t ii = 0; ii < max_num_bins; ii++) { + if (num_sum < shot_noise_minimum) { + while (num_sum < shot_noise_minimum && ++bound2 < max_num_bins) { + num_sum += _actual_distances[bound2]; + den_sum += _possible_distances[bound2]; + } + } else if (num_sum >= shot_noise_minimum && bound2 - bound1 > 0) { + while (bound2 > bound1 && bound2 < _num_bins_gw && bound1 < _num_bins_gw && + num_sum - _actual_distances[bound1] - _actual_distances[bound2] >= + shot_noise_minimum) { + num_sum = num_sum - _actual_distances[bound1] - _actual_distances[bound2]; + den_sum = den_sum - _possible_distances[bound1] - _possible_distances[bound2]; + bound1++; + bound2--; + } + } + _weights[ii] = num_sum / den_sum; + if (bound2 + 2 < max_num_bins) { + num_sum += _actual_distances[bound2 + 1] + _actual_distances[bound2 + 2]; + den_sum += _possible_distances[bound2 + 1] + _possible_distances[bound2 + 2]; + bound2 += 2; + } else if (bound2 + 1 < max_num_bins) { + num_sum += _actual_distances[bound2 + 1]; + den_sum += _possible_distances[bound2 + 1]; + bound2++; + } + } + + for (const auto &[chrom, _] : _cis_sum) { + if (chrom.is_all()) { + continue; + } + auto num_chrom_bins = chrom.size() / _bins->bin_size(); + auto expected_count = 0.0; + for (std::size_t n = 0; n < num_chrom_bins; n++) { + if (n < max_num_bins) { + double v = _weights[n]; + expected_count += (double(num_chrom_bins) - double(n)) * v; + } + } + + double observed_count = _cis_sum.at(chrom); + double f = expected_count / observed_count; + _scaling_factors.emplace(chrom, f); + } +} + +inline void ExpectedValuesAggregator::compute_density_trans() { + for (auto &[k, v] : _trans_sum) { + // We round-down to match HiCTools behavior + const auto num_bins1 = k.first.size() / _bins->bin_size(); + const auto num_bins2 = k.second.size() / _bins->bin_size(); + const auto num_pixels = num_bins1 * num_bins2; + v = num_pixels != 0 ? 
v / static_cast(num_pixels) : 0.0; + } +} + +inline double ExpectedValuesAggregator::at(const Chromosome &chrom) const { + return _cis_sum.at(chrom); +} + +inline double ExpectedValuesAggregator::at(const Chromosome &chrom1, + const Chromosome &chrom2) const { + return _trans_sum.at(std::make_pair(chrom1, chrom2)); +} + +inline double &ExpectedValuesAggregator::at(const Chromosome &chrom) { + auto [it, _] = _cis_sum.try_emplace(chrom, 0.0); + return it->second; +} + +inline double &ExpectedValuesAggregator::at(const Chromosome &chrom1, const Chromosome &chrom2) { + auto [it, _] = _trans_sum.try_emplace(std::make_pair(chrom1, chrom2), 0.0); + return it->second; +} + +inline const Reference &ExpectedValuesAggregator::chromosomes() const noexcept { + assert(_bins); + return _bins->chromosomes(); +} + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp index 076343fa..fb880596 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp @@ -44,7 +44,7 @@ inline filestream::FileStream HiCFileReader::openStream(std::string url) { } } -inline const std::string &HiCFileReader::url() const noexcept { return _fs->url(); } +inline const std::string &HiCFileReader::path() const noexcept { return _fs->path(); } inline const HiCHeader &HiCFileReader::header() const noexcept { return *_header; } inline std::int32_t HiCFileReader::version() const noexcept { @@ -174,9 +174,7 @@ inline bool HiCFileReader::checkMagicString(filestream::FileStream &fs) { return fs.getline('\0') == "HIC"; } -inline std::int64_t HiCFileReader::masterOffset() const noexcept { - return _header->masterIndexOffset; -} +inline std::int64_t HiCFileReader::masterOffset() const noexcept { return _header->footerPosition; } inline auto HiCFileReader::init_decompressor() -> Decompressor { Decompressor zs(libdeflate_alloc_decompressor(), @@ -205,7 +203,7 @@ inline Index HiCFileReader::read_index(std::int64_t fileOffset, const Chromosome std::ignore = _fs->read(); // oldIndex const auto sumCount = _fs->read(); std::ignore = _fs->read(); // occupiedCellCount - std::ignore = _fs->read(); // stdDev + std::ignore = _fs->read(); // percent5 std::ignore = _fs->read(); // percent95 const auto foundResolution = static_cast(_fs->read()); @@ -214,9 +212,8 @@ inline Index HiCFileReader::read_index(std::int64_t fileOffset, const Chromosome const auto nBlocks = static_cast(_fs->read()); - Index::BlkIdxBuffer buffer; - buffer.reserve(nBlocks); if (wantedUnit == foundUnit && wantedResolution == foundResolution) { + Index::BlkIdxBuffer buffer(nBlocks); for (std::size_t j = 0; j < nBlocks; ++j) { const auto block_id = static_cast(_fs->read()); const auto position = static_cast(_fs->read()); @@ -245,79 +242,8 @@ inline Index HiCFileReader::read_index(std::int64_t fileOffset, const Chromosome inline bool HiCFileReader::checkMagicString() { return checkMagicString(*_fs); } -// reads the header, storing the positions of the normalization vectors and returning the -// masterIndexPosition pointer inline HiCHeader HiCFileReader::readHeader(filestream::FileStream &fs) { - if (!checkMagicString(fs)) { - throw std::runtime_error(fmt::format( - FMT_STRING("Hi-C magic string is missing. 
{} does not appear to be a hic file"), fs.url())); - } - - HiCHeader header{fs.url()}; - - fs.read(header.version); - if (header.version < 6) { - throw std::runtime_error(fmt::format( - FMT_STRING(".hic version 5 and older are no longer supported. Found version {}"), - header.version)); - } - fs.read(header.masterIndexOffset); - if (header.masterIndexOffset < 0 || - header.masterIndexOffset >= static_cast(fs.size())) { - throw std::runtime_error( - fmt::format(FMT_STRING("file appears to be corrupted: expected master index offset to " - "be between 0 and {}, found {}"), - fs.size(), header.masterIndexOffset)); - } - - fs.getline(header.genomeID, '\0'); - if (header.genomeID.empty()) { - header.genomeID = "unknown"; - } - - if (header.version > 8) { - fs.read(header.nviPosition); - fs.read(header.nviLength); - } - - const auto nAttributes = fs.read(); - - // reading and ignoring attribute-value dictionary - for (std::int32_t i = 0; i < nAttributes; i++) { - std::ignore = fs.getline('\0'); // key - std::ignore = fs.getline('\0'); // value - } - - // Read chromosomes - auto numChromosomes = static_cast(fs.read()); - std::vector chrom_names(numChromosomes); - std::vector chrom_sizes(numChromosomes); - for (std::size_t i = 0; i < chrom_names.size(); ++i) { - fs.getline(chrom_names[i], '\0'); - chrom_sizes[i] = static_cast( - header.version > 8 ? fs.read() - : static_cast(fs.read())); - } - - if (chrom_names.empty()) { - throw std::runtime_error("unable to read chromosomes"); - } - - header.chromosomes = Reference(chrom_names.begin(), chrom_names.end(), chrom_sizes.begin()); - - // Read resolutions - const auto numResolutions = static_cast(fs.read()); - if (numResolutions == 0) { - throw std::runtime_error("unable to read the list of available resolutions"); - } - header.resolutions.resize(numResolutions); - std::generate(header.resolutions.begin(), header.resolutions.end(), [&]() { - const auto res = fs.read(); - assert(res > 0); - return static_cast(res); - }); - - return header; + return HiCHeader::deserialize(fs); } inline void HiCFileReader::readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer) { @@ -375,6 +301,7 @@ inline std::int64_t HiCFileReader::read_footer_file_offset(std::string_view key) auto nEntries = _fs->read(); for (int i = 0; i < nEntries; i++) { const auto strbuff = _fs->getline('\0'); + assert(!strbuff.empty()); const auto fpos = _fs->read(); std::ignore = _fs->read(); // sizeInBytes if (strbuff == key) { @@ -492,6 +419,21 @@ inline void HiCFileReader::read_footer_norm(std::uint32_t chrom1_id, std::uint32 _fs->seekg(currentPos); } } + + if (!*weights1) { + const auto num_bins = + static_cast((chrom1.size() + wanted_resolution - 1) / wanted_resolution); + *weights1 = + balancing::Weights{std::vector(num_bins, std::numeric_limits::quiet_NaN()), + balancing::Weights::Type::DIVISIVE}; + } + if (!*weights2) { + const auto num_bins = + static_cast((chrom2.size() + wanted_resolution - 1) / wanted_resolution); + *weights2 = + balancing::Weights{std::vector(num_bins, std::numeric_limits::quiet_NaN()), + balancing::Weights::Type::DIVISIVE}; + } } inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32_t chrom2_id, @@ -508,7 +450,7 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 // clang-format off HiCFooterMetadata metadata{ - _fs->url(), + _fs->path(), matrix_type, wanted_norm, wanted_unit, @@ -522,15 +464,15 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 
_fs->seekg(masterOffset()); - metadata.fileOffset = read_footer_file_offset(key); - if (metadata.fileOffset == -1) { + metadata.matrixMetadataOffset = read_footer_file_offset(key); + if (metadata.matrixMetadataOffset == -1) { return {Index{}, std::move(metadata), {}, std::move(weights1), std::move(weights2)}; } const auto file_offset = _fs->tellg(); // NOTE: we read then move index to workaround assertion failures when compiling under MSVC - auto index = read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, - metadata.resolution); + auto index = read_index(metadata.matrixMetadataOffset, metadata.chrom1, metadata.chrom2, + metadata.unit, metadata.resolution); _fs->seekg(static_cast(file_offset)); if ((matrix_type == MT::observed && wanted_norm == NM::NONE()) || @@ -570,48 +512,67 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 read_footer_norm(chrom1_id, chrom2_id, wanted_norm, wanted_unit, wanted_resolution, metadata.chrom1, metadata.chrom2, weights1, weights2); - if (!*weights1 && !*weights2) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find {} normalization vectors for {}:{} at {} ({})"), - wanted_norm, _header->chromosomes.at(chrom1_id).name(), - _header->chromosomes.at(chrom2_id).name(), wanted_resolution, wanted_unit)); - } - - if (!*weights1 || !*weights2) { - const auto chrom_id = !*weights1 ? chrom1_id : chrom2_id; - throw std::runtime_error(fmt::format( - FMT_STRING("unable to find {} normalization vector for {} at {} ({})"), wanted_norm, - _header->chromosomes.at(chrom_id).name(), wanted_resolution, wanted_unit)); - } - return {std::move(index), std::move(metadata), std::move(expectedValues), std::move(weights1), std::move(weights2)}; } inline std::vector HiCFileReader::list_avail_normalizations( MatrixType matrix_type, MatrixUnit wanted_unit, std::uint32_t wanted_resolution) { + if (version() >= 9) { + return list_avail_normalizations_v9(); + } + phmap::flat_hash_set methods{}; _fs->seekg(masterOffset()); [[maybe_unused]] const auto offset = read_footer_file_offset("1_1"); assert(offset != -1); - std::ignore = read_footer_expected_values(1, 1, matrix_type, balancing::Method::NONE(), - wanted_unit, wanted_resolution); + const auto chrom_id = _header->chromosomes.longest_chromosome().id(); + std::ignore = read_footer_expected_values( + chrom_id, chrom_id, matrix_type, balancing::Method::NONE(), wanted_unit, wanted_resolution); if (_fs->tellg() == _fs->size()) { return {}; } - const auto nExpectedValues = _fs->read(); - for (std::int32_t i = 0; i < nExpectedValues; i++) { + std::ignore = read_footer_expected_values_norm( + chrom_id, chrom_id, matrix_type, balancing::Method::NONE(), wanted_unit, wanted_resolution); + if (_fs->tellg() == _fs->size()) { + return {}; + } + + const auto nNormVectors = _fs->read(); + for (std::int32_t i = 0; i < nNormVectors; i++) { const auto foundNorm = readNormalizationMethod(); methods.emplace(foundNorm); + [[maybe_unused]] const auto chrIdx = _fs->read(); [[maybe_unused]] const auto foundUnit = readMatrixUnit(); [[maybe_unused]] const auto foundResolution = _fs->read_as_unsigned(); + [[maybe_unused]] const auto position = _fs->read(); + [[maybe_unused]] const auto nBytes = _fs->read(); + } - [[maybe_unused]] const auto nValues = readNValues(); + std::vector methods_{methods.size()}; + std::copy(methods.begin(), methods.end(), methods_.begin()); + std::sort(methods_.begin(), methods_.end(), + [&](const auto &m1, const auto &m2) { return m1.to_string() < 
m2.to_string(); }); + return methods_; +} - discardExpectedVector(nValues); - discardNormalizationFactors(1); +inline std::vector HiCFileReader::list_avail_normalizations_v9() { + if (_header->normVectorIndexPosition <= 0) { + return {}; + } + phmap::flat_hash_set methods{}; + _fs->seekg(_header->normVectorIndexPosition); + const auto nNormVectors = _fs->read(); + for (std::int32_t i = 0; i < nNormVectors; i++) { + const auto foundNorm = readNormalizationMethod(); + methods.emplace(foundNorm); + [[maybe_unused]] const auto chrIdx = _fs->read(); + [[maybe_unused]] const auto foundUnit = readMatrixUnit(); + [[maybe_unused]] const auto foundResolution = _fs->read_as_unsigned(); + [[maybe_unused]] const auto position = _fs->read(); + [[maybe_unused]] const auto nBytes = _fs->read(); } std::vector methods_{methods.size()}; diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp new file mode 100644 index 00000000..3be6f5ce --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/file_writer_data_structures_impl.hpp @@ -0,0 +1,821 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/fmt/pixel.hpp" +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +inline std::string MatrixMetadata::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(chr1Idx); + buffer.write(chr2Idx); + buffer.write(nResolutions); + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing a MatrixMetadata object: " + + std::string{e.what()}); + } + return buffer.get(); +} + +inline std::string MatrixBlockMetadata::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(blockNumber); + buffer.write(blockPosition); + buffer.write(blockSizeBytes); + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing a MatrixBlockMetadata object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline bool MatrixBlockMetadata::operator<(const MatrixBlockMetadata &other) const noexcept { + return blockNumber < other.blockNumber; +} + +inline bool MatrixResolutionMetadata::operator<( + const MatrixResolutionMetadata &other) const noexcept { + if (unit != other.unit) { + return unit < other.unit; + } + return binSize < other.binSize; +} + +inline std::string MatrixResolutionMetadata::serialize(BinaryBuffer &buffer, bool clear) const { + assert(!_block_metadata.empty()); + + if (clear) { + buffer.clear(); + } + + try { + buffer.write(unit); + buffer.write(resIdx); + buffer.write(sumCounts); + buffer.write(occupiedCellCount); + buffer.write(percent5); + buffer.write(percent95); + buffer.write(binSize); + buffer.write(blockSize); + buffer.write(blockColumnCount); + buffer.write(blockCount); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a MatrixResolutionMetadata object: " + + std::string{e.what()}); + } + + for (const auto &blk : _block_metadata) { + std::ignore = blk.serialize(buffer, false); + } + + return buffer.get(); +} + +template +inline void MatrixResolutionMetadata::set_block_metadata(It first_block, It last_block) { + 
_block_metadata.clear(); + std::copy(first_block, last_block, std::back_inserter(_block_metadata)); + blockCount = static_cast(_block_metadata.size()); +} + +inline std::string MatrixBodyMetadata::serialize(BinaryBuffer &buffer, bool clear) const { + try { + std::ignore = matrixMetadata.serialize(buffer, clear); + for (const auto &metadata : resolutionMetadata) { + std::ignore = metadata.serialize(buffer, false); + } + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing a MatrixBodyMetadata object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +template +inline bool MatrixInteractionBlock::Pixel::operator<(const Pixel &other) const noexcept { + return column < other.column; +} + +template +inline std::size_t MatrixInteractionBlock::size() const noexcept { + return static_cast(nRecords); +} + +template +inline double MatrixInteractionBlock::sum() const noexcept { + return _sum; +} + +template +inline void MatrixInteractionBlock::emplace_back(hictk::Pixel &&p, + std::uint32_t bin_id_offset) { + try { + _sum += conditional_static_cast(p.count); + + assert(p.coords.bin1.rel_id() >= bin_id_offset); + assert(p.coords.bin2.rel_id() >= bin_id_offset); + const auto row = static_cast(p.coords.bin2.rel_id() - bin_id_offset); + const auto col = static_cast(p.coords.bin1.rel_id() - bin_id_offset); + + _min_col = std::min(col, _min_col); + _max_col = std::max(col, _max_col); + + binRowOffset = std::min(binRowOffset, row); + binColumnOffset = std::min(binColumnOffset, col); + + auto match1 = _interactions.find(row); + if (match1 != _interactions.end()) { + auto &pixels = match1->second; + auto [it, inserted] = pixels.emplace(Pixel{col, p.count}); + nRecords += inserted; + if (!inserted) { + it->count += p.count; + } + } else { + nRecords++; + _interactions.emplace(row, Row{Pixel{col, p.count}}); + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while adding pixel {} to a MatrixInteractionBlock object: {}"), + p, e.what())); + } +} + +template +inline void MatrixInteractionBlock::finalize() { + try { + const auto size_lor = compute_size_lor_repr(); + const auto size_dense = compute_size_dense_repr(); + const auto width = compute_dense_width(); + + const auto use_lor = + (size_lor < size_dense) || (width > std::numeric_limits::max()); + + useFloatContact = 1; + useIntXPos = 1; + useIntYPos = 1; + matrixRepresentation = use_lor ? 
1 : 2; + + // this can overflow, but it's ok because in this case use_lor=true + w = static_cast(width); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while finalizing a MatrixInteractionBlock object: " + + std::string{e.what()}); + } +} + +template +inline auto MatrixInteractionBlock::operator()() const noexcept + -> const phmap::btree_map & { + return _interactions; +} + +template +inline std::string MatrixInteractionBlock::serialize(BinaryBuffer &buffer, + libdeflate_compressor &compressor, + std::string &compression_buffer, + bool clear) const { + if (matrixRepresentation == 1) { + return serialize_lor(buffer, compressor, compression_buffer, clear); + } + return serialize_dense(buffer, compressor, compression_buffer, clear); +} + +template +inline std::size_t MatrixInteractionBlock::compute_size_lor_repr() const noexcept { + std::size_t size_ = sizeof(nRecords) + sizeof(binColumnOffset) + sizeof(binRowOffset) + + sizeof(useFloatContact) + sizeof(useIntXPos) + sizeof(useIntYPos) + + sizeof(matrixRepresentation); + + // compute space taken up by rows + size_ += (_interactions.size() * sizeof(std::int32_t)) + sizeof(std::int32_t); + + // compute space taken up by columns + size_ += size() * (sizeof(std::int32_t) + sizeof(N)); + + return size_; +} + +template +inline std::size_t MatrixInteractionBlock::compute_size_dense_repr() const noexcept { + const auto width = compute_dense_width(); + const auto npixels = width * width; + + const std::size_t size_ = sizeof(nRecords) + sizeof(binColumnOffset) + sizeof(binRowOffset) + + sizeof(useFloatContact) + sizeof(useIntXPos) + sizeof(useIntYPos) + + sizeof(matrixRepresentation); + return size_ + (sizeof(std::int32_t) + sizeof(std::int16_t)) + (npixels * sizeof(N)); +} + +template +inline std::size_t MatrixInteractionBlock::compute_dense_width() const noexcept { + const auto min_row = _interactions.begin()->first; + const auto max_row = (--_interactions.end())->first; + const auto height = max_row - min_row; + + const auto width = _max_col - _min_col; + + return static_cast(std::max(height, width) + 1); +} + +template +inline std::string MatrixInteractionBlock::serialize_lor(BinaryBuffer &buffer, + libdeflate_compressor &compressor, + std::string &compression_buffer, + bool clear) const { + assert(matrixRepresentation == 1); + // TODO support representation using shorts + + if (clear) { + buffer.clear(); + } + + try { + buffer.write(nRecords); + buffer.write(binColumnOffset); + buffer.write(binRowOffset); + buffer.write(useFloatContact); + buffer.write(useIntXPos); + buffer.write(useIntYPos); + buffer.write(matrixRepresentation); + + const auto rowCount = static_cast(_interactions.size()); // TODO support short + buffer.write(rowCount); + + for (const auto &[row, pixels] : _interactions) { + assert(static_cast(row) >= binRowOffset); + const auto rowNumber = static_cast(row) - binRowOffset; // TODO support short + const auto recordCount = static_cast(pixels.size()); // TODO support short + buffer.write(rowNumber); + buffer.write(recordCount); + + assert(std::is_sorted(pixels.begin(), pixels.end())); + for (const auto &[col, count] : pixels) { + assert(col >= binColumnOffset); + const auto binColumn = col - binColumnOffset; + buffer.write(binColumn); + buffer.write(count); + } + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a MatrixInteractionBlock using the sparse " + "representation: " + + std::string{e.what()}); + } + + try { + 
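+    // zlib-compress the serialized payload via libdeflate; compress() keeps doubling
+    // compression_buffer until the compressed block fits.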
compress(buffer.get(), compression_buffer, compressor); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while compressing a serialized object of MatrixInteractionBlock type " + "(sparse representation): " + + std::string{e.what()}); + } + return compression_buffer; +} + +template +inline std::string MatrixInteractionBlock::serialize_dense(BinaryBuffer &buffer, + libdeflate_compressor &compressor, + std::string &compression_buffer, + bool clear) const { + assert(matrixRepresentation == 2); + // TODO support representation using shorts + + if (clear) { + buffer.clear(); + } + + try { + const N fill_value = -32768; + std::vector counts(static_cast(w) * static_cast(w), fill_value); + + for (const auto &[row, pixels] : _interactions) { + assert(row >= binRowOffset); + const auto i = static_cast(row - binRowOffset); + for (const auto &[col, value] : pixels) { + const auto j = static_cast(col - binColumnOffset); + const auto idx = (i * static_cast(w)) + j; + assert(idx < counts.size()); + counts[idx] = value; + } + } + + if constexpr (std::is_floating_point_v) { + std::transform(counts.begin(), counts.end(), counts.begin(), [&](const auto n) { + return n == fill_value ? std::numeric_limits::quiet_NaN() : n; + }); + } + + buffer.write(nRecords); + buffer.write(binColumnOffset); + buffer.write(binRowOffset); + buffer.write(useFloatContact); + buffer.write(useIntXPos); + buffer.write(useIntYPos); + buffer.write(matrixRepresentation); + + buffer.write(static_cast(counts.size())); + buffer.write(w); + buffer.write(counts); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a MatrixInteractionBlock using the dense " + "representation: " + + std::string{e.what()}); + } + + try { + compress(buffer.get(), compression_buffer, compressor); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while compressing a serialized object of MatrixInteractionBlock type " + "(dense representation): " + + std::string{e.what()}); + } + + return compression_buffer; +} + +template +inline void MatrixInteractionBlock::compress(const std::string &buffer_in, + std::string &buffer_out, + libdeflate_compressor &compressor) { + assert(buffer_out.capacity() != 0); + buffer_out.resize(buffer_out.capacity()); + while (true) { + const auto compressed_size = libdeflate_zlib_compress( + &compressor, buffer_in.data(), buffer_in.size(), buffer_out.data(), buffer_out.size()); + if (compressed_size != 0) { + buffer_out.resize(compressed_size); + break; + } + + buffer_out.resize(buffer_out.size() * 2); + } +} + +inline std::string FooterMasterIndex::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(key); + buffer.write(position); + buffer.write(size); + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing a FooterMasterIndex object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline std::int64_t ExpectedValuesBlock::nValues() const noexcept { + return static_cast(value.size()); +} + +inline std::int32_t ExpectedValuesBlock::nChrScaleFactors() const noexcept { + assert(chrIndex.size() == chrScaleFactor.size()); + return static_cast(chrIndex.size()); +} + +inline ExpectedValuesBlock::ExpectedValuesBlock(std::string_view unit_, std::uint32_t bin_size, + const std::vector &weights, + const std::vector &chrom_ids, + const std::vector &scale_factors) + : unit(std::string{unit_}), + 
binSize(static_cast(bin_size)), + value(weights.size()), + chrIndex(chrom_ids.size()), + chrScaleFactor(chrom_ids.size()) { + std::transform(weights.begin(), weights.end(), value.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(chrom_ids.begin(), chrom_ids.end(), chrIndex.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(scale_factors.begin(), scale_factors.end(), chrScaleFactor.begin(), + [](const auto n) { return static_cast(n); }); +} + +inline bool ExpectedValuesBlock::operator<(const ExpectedValuesBlock &other) const noexcept { + if (unit != other.unit) { + return unit < other.unit; + } + + return binSize < other.binSize; +} + +inline std::string ExpectedValuesBlock::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(unit); + buffer.write(binSize); + buffer.write(nValues()); + buffer.write(value); + buffer.write(nChrScaleFactors()); + + assert(chrIndex.size() == chrScaleFactor.size()); + for (std::size_t i = 0; i < chrIndex.size(); ++i) { + buffer.write(chrIndex[i]); + buffer.write(chrScaleFactor[i]); + } + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing an ExpectedValuesBlock object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline ExpectedValuesBlock ExpectedValuesBlock::deserialize(filestream::FileStream &fs) { + ExpectedValuesBlock evb{}; + + try { + evb.unit = fs.getline('\0'); + fs.read(evb.binSize); + const auto nValues = static_cast(fs.read()); + evb.value.resize(nValues); + fs.read(evb.value); + const auto nChrScaleFactors = static_cast(fs.read()); + evb.chrIndex.resize(nChrScaleFactors); + evb.chrScaleFactor.resize(nChrScaleFactors); + + for (std::size_t i = 0; i < nChrScaleFactors; ++i) { + evb.chrIndex.emplace_back(fs.read()); + evb.chrScaleFactor.emplace_back(fs.read()); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while deserializing an ExpectedValuesBlock object: " + + std::string{e.what()}); + } + + return evb; +} + +inline std::int32_t ExpectedValues::nExpectedValueVectors() const noexcept { + return static_cast(expectedValues().size()); +} + +inline const phmap::btree_set &ExpectedValues::expectedValues() + const noexcept { + return _expected_values; +} + +inline void ExpectedValues::emplace(const ExpectedValuesBlock &evb, bool force_overwrite) { + auto [it, inserted] = _expected_values.emplace(evb); + if (!inserted) { + if (force_overwrite) { + *it = evb; + } else { + throw std::runtime_error( + fmt::format(FMT_STRING("ExpectedValues already contains vector for {} resolution ({})"), + it->binSize, it->unit)); + } + } +} + +inline std::string ExpectedValues::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(nExpectedValueVectors()); + + if (nExpectedValueVectors() == 0) { + return buffer.get(); + } + + for (const auto &ev : expectedValues()) { + std::ignore = ev.serialize(buffer, false); + } + } catch (const std::exception &e) { + throw std::runtime_error("an error occurred while serializing an ExpectedValues object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline ExpectedValues ExpectedValues::deserialize(filestream::FileStream &fs) { + ExpectedValues evs{}; + + try { + const auto nExpectedValueVectors = static_cast(fs.read()); + for (std::size_t i = 0; i < nExpectedValueVectors; ++i) { + evs.emplace(ExpectedValuesBlock::deserialize(fs), true); + } + 
} catch (const std::exception &e) { + throw std::runtime_error("an error occurred while deserializing an ExpectedValues object: " + + std::string{e.what()}); + } + return evs; +} + +inline std::int64_t NormalizedExpectedValuesBlock::nValues() const noexcept { + return static_cast(value.size()); +} + +inline std::int32_t NormalizedExpectedValuesBlock::nChrScaleFactors() const noexcept { + assert(chrIndex.size() == chrScaleFactor.size()); + return static_cast(chrIndex.size()); +} + +inline NormalizedExpectedValuesBlock::NormalizedExpectedValuesBlock( + std::string_view type_, std::string_view unit_, std::uint32_t bin_size, + const std::vector &weights, const std::vector &chrom_ids, + const std::vector &scale_factors) + : type(std::string{type_}), + unit(std::string{unit_}), + binSize(static_cast(bin_size)), + value(weights.size()), + chrIndex(chrom_ids.size()), + chrScaleFactor(chrom_ids.size()) { + std::transform(weights.begin(), weights.end(), value.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(chrom_ids.begin(), chrom_ids.end(), chrIndex.begin(), + [](const auto n) { return static_cast(n); }); + std::transform(scale_factors.begin(), scale_factors.end(), chrScaleFactor.begin(), + [](const auto n) { return static_cast(n); }); +} + +inline bool NormalizedExpectedValuesBlock::operator<( + const NormalizedExpectedValuesBlock &other) const noexcept { + if (type != other.type) { + return type < other.type; + } + if (unit != other.unit) { + return unit < other.unit; + } + return binSize < other.binSize; +} + +inline std::string NormalizedExpectedValuesBlock::serialize(BinaryBuffer &buffer, + bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(type); + buffer.write(unit); + buffer.write(binSize); + buffer.write(nValues()); + buffer.write(value); + buffer.write(nChrScaleFactors()); + + assert(chrIndex.size() == chrScaleFactor.size()); + for (std::size_t i = 0; i < chrIndex.size(); ++i) { + buffer.write(chrIndex[i]); + buffer.write(chrScaleFactor[i]); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a NormalizedExpectedValuesBlock object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline NormalizedExpectedValuesBlock NormalizedExpectedValuesBlock::deserialize( + filestream::FileStream &fs) { + NormalizedExpectedValuesBlock nevb{}; + + try { + nevb.type = fs.getline('\0'); + nevb.unit = fs.getline('\0'); + fs.read(nevb.binSize); + const auto nValues = static_cast(fs.read()); + nevb.value.resize(nValues); + fs.read(nevb.value); + const auto nChrScaleFactors = static_cast(fs.read()); + nevb.chrIndex.resize(nChrScaleFactors); + nevb.chrScaleFactor.resize(nChrScaleFactors); + + for (std::size_t i = 0; i < nChrScaleFactors; ++i) { + nevb.chrIndex.emplace_back(fs.read()); + nevb.chrScaleFactor.emplace_back(fs.read()); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while deserializing a NormalizedExpectedValuesBlock object: " + + std::string{e.what()}); + } + + return nevb; +} + +inline std::int32_t NormalizedExpectedValues::nNormExpectedValueVectors() const noexcept { + return static_cast(_normalized_expected_values.size()); +} + +inline const phmap::btree_set & +NormalizedExpectedValues::normExpectedValues() const noexcept { + return _normalized_expected_values; +} + +inline void NormalizedExpectedValues::emplace(const NormalizedExpectedValuesBlock &evb, + bool force_overwrite) { + auto [it, inserted] = 
_normalized_expected_values.emplace(evb); + if (!inserted) { + if (force_overwrite) { + *it = evb; + } else { + throw std::runtime_error(fmt::format( + FMT_STRING("NormalizedExpectedValues already contains {} vector for {} resolution ({})"), + it->type, it->binSize, it->unit)); + } + } +} + +inline std::string NormalizedExpectedValues::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(nNormExpectedValueVectors()); + for (const auto &nev : _normalized_expected_values) { + std::ignore = nev.serialize(buffer, false); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a NormalizedExpectedValues object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline NormalizedExpectedValues NormalizedExpectedValues::deserialize(filestream::FileStream &fs) { + NormalizedExpectedValues nevs{}; + + try { + const auto nNormExpectedValueVectors = static_cast(fs.read()); + for (std::size_t i = 0; i < nNormExpectedValueVectors; ++i) { + nevs.emplace(NormalizedExpectedValuesBlock::deserialize(fs), true); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while deserializing a NormalizedExpectedValues object: " + + std::string{e.what()}); + } + + return nevs; +} + +inline NormalizationVectorIndexBlock::NormalizationVectorIndexBlock( + std::string type_, std::uint32_t chrom_idx, std::string unit_, std::uint32_t bin_size, + std::size_t position_, std::size_t n_bytes) + : type(std::move(type_)), + chrIdx(static_cast(chrom_idx)), + unit(std::move(unit_)), + binSize(static_cast(bin_size)), + position(static_cast(position_)), + nBytes(static_cast(n_bytes)) {} + +inline bool NormalizationVectorIndexBlock::operator<( + const NormalizationVectorIndexBlock &other) const noexcept { + if (type != other.type) { + return type < other.type; + } + if (chrIdx != other.chrIdx) { + return chrIdx < other.chrIdx; + } + if (unit != other.unit) { + return unit < other.unit; + } + return binSize < other.binSize; +} + +inline std::string NormalizationVectorIndexBlock::serialize(BinaryBuffer &buffer, + bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(type); + buffer.write(chrIdx); + buffer.write(unit); + buffer.write(binSize); + buffer.write(position); + buffer.write(nBytes); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a NormalizationVectorIndexBlock object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline NormalizationVectorIndexBlock NormalizationVectorIndexBlock::deserialize( + filestream::FileStream &fs) { + NormalizationVectorIndexBlock nvib{}; + + try { + nvib.type = fs.getline('\0'); + nvib.chrIdx = fs.read(); + nvib.unit = fs.getline('\0'); + nvib.binSize = fs.read(); + nvib.position = fs.read(); + nvib.nBytes = fs.read(); + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while deserializing a NormalizationVectorIndexBlock object: " + + std::string{e.what()}); + } + + return nvib; +} + +inline std::int32_t NormalizationVectorIndex::nNormVectors() const noexcept { + return static_cast(_norm_vect_idx.size()); +} + +inline const std::vector +NormalizationVectorIndex::normalizationVectorIndex() const noexcept { + return _norm_vect_idx; +} + +inline void NormalizationVectorIndex::emplace_back(NormalizationVectorIndexBlock blk) { + _norm_vect_idx.emplace_back(std::move(blk)); +} + +inline std::string 
NormalizationVectorIndex::serialize(BinaryBuffer &buffer, bool clear) const { + if (clear) { + buffer.clear(); + } + + try { + buffer.write(nNormVectors()); + + for (const auto &nv : _norm_vect_idx) { + std::ignore = nv.serialize(buffer, false); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while serializing a NormalizationVectorIndex object: " + + std::string{e.what()}); + } + + return buffer.get(); +} + +inline NormalizationVectorIndex NormalizationVectorIndex::deserialize(filestream::FileStream &fs) { + NormalizationVectorIndex nvi{}; + + try { + const auto nNormVectors = static_cast(fs.read()); + for (std::size_t i = 0; i < nNormVectors; ++i) { + nvi.emplace_back(NormalizationVectorIndexBlock::deserialize(fs)); + } + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while deserializing a NormalizationVectorIndex object: " + + std::string{e.what()}); + } + return nvi; +} + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp new file mode 100644 index 00000000..e5050c45 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp @@ -0,0 +1,1535 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#if __has_include() +#include +#else +#include +#endif +#include +#include +#include +#include +#if __has_include() +#include +#else +#include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/hic.hpp" +#include "hictk/hic/common.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/hic/index.hpp" +#include "hictk/reference.hpp" +#include "hictk/transformers/coarsen.hpp" +#include "hictk/version.hpp" + +namespace hictk::hic::internal { + +template +inline HiCSectionOffsets::HiCSectionOffsets(I1 start_, I2 size_) + : _position(conditional_static_cast(start_)), + _size(conditional_static_cast(size_)) { + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); +} + +inline std::streamoff HiCSectionOffsets::start() const noexcept { return _position; } + +inline std::streamoff HiCSectionOffsets::end() const noexcept { + return _position + static_cast(size()); +} + +inline std::size_t HiCSectionOffsets::size() const noexcept { return _size; } + +inline std::size_t &HiCSectionOffsets::size() noexcept { return _size; } + +inline bool BlockIndexKey::operator<(const BlockIndexKey &other) const noexcept { + if (chrom1 != other.chrom1) { + return chrom1 < other.chrom1; + } + if (chrom2 != other.chrom2) { + return chrom2 < other.chrom2; + } + return resolution < other.resolution; +} + +inline bool MatrixBodyMetadataTank::Key::operator==( + const MatrixBodyMetadataTank::Key &other) const noexcept { + return chrom1 == other.chrom1 && chrom2 == other.chrom2; +} + +inline bool MatrixBodyMetadataTank::contains(const Chromosome &chrom1, + const Chromosome &chrom2) const noexcept { + return _tank.contains(Key{chrom1, chrom2}); +} + +inline auto MatrixBodyMetadataTank::at(const Chromosome &chrom1, const Chromosome &chrom2) const + -> const MatrixBodyMetadata & { + const auto match = _tank.find({chrom1, chrom2}); + if (match != _tank.end()) { + return match->second; + } + throw std::out_of_range( + fmt::format(FMT_STRING("MatrixBodyMetadataTank does not contain 
metadata for {}:{}"), + chrom1.name(), chrom2.name())); +} + +inline HiCSectionOffsets MatrixBodyMetadataTank::offset(const Chromosome &chrom1, + const Chromosome &chrom2) const { + const auto match = _offsets.find(Key{chrom1, chrom2}); + if (match != _offsets.end()) { + return match->second; + } + throw std::out_of_range( + fmt::format(FMT_STRING("MatrixBodyMetadataTank does not contain file offsets for {}:{}"), + chrom1.name(), chrom2.name())); +} + +inline void MatrixBodyMetadataTank::insert(const Chromosome &chrom1, const Chromosome &chrom2, + MatrixMetadata matrix_metadata, + MatrixResolutionMetadata matrix_resolution_metadata) { + try { + auto match = _tank.find(Key{chrom1, chrom2}); + if (match != _tank.end()) { + match->second.matrixMetadata = std::move(matrix_metadata); + match->second.resolutionMetadata.emplace(std::move(matrix_resolution_metadata)); + } else { + _tank.emplace(Key{chrom1, chrom2}, + MatrixBodyMetadata{std::move(matrix_metadata), + phmap::btree_set{ + std::move(matrix_resolution_metadata)}}); + } + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while inserting metadata for {}:{} into a " + "MatrixBodyMetadataTank object: {}"), + chrom1.name(), chrom2.name(), e.what())); + } +} + +inline void MatrixBodyMetadataTank::update_offsets(const Chromosome &chrom1, + const Chromosome &chrom2, + std::streamoff position, std::size_t size) { + try { + auto [it, inserted] = + _offsets.try_emplace(Key{chrom1, chrom2}, HiCSectionOffsets{position, size}); + if (!inserted) { + it->second = HiCSectionOffsets{position, size}; + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while updating MatrixBodyMetadata file offsets for {}:{}: {}"), + chrom1.name(), chrom2.name(), e.what())); + } +} + +inline void MatrixBodyMetadataTank::remove(const Chromosome &chrom1, const Chromosome &chrom2) { + try { + const Key k{chrom1, chrom2}; + _tank.erase(k); + _offsets.erase(k); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while removing the MatrixBodyMetadata entry for {}:{}: {}"), + chrom1.name(), chrom2.name(), e.what())); + } +} + +inline auto MatrixBodyMetadataTank::operator()() const noexcept + -> const phmap::flat_hash_map & { + return _tank; +} + +inline HiCFileWriter::HiCFileWriter(std::string_view path_, std::size_t n_threads) + : _fs(std::string{path_}, std::ios::in | std::ios::out), + _header(read_header(_fs)), + _bin_tables(init_bin_tables(chromosomes(), resolutions())), + _tpool(init_tpool(n_threads)) { + read_offsets(); + read_norm_expected_values(); + read_norm_vectors(); +} + +inline HiCFileWriter::HiCFileWriter(std::string_view path_, Reference chromosomes_, + std::vector resolutions_, + std::string_view assembly_, std::size_t n_threads, + std::size_t chunk_size, const std::filesystem::path &tmpdir, + std::uint32_t compression_lvl, std::size_t buffer_size) + : _fs(filestream::FileStream::create(std::string{path_})), + _tmpdir(tmpdir), + _header(init_header(path_, std::move(chromosomes_), std::move(resolutions_), assembly_)), + _bin_tables(init_bin_tables(chromosomes(), resolutions())), + _block_mappers(init_interaction_block_mappers(_tmpdir, _bin_tables, chunk_size, 3)), + _compression_lvl(compression_lvl), + _compressor(libdeflate_alloc_compressor(static_cast(compression_lvl))), + _compression_buffer(buffer_size, '\0'), + _tpool(init_tpool(n_threads)) { + if 
(!std::filesystem::exists(_tmpdir)) { + throw std::runtime_error( + fmt::format(FMT_STRING("temporary directory {} does not exist"), _tmpdir)); + } +} + +inline std::string_view HiCFileWriter::path() const noexcept { return _header.url; } + +inline const Reference &HiCFileWriter::chromosomes() const noexcept { return _header.chromosomes; } + +inline const BinTable &HiCFileWriter::bins(std::uint32_t resolution) const { + return *_bin_tables.at(resolution); +} + +inline const std::vector &HiCFileWriter::resolutions() const noexcept { + return _header.resolutions; +} + +inline auto HiCFileWriter::stats(std::uint32_t resolution) const noexcept -> Stats { + auto match = _stats.find(resolution); + if (match != _stats.end()) { + return match->second; + } + return {}; +} + +inline void HiCFileWriter::serialize() { + try { + write_header(); + write_pixels(); + finalize(true); + for (auto &[_, mapper] : _block_mappers) { + mapper.clear(); + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing file \"{}\": {}"), path(), e.what())); + } +} + +inline void HiCFileWriter::write_header() { + assert(_fs.tellp() == 0); + + assert(_header.version == 9); + assert(!chromosomes().empty()); + + try { + const auto offset1 = _fs.tellp(); + SPDLOG_INFO(FMT_STRING("writing header at offset {}"), offset1); + _fs.write(_header.serialize(_bbuffer)); + const auto offset2 = _fs.tellp(); + + _header_section = {offset1, offset2 - offset1}; + _data_block_section = {offset2, 0}; + _body_metadata_section = {offset2, 0}; + _footer_section = {offset2, 0}; + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the .hic header for file \"{}\" to disk: {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::write_footer_size() { + SPDLOG_DEBUG(FMT_STRING("updating footer size to {}"), _footer_section.size()); + // This is not documented for v9, but nBytesV5 is not included in the footer size + const auto nBytesV5 = static_cast(_footer_section.size()) - + static_cast(sizeof(std::int64_t)); + + try { + _fs.seekp(_footer_section.start()); + _fs.write(nBytesV5); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the footer size for file \"{}\" to disk: {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::write_footer_offset() { + SPDLOG_DEBUG(FMT_STRING("updating footer offset to {}"), _footer_section.start()); + const auto offset = sizeof("HIC") + sizeof(_header.version); + + try { + _fs.seekp(offset); + _fs.write(conditional_static_cast(_footer_section.start())); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the footer offset for file \"{}\" to disk: {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::write_norm_vector_index() { + const auto offset = + static_cast(sizeof("HIC") + sizeof(_header.version) + + sizeof(_header.footerPosition) + _header.genomeID.size() + 1); + const auto normVectorIndexPosition = + conditional_static_cast(_norm_vector_index_section.start()); + const auto normVectorIndexLength = static_cast(_norm_vector_index_section.size()); + + SPDLOG_DEBUG(FMT_STRING("writing normVectorIndex {}:{} at offset {}..."), normVectorIndexPosition, + normVectorIndexLength, offset); + + try { + _fs.seekp(offset); + _fs.write(normVectorIndexPosition); + _fs.write(normVectorIndexLength); + } catch (const 
std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while writing the normVectorIndex position and " + "length for file \"{}\" to disk: {}"), + path(), e.what())); + } +} + +template +inline void HiCFileWriter::add_pixels(std::uint32_t resolution, PixelIt first_pixel, + PixelIt last_pixel) { + try { + _block_mappers.at(resolution).append_pixels(first_pixel, last_pixel, _tpool); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while adding pixels for resolution {} to file \"{}\": {}"), + resolution, path(), e.what())); + } +} + +inline void HiCFileWriter::write_pixels() { + SPDLOG_INFO(FMT_STRING("begin writing interaction blocks to file \"{}\"..."), path()); + const auto &chrom_idx = _block_mappers.at(resolutions().front()).chromosome_index(); + std::vector> chroms{chrom_idx.size()}; + std::transform(chrom_idx.begin(), chrom_idx.end(), chroms.begin(), + [](const auto &kv) { return kv.first; }); + std::sort(chroms.begin(), chroms.end()); + + for (const auto &[chrom1, chrom2] : chroms) { + if (chrom1.is_all() || chrom2.is_all()) { + continue; + } + write_pixels(chrom1, chrom2); + } + write_all_matrix(); +} + +inline void HiCFileWriter::write_all_matrix(std::uint32_t target_num_bins) { + try { + std::uint64_t genome_size = 0; + for (const auto &chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + genome_size += chrom.size(); + } + + auto base_resolution = resolutions().front(); + auto target_resolution = + static_cast((genome_size + target_num_bins - 1) / target_num_bins); + auto factor = std::max(std::uint32_t(1), target_resolution / base_resolution); + target_resolution = factor * base_resolution; + const auto target_resolution_scaled = target_resolution / DEFAULT_CHROM_ALL_SCALE_FACTOR; + + for (const auto &res : resolutions()) { + if (res > target_resolution) { + break; + } + + if (target_resolution % res == 0) { + base_resolution = res; + } + } + + factor = target_resolution / base_resolution; + + SPDLOG_INFO(FMT_STRING("writing pixels for {}:{} matrix..."), chromosomes().at(0).name(), + chromosomes().at(0).name()); + + std::uint32_t genome_size_scaled = 0; + for (const auto &chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + const auto num_bins = (chrom.size() + target_resolution - 1) / target_resolution; + genome_size_scaled += static_cast(num_bins) * target_resolution_scaled; + } + + const auto bin_table_ALL = std::make_shared( + Reference{Chromosome{0, "__ALL__", genome_size_scaled}}, target_resolution_scaled); + const auto chrom = bin_table_ALL->chromosomes().at(0); + + const auto num_bins = + HiCInteractionToBlockMapper::compute_num_bins(chrom, chrom, target_resolution_scaled); + const auto num_columns = HiCInteractionToBlockMapper::compute_block_column_count( + chrom, chrom, target_resolution_scaled, HiCInteractionToBlockMapper::DEFAULT_INTER_CUTOFF); + const auto num_rows = num_bins / num_columns + 1; + + HiCInteractionToBlockMapper::BlockMapperIntra mapper{num_rows, num_columns}; + + const File f(std::string{path()}, base_resolution); + auto sel = f.fetch(); + phmap::btree_map> blocks{}; + + std::for_each(sel.begin(), sel.end(), [&](const ThinPixel &p) { + const Pixel pixel(*_bin_tables.at(base_resolution), p); + // The result of this coarsening is not correct, as the last bin in a chromosome will + // have the same ID as the first bin in the next chromosome, but this is what JuiceBox + // expects. 
+ // We subtract the chromosome ID as JuiceBox's chromosome grid expects pixels boundaries to be + // multiples of the bin size. This turns out to be correct as long as chromosome sizes are not + // multiples of the bin size (which should happen extremely rarely), in which case the result + // is off by one. + Pixel coarsened_pixel( + *bin_table_ALL, (p.bin1_id - (pixel.coords.bin1.chrom().id() - 1)) / factor, + (p.bin2_id - (pixel.coords.bin2.chrom().id() - 1)) / factor, p.count); + + const auto bid = + mapper(coarsened_pixel.coords.bin1.rel_id(), coarsened_pixel.coords.bin2.rel_id()); + auto [it, inserted] = blocks.try_emplace(bid, MatrixInteractionBlock{}); + it->second.emplace_back(std::move(coarsened_pixel)); + }); + + const auto offset = _data_block_section.end(); + _fs.seekp(offset); + + for (auto &[bid, blk] : blocks) { + blk.finalize(); + write_interaction_block(bid, chrom, chrom, target_resolution_scaled, blk); + } + _data_block_section.size() += _fs.tellp() - static_cast(offset); + + add_body_metadata(target_resolution_scaled, chrom, chrom); + write_body_metadata(); + add_footer(chrom, chrom); + write_footers(); + + finalize(); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the All:All matrix to file \"{}\": {}"), path(), + e.what())); + } +} + +inline auto HiCFileWriter::write_pixels(const Chromosome &chrom1, const Chromosome &chrom2) + -> HiCSectionOffsets { + try { + write_pixels(chrom1, chrom2, resolutions().front()); + add_body_metadata(resolutions().front(), chrom1, chrom2); + write_body_metadata(); + add_footer(chrom1, chrom2); + write_footers(); + + finalize(); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while writing the {}:{} matrix at {} resolution to file \"{}\": {}"), + chrom1.name(), chrom2.name(), resolutions().front(), path(), e.what())); + } + + for (std::size_t i = 1; i < resolutions().size(); ++i) { + auto base_resolution = resolutions().front(); + const auto res = resolutions()[i]; + + auto &mapper = _block_mappers.at(res); + if (mapper.empty(chrom1, chrom2)) { + try { + for (std::size_t j = 0; j < i; ++j) { + if (res % resolutions()[j] == 0) { + base_resolution = resolutions()[j]; + } + } + const File f(std::string{path()}, base_resolution); + const auto sel = f.fetch(chrom1.name(), chrom2.name()); + if (!sel.empty()) { + SPDLOG_INFO( + FMT_STRING("[{} bp] no pixels provided for {}:{} matrix: generating pixels by " + "coarsening resolution {}..."), + res, chrom1.name(), chrom2.name(), base_resolution); + const auto factor = res / base_resolution; + const transformers::CoarsenPixels coarsener( + sel.begin(), sel.end(), + std::make_shared(bins(base_resolution)), factor); + + mapper.append_pixels(coarsener.begin(), coarsener.end(), _tpool); + } + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while coarsening interactions for {}:{} from " + "resolution {} to resolution {}: {}"), + chrom1.name(), chrom2.name(), base_resolution, res, e.what())); + } + } + + if (mapper.empty(chrom1, chrom2)) { + SPDLOG_WARN(FMT_STRING("[{} bp] no pixels found for {}:{} matrix: SKIPPING!"), res, + chrom1.name(), chrom2.name()); + continue; + } + + try { + mapper.finalize(); + write_pixels(chrom1, chrom2, res); + for (std::size_t j = 0; j <= i; ++j) { + add_body_metadata(resolutions()[j], chrom1, chrom2); + } + write_body_metadata(); + add_footer(chrom1, chrom2); + 
write_footers(); + finalize(); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format(FMT_STRING("an error occurred while writing the {}:{} " + "matrix at {} resolution to file \"{}\": {}"), + chrom1.name(), chrom2.name(), res, path(), e.what())); + } + } + return {_data_block_section.start(), + _fs.tellp() - static_cast(_data_block_section.start())}; +} + +inline void HiCFileWriter::write_body_metadata() { + const auto pos = _data_block_section.end(); + _fs.seekp(pos); + for (const auto &[chroms, metadata] : _matrix_metadata()) { + const auto &chrom1 = chroms.chrom1; + const auto &chrom2 = chroms.chrom2; + [[maybe_unused]] const auto &num_resolutions = metadata.resolutionMetadata.size(); + + try { + const auto pos1 = _fs.tellp(); + SPDLOG_DEBUG(FMT_STRING("writing MatrixBodyMetadata for {}:{} ({} resolutions) at offset {}"), + chrom1.name(), chrom2.name(), num_resolutions, pos1); + _fs.write(metadata.serialize(_bbuffer)); + const auto pos2 = _fs.tellp(); + SPDLOG_DEBUG(FMT_STRING("updating MatrixBodyMetadata offset and size for {}:{} ({} " + "resolutions) to {} and {}"), + chrom1.name(), chrom2.name(), num_resolutions, pos1, pos2 - pos1); + _matrix_metadata.update_offsets(chrom1, chrom2, static_cast(pos1), + pos2 - pos1); + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while writing the MatrixBodyMetadata for {}:{} " + "to file \"{}\": {}"), + chrom1.name(), chrom2.name(), path(), e.what())); + } + } + + const auto size = _fs.tellp() - static_cast(pos); + _body_metadata_section = {pos, size}; +} + +inline void HiCFileWriter::add_body_metadata(std::uint32_t resolution, const Chromosome &chrom1, + const Chromosome &chrom2, const std::string &unit) { + SPDLOG_DEBUG(FMT_STRING("adding MatrixBodyMetadata for {}:{} at {} {}"), chrom1.name(), + chrom2.name(), resolution, unit); + const auto sum_counts = + chrom1.name() == "__ALL__" ? 1.0F : _block_mappers.at(resolution).pixel_sum(chrom1, chrom2); + if (sum_counts == 0) { + return; + } + + try { + auto metadata = _matrix_metadata.contains(chrom1, chrom2) ? 
_matrix_metadata.at(chrom1, chrom2) + : MatrixBodyMetadata{}; + + auto &mm = metadata.matrixMetadata; + MatrixResolutionMetadata mrm{}; + + const auto num_bins = compute_num_bins(chrom1, chrom2, resolution); + const auto num_columns = compute_block_column_count(chrom1, chrom2, resolution); + const auto num_rows = num_bins / num_columns + 1; + + mrm.unit = unit; + mrm.resIdx = static_cast(std::distance( + resolutions().begin(), std::find(resolutions().begin(), resolutions().end(), resolution))); + mrm.sumCounts = sum_counts; + mrm.occupiedCellCount = 0; // not used + mrm.percent5 = 0; // not used + mrm.percent95 = 0; // not used + mrm.binSize = static_cast(resolution); + mrm.blockSize = static_cast(num_rows); + mrm.blockColumnCount = static_cast(num_columns); + + const auto &blks = _block_index.at(BlockIndexKey{chrom1, chrom2, resolution}); + mrm.set_block_metadata(blks.begin(), blks.end()); + + mm.chr1Idx = static_cast(chrom1.id()); + mm.chr2Idx = static_cast(chrom2.id()); + mm.nResolutions = static_cast(metadata.resolutionMetadata.size() + 1); + + _matrix_metadata.insert(chrom1, chrom2, mm, mrm); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while adding or updating the MatrixBodyMetadata for {}:{}: {}"), + chrom1.name(), chrom2.name(), e.what())); + } +} + +inline void HiCFileWriter::write_footers() { + const auto offset1 = _body_metadata_section.end(); + + try { + _fs.seekp(offset1); + SPDLOG_DEBUG(FMT_STRING("initializing footer section at offset {}"), offset1); + const std::int64_t nBytesV5 = -1; + const auto nEntries = static_cast(_footers.size()); + _fs.write(nBytesV5); + _fs.write(nEntries); + + for (auto &[chroms, footer] : _footers) { + try { + const auto offset = _matrix_metadata.offset(chroms.first, chroms.second); + footer.position = conditional_static_cast(offset.start()); + footer.size = static_cast(offset.size()); + SPDLOG_DEBUG(FMT_STRING("writing FooterMasterIndex for {}:{} at offset {}"), + chroms.first.name(), chroms.second.name(), _fs.tellp()); + _fs.write(footer.serialize(_bbuffer)); + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while writing the footer for {}:{}: {}"), + chroms.first.name(), chroms.second.name(), e.what())); + } + } + + write_empty_expected_values(); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the footer section to file \"{}\": {}"), path(), + e.what())); + } + + _footer_section = {offset1, _fs.tellp() - static_cast(offset1)}; +} + +inline void HiCFileWriter::add_footer(const Chromosome &chrom1, const Chromosome &chrom2) { + if (!_matrix_metadata.contains(chrom1, chrom2)) { + return; + } + + try { + FooterMasterIndex footer{}; + footer.key = fmt::format(FMT_STRING("{}_{}"), chrom1.id(), chrom2.id()); + footer.position = -1; + footer.size = -1; + + auto [it, inserted] = _footers.emplace(std::make_pair(chrom1, chrom2), footer); + if (!inserted) { + it->second = std::move(footer); + } + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while adding the footer for {}:{}: {}"), + chrom1.name(), chrom2.name(), e.what())); + } +} + +inline void HiCFileWriter::write_norm_vectors_and_norm_expected_values() { + // we are writing the norm vectors twice because the function computing the norm expected values + // expects the normalization vectors to be available in the file that is being written + 
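+  // In practice, the first pass below publishes the vectors so that File::fetch(norm) can
+  // read them, while the second pass re-writes the vectors (and the normVectorIndex) after
+  // the normalized expected-values section, whose final size is only known at that point.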
write_norm_vectors(); + compute_and_write_normalized_expected_values(); + write_norm_vectors(); +} + +inline void HiCFileWriter::write_empty_expected_values() { + ExpectedValues ev{}; + + try { + const auto offset = _fs.tellp(); + SPDLOG_DEBUG(FMT_STRING("writing empty expected values section at offset {}..."), offset); + _fs.write(ev.serialize(_bbuffer)); + + _expected_values_section = {offset, _fs.tellp() - offset}; + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while writing an empty expected values section to file \"{}\": {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::write_empty_normalized_expected_values() { + const auto offset = _expected_values_section.end(); + SPDLOG_DEBUG(FMT_STRING("writing empty expected values (normalized) section at offset {}..."), + offset); + try { + _fs.seekp(offset); + DISABLE_WARNING_PUSH + DISABLE_WARNING_USELESS_CAST + _fs.write(std::int32_t(0)); + DISABLE_WARNING_POP + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while writing an empty normalized expected " + "values section to file \"{}\": {}"), + path(), e.what())); + } + _expected_values_norm_section = {offset, _fs.tellp() - static_cast(offset)}; +} + +inline ExpectedValuesBlock HiCFileWriter::compute_expected_values(std::uint32_t resolution) { + SPDLOG_DEBUG(FMT_STRING("computing expected values at resolution {}..."), resolution); + + try { + const File f(std::string{path()}, resolution); + const auto sel = f.fetch(); + + ExpectedValuesAggregator aggr(_bin_tables.at(resolution)); + std::for_each(sel.begin(), sel.end(), [&](const auto &p) { aggr.add(p); }); + aggr.compute_density(); + + std::vector weights(aggr.weights().size()); + std::transform(aggr.weights().begin(), aggr.weights().end(), weights.begin(), + [](const auto w) { return static_cast(w); }); + + std::vector chrom_ids{}; + std::vector scaling_factors{}; + std::for_each(aggr.scaling_factors().begin(), aggr.scaling_factors().end(), + [&](const auto &kv) { + chrom_ids.push_back(kv.first.id()); + scaling_factors.push_back(kv.second); + }); + + return {"BP", resolution, aggr.weights(), chrom_ids, scaling_factors}; + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while computing the expected values for file " + "\"{}\" at {} resolution: {}"), + path(), resolution, e.what())); + } +} + +inline NormalizedExpectedValuesBlock HiCFileWriter::compute_normalized_expected_values( + std::uint32_t resolution, const balancing::Method &norm) { + assert(norm != balancing::Method::NONE()); + SPDLOG_INFO(FMT_STRING("computing normalized expected values ({}) at resolution {}..."), norm, + resolution); + + try { + const File f(std::string{path()}, resolution); + const auto sel = f.fetch(norm); + + ExpectedValuesAggregator aggr(_bin_tables.at(resolution)); + std::for_each(sel.begin(), sel.end(), [&](const auto &p) { aggr.add(p); }); + aggr.compute_density(); + + std::vector weights(aggr.weights().size()); + std::transform(aggr.weights().begin(), aggr.weights().end(), weights.begin(), + [](const auto w) { return static_cast(w); }); + + std::vector chrom_ids{}; + std::vector scaling_factors{}; + std::for_each(aggr.scaling_factors().begin(), aggr.scaling_factors().end(), + [&](const auto &kv) { + chrom_ids.push_back(kv.first.id()); + scaling_factors.push_back(kv.second); + }); + + return {norm.to_string(), "BP", resolution, aggr.weights(), chrom_ids, 
scaling_factors}; + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while computing the normalized expected values for file " + "\"{}\" at {} resolution: {}"), + path(), resolution, e.what())); + } +} + +inline void HiCFileWriter::compute_and_write_expected_values() { + assert(_tpool.get_thread_count() != 0); + ExpectedValues ev{}; + + std::vector> results{}; + for (const auto &resolution : resolutions()) { + results.emplace_back( + _tpool.submit([&, res = resolution]() { return compute_expected_values(res); })); + } + + for (auto &res : results) { + ev.emplace(res.get()); + } + + try { + const auto offset = _footer_section.end() - + conditional_static_cast(sizeof(ev.nExpectedValueVectors())); + SPDLOG_INFO(FMT_STRING("writing {} expected value vectors at offset {}..."), + ev.nExpectedValueVectors(), offset); + _fs.seekp(offset); + _fs.write(ev.serialize(_bbuffer)); + + _expected_values_section = {offset, _fs.tellp() - static_cast(offset)}; + _footer_section.size() += _expected_values_section.size() - sizeof(ev.nExpectedValueVectors()); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing expected values to file \"{}\": {}"), path(), + e.what())); + } +} + +inline void HiCFileWriter::compute_and_write_normalized_expected_values() { + assert(_tpool.get_thread_count() != 0); + NormalizedExpectedValues ev{}; + + phmap::btree_map> + results{}; + for (const auto &[blk, _] : _normalization_vectors) { + const NormalizedExpectedValuesBlock key{ + blk.type, blk.unit, static_cast(blk.binSize), {}, {}, {}}; + const auto nev_available = + _normalized_expected_values.find(key) != _normalized_expected_values.end(); + const auto nev_already_submitted_for_computation = results.find(key) != results.end(); + if (!nev_available && !nev_already_submitted_for_computation) { + results.emplace( + key, _tpool.submit([&, res = static_cast(blk.binSize), type = blk.type]() { + const balancing::Method norm{type}; + return compute_normalized_expected_values(res, norm); + })); + } + } + + for (auto &[_, res] : results) { + _normalized_expected_values.emplace(res.get()); + } + + for (const auto &nev : _normalized_expected_values) { + ev.emplace(nev); + } + + try { + const auto offset = _footer_section.end(); + SPDLOG_INFO(FMT_STRING("writing {} normalized expected value vectors at offset {}..."), + ev.nNormExpectedValueVectors(), offset); + _fs.seekp(offset); + _fs.write(ev.serialize(_bbuffer)); + + _expected_values_norm_section = {offset, _fs.tellp() - static_cast(offset)}; + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing normalized expected values to file \"{}\": {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::add_norm_vector(const NormalizationVectorIndexBlock &blk, + const std::vector &weights, + bool force_overwrite) { + const auto &chrom = chromosomes().at(static_cast(blk.chrIdx)); + SPDLOG_INFO(FMT_STRING("[{}] adding {} normalization vector for {} ({}): {} values"), blk.binSize, + blk.type, chrom.name(), blk.unit, weights.size()); + + try { + const auto bin_size = static_cast(blk.binSize); + const auto expected_shape = (chrom.size() + bin_size - 1) / bin_size; + + if (weights.size() != expected_shape) { + throw std::runtime_error( + fmt::format(FMT_STRING("weight shape mismatch: expected {} values, found {}"), + expected_shape, weights.size())); + } + + auto [it, inserted] = 
_normalization_vectors.emplace(blk, weights); + if (!inserted) { + if (force_overwrite) { + it->second = weights; + const NormalizedExpectedValuesBlock key{ + blk.type, blk.unit, static_cast(blk.binSize), {}, {}, {}}; + _normalized_expected_values.erase(key); + } else { + throw std::runtime_error("file already contains normalization vector"); + } + } + + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while adding {} normalization vector for {} at {} resolution: {}"), + blk.type, chrom.name(), blk.binSize, e.what())); + } +} + +inline void HiCFileWriter::add_norm_vector(std::string_view type, const Chromosome &chrom, + std::string_view unit, std::uint32_t bin_size, + const std::vector &weights, bool force_overwrite, + std::size_t position, std::size_t n_bytes) { + add_norm_vector(NormalizationVectorIndexBlock{std::string{type}, chrom.id(), std::string{unit}, + bin_size, position, n_bytes}, + weights, force_overwrite); +} + +inline void HiCFileWriter::add_norm_vector(const NormalizationVectorIndexBlock &blk, + const balancing::Weights &weights, + bool force_overwrite) { + std::vector weights_f(weights().size()); + if (weights.type() == balancing::Weights::Type::MULTIPLICATIVE) { + std::transform(weights().begin(), weights().end(), weights_f.begin(), + [](const double w) { return static_cast(1.0 / w); }); + } else { + std::transform(weights().begin(), weights().end(), weights_f.begin(), + [](const double w) { return static_cast(w); }); + } + add_norm_vector(blk, weights_f, force_overwrite); +} + +inline void HiCFileWriter::add_norm_vector(std::string_view type, const Chromosome &chrom, + std::string_view unit, std::uint32_t bin_size, + const balancing::Weights &weights, bool force_overwrite, + std::size_t position, std::size_t n_bytes) { + add_norm_vector(NormalizationVectorIndexBlock{std::string{type}, chrom.id(), std::string{unit}, + bin_size, position, n_bytes}, + weights, force_overwrite); +} + +inline void HiCFileWriter::add_norm_vector(std::string_view type, std::string_view unit, + std::uint32_t bin_size, + const std::vector &weights, + bool force_overwrite) { + try { + const auto expected_shape = bins(bin_size).size(); + if (weights.size() != expected_shape) { + throw std::runtime_error( + fmt::format(FMT_STRING("weight shape mismatch: expected {} values, found {}"), + expected_shape, weights.size())); + } + + std::ptrdiff_t i0 = 0; + std::ptrdiff_t i1 = 0; + for (const auto &chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + i1 += static_cast((chrom.size() + bin_size - 1) / bin_size); + const std::vector chrom_weights(weights.begin() + i0, weights.begin() + i1); + add_norm_vector(type, chrom, unit, bin_size, chrom_weights, force_overwrite); + i0 = i1; + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format(FMT_STRING("an error occurred while adding {} genome-wide " + "normalization vector at {} resolution: {}"), + type, bin_size, e.what())); + } +} + +inline void HiCFileWriter::finalize(bool compute_expected_values) { + try { + if (compute_expected_values) { + compute_and_write_expected_values(); + write_empty_normalized_expected_values(); + write_norm_vectors(); + compute_and_write_normalized_expected_values(); + } else { + write_empty_expected_values(); + write_empty_normalized_expected_values(); + } + + write_footer_offset(); + write_footer_size(); + write_norm_vectors(); + _fs.flush(); + _fs.seekp(0, std::ios::end); + } catch (const std::exception &e) { + throw 
std::runtime_error(fmt::format( + FMT_STRING("an error occurred while finalizing file \"{}\": {}"), path(), e.what())); + } +} + +inline void HiCFileWriter::write_norm_vectors() { + try { + const auto offset1 = + std::max(_expected_values_norm_section.end(), _norm_vector_index_section.start()); + _fs.seekp(offset1); + + if (_normalization_vectors.empty()) { + SPDLOG_DEBUG(FMT_STRING("writing empty normalization vector section at offset {}..."), + offset1); + } else { + SPDLOG_INFO(FMT_STRING("writing {} normalization vectors at offset {}..."), + _normalization_vectors.size(), offset1); + } + + const auto nNormVectors = static_cast(_normalization_vectors.size()); + _fs.write(nNormVectors); + + phmap::btree_map index_offsets{}; + for (const auto &[blk, _] : _normalization_vectors) { + try { + const auto offset2 = _fs.tellp(); + _fs.write(blk.serialize(_bbuffer)); + index_offsets.emplace(blk, HiCSectionOffsets{offset2, _fs.tellp() - offset2}); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while writing the {} NormalizationVectorIndexBlock for {} " + "at {} resolution to file \"{}\": {}"), + blk.type, chromosomes().at(static_cast(blk.chrIdx)).name(), blk.binSize, + path(), e.what())); + } + } + const auto offset2 = _fs.tellp(); + + phmap::btree_map vector_offsets{}; + for (const auto &[blk, weights] : _normalization_vectors) { + try { + const auto offset3 = _fs.tellp(); + const auto nValues = static_cast(weights.size()); + _fs.write(nValues); + _fs.write(weights); + vector_offsets.emplace(blk, HiCSectionOffsets{offset3, _fs.tellp() - offset3}); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing the {} normalization vector for {} " + "at {} resolution to file \"{}\": {}"), + blk.type, chromosomes().at(static_cast(blk.chrIdx)).name(), blk.binSize, + path(), e.what())); + } + } + + const auto offset4 = _fs.tellp(); + + for (const auto &[blk, idx_offsets] : index_offsets) { + try { + const auto &vect_offsets = vector_offsets.at(blk); + auto new_blk = blk; + new_blk.position = vect_offsets.start(); + new_blk.nBytes = static_cast(vect_offsets.size()); + _fs.seekp(idx_offsets.start()); + _fs.write(new_blk.serialize(_bbuffer)); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while updating file offsets in the {} " + "NormalizationVectorIndexBlock for {} at {} resolution to file \"{}\": {}"), + blk.type, chromosomes().at(static_cast(blk.chrIdx)).name(), blk.binSize, + path(), e.what())); + } + } + + _norm_vector_index_section = {offset1, offset2 - static_cast(offset1)}; + _norm_vectors_section = {offset2, offset4 - static_cast(offset2)}; + + write_norm_vector_index(); + _fs.seekp(0, std::ios::end); + _fs.flush(); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing normalization vectors to file \"{}\": {}"), + path(), e.what())); + } +} + +inline HiCHeader HiCFileWriter::read_header(filestream::FileStream &fs) { + return HiCHeader::deserialize(fs); +} + +inline HiCHeader HiCFileWriter::init_header(std::string_view path, Reference chromosomes, + std::vector resolutions, + std::string_view assembly) { + chromosomes = chromosomes.add_ALL(DEFAULT_CHROM_ALL_SCALE_FACTOR); + return { + std::string{path}, // url + 9, // version + -1, // footerPosition + std::string{assembly}, // genomeId + -1, // normVectorIndexPosition + 0, // 
normVectorIndexLength + std::move(chromosomes), + std::move(resolutions), // resolutions + {{"software", std::string{config::version::str_long()}}} // attributes + }; +} + +inline auto HiCFileWriter::init_bin_tables(const Reference &chromosomes, + const std::vector &resolutions) + -> BinTables { + BinTables bin_tables(resolutions.size()); + for (const auto &res : resolutions) { + bin_tables.emplace(res, std::make_shared(chromosomes, res)); + } + + return bin_tables; +} + +inline auto HiCFileWriter::init_interaction_block_mappers(const std::filesystem::path &root_folder, + const BinTables &bin_tables, + std::size_t chunk_size, + int compression_lvl) -> BlockMappers { + BlockMappers mappers(bin_tables.size()); + for (const auto &[res, bin_table] : bin_tables) { + const auto path = fmt::format(FMT_STRING("{}/{}.bin"), root_folder.string(), res); + mappers.emplace(res, HiCInteractionToBlockMapper{path, bin_table, chunk_size, compression_lvl}); + } + + return mappers; +} + +inline BS::thread_pool HiCFileWriter::init_tpool(std::size_t n_threads) { + return {conditional_static_cast(n_threads < 2 ? std::size_t(1) : n_threads)}; +} + +inline auto HiCFileWriter::write_pixels(const Chromosome &chrom1, const Chromosome &chrom2, + std::uint32_t resolution) -> HiCSectionOffsets { + try { + const auto offset = _data_block_section.end(); + _fs.seekp(offset); + + SPDLOG_INFO(FMT_STRING("[{} bp] writing pixels for {}:{} matrix at offset {}..."), resolution, + chrom1.name(), chrom2.name(), offset); + + const auto stats = write_interaction_blocks(chrom1, chrom2, resolution); + + SPDLOG_INFO(FMT_STRING("[{} bp] written {} pixels for {}:{} matrix"), resolution, stats.nnz, + chrom1.name(), chrom2.name()); + + auto [it, inserted] = _stats.try_emplace(resolution, stats); + if (!inserted) { + it->second.sum += stats.sum; + it->second.nnz += stats.nnz; + } + + _data_block_section.size() += _fs.tellp() - static_cast(offset); + return {offset, _fs.tellp() - static_cast(offset)}; + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while writing pixels for {}:{} to file \"{}\": {}"), + chrom1.name(), chrom2.name(), path(), e.what())); + } +} + +inline auto HiCFileWriter::write_interaction_blocks(const Chromosome &chrom1, + const Chromosome &chrom2, + std::uint32_t resolution) -> Stats { + auto &mapper = _block_mappers.at(resolution); + mapper.finalize(); + + const auto block_ids = mapper.chromosome_index().find(std::make_pair(chrom1, chrom2)); + if (block_ids == mapper.chromosome_index().end()) { + SPDLOG_DEBUG(FMT_STRING("no pixels to write for {}:{} matrix at {} resolution"), chrom1.name(), + chrom2.name(), resolution); + return {}; + } + + if (_tpool.get_thread_count() < 3 || block_ids->second.size() == 1) { + try { + Stats stats{}; + for (const auto &bid : block_ids->second) { + auto blk = mapper.merge_blocks(bid); + stats.sum += blk.sum(); + stats.nnz += blk.size(); + write_interaction_block(bid.bid, chrom1, chrom2, resolution, std::move(blk)); + } + + return stats; + } catch (const std::exception &e) { + throw std::runtime_error( + "an error occurred while writing interaction blocks using a single thread: " + + std::string{e.what()}); + } + } + + try { + std::mutex block_id_queue_mtx{}; + std::queue block_id_queue{}; + moodycamel::BlockingConcurrentQueue block_queue( + block_ids->second.size()); + + std::mutex serialized_block_tank_mtx{}; + phmap::flat_hash_map serialized_block_tank{ + block_ids->second.size()}; + const auto stop_token = 
std::numeric_limits::max(); + std::atomic early_return = false; + + std::mutex mapper_mtx{}; + + std::vector> worker_threads{}; + for (BS::concurrency_t i = 2; i < _tpool.get_thread_count(); ++i) { + worker_threads.emplace_back(_tpool.submit([&]() { + return merge_and_compress_blocks_thr(mapper, mapper_mtx, block_id_queue, block_id_queue_mtx, + block_queue, serialized_block_tank, + serialized_block_tank_mtx, early_return, stop_token); + })); + } + + auto writer = _tpool.submit([&]() { + write_compressed_blocks_thr(chrom1, chrom2, resolution, block_id_queue, block_id_queue_mtx, + serialized_block_tank, serialized_block_tank_mtx, early_return, + stop_token); + }); + + auto producer = _tpool.submit([&]() { + try { + for (const auto &bid : block_ids->second) { + if (early_return) { + break; + } + + block_queue.enqueue(bid); + if (early_return) { + break; + } + } + for (std::size_t i = 0; i < worker_threads.size(); ++i) { + block_queue.enqueue(HiCInteractionToBlockMapper::BlockID{0, 0, stop_token}); + } + } catch (const std::exception &e) { + early_return = true; + throw std::runtime_error("an error occurred in the producer thread: " + + std::string{e.what()}); + } catch (...) { + early_return = true; + throw; + } + }); + + producer.get(); + + Stats stats{}; + for (auto &worker : worker_threads) { + const auto partial_stats = worker.get(); + stats.sum += partial_stats.sum; + stats.nnz += partial_stats.nnz; + } + // signal no more blocks will be enqueued + { + std::scoped_lock lck(block_id_queue_mtx); + block_id_queue.emplace(stop_token); + } + writer.get(); + + return stats; + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("an error occurred while interaction blocks using {} threads: {}"), + _tpool.get_thread_count(), e.what())); + } +} + +inline auto HiCFileWriter::write_interaction_block(std::uint64_t block_id, const Chromosome &chrom1, + const Chromosome &chrom2, + std::uint32_t resolution, + const MatrixInteractionBlock &blk) + -> HiCSectionOffsets { + const auto offset = _fs.tellp(); + + std::ignore = blk.serialize(_bbuffer, *_compressor, _compression_buffer); + SPDLOG_DEBUG(FMT_STRING("writing block #{} for {}:{}:{} at {}:{}"), block_id, chrom1.name(), + chrom2.name(), resolution, offset, _compression_buffer.size()); + _fs.write(_compression_buffer); + + MatrixBlockMetadata mm{static_cast(block_id), static_cast(offset), + static_cast(_fs.tellp() - offset)}; + + const BlockIndexKey key{chrom1, chrom2, resolution}; + auto idx = _block_index.find(key); + if (idx != _block_index.end()) { + idx->second.emplace(std::move(mm)); + } else { + _block_index.emplace(key, phmap::btree_set{std::move(mm)}); + } + return {offset, _fs.tellp() - offset}; +} + +inline std::size_t HiCFileWriter::compute_num_bins(const Chromosome &chrom1, + const Chromosome &chrom2, + std::uint32_t resolution) { + return HiCInteractionToBlockMapper::compute_num_bins(chrom1, chrom2, resolution); +} + +inline void HiCFileWriter::add_norm_expected_values(const NormalizedExpectedValuesBlock &blk, + bool force_overwrite) { + try { + auto [it, inserted] = _normalized_expected_values.emplace(blk); + if (!inserted) { + if (force_overwrite) { + *it = blk; + } else { + throw std::runtime_error("file already contains normalized expected values"); + } + } + + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "an error occurred while adding {} normalized expected values at {} resolution: {}"), + blk.type, blk.binSize, e.what())); + } +} + +inline void 
HiCFileWriter::read_norm_expected_values() { + assert(_expected_values_norm_section.start() != 0); + try { + const auto offset = _expected_values_norm_section.start(); + _fs.seekg(offset); + const auto nev = NormalizedExpectedValues::deserialize(_fs); + + for (const auto &ev : nev.normExpectedValues()) { + add_norm_expected_values(ev); + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format(FMT_STRING("an error occurred while reading normalized " + "expected value vectors from file \"{}\": {}"), + path(), e.what())); + } +} + +inline void HiCFileWriter::read_norm_vectors() { + assert(_norm_vector_index_section.start() != 0); + try { + const auto offset = _norm_vector_index_section.start(); + _fs.seekg(offset); + const auto nvi = NormalizationVectorIndex::deserialize(_fs); + + for (const auto &blk : nvi.normalizationVectorIndex()) { + add_norm_vector(blk, read_norm_vector(blk), true); + } + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while reading normalization vectors from file \"{}\": {}"), + path(), e.what())); + } +} + +inline std::vector HiCFileWriter::read_norm_vector( + const NormalizationVectorIndexBlock &blk) { + try { + const auto offset = blk.position; + _fs.seekg(offset); + + const auto &chrom = chromosomes().at(static_cast(blk.chrIdx)); + const auto bin_size = static_cast(blk.binSize); + const auto nValuesExpected = (static_cast(chrom.size()) + bin_size - 1) / bin_size; + + // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-arrays-1-per-normalization-vector + const auto nValues = static_cast(_fs.read()); + // We cannot use numValues directly because sometimes hic files have few trailing zeros for some + // reason + if (nValues < nValuesExpected) { + throw std::runtime_error( + fmt::format(FMT_STRING("expected {} values, found {}"), nValuesExpected, nValues)); + } + + std::vector buffer(nValues); + _fs.read(buffer); + buffer.resize(nValuesExpected); + const auto bytes_read = _fs.tellg() - static_cast(offset); + if (bytes_read != static_cast(blk.nBytes)) { + throw std::runtime_error( + fmt::format(FMT_STRING("expected to read {} bytes but read {}"), blk.nBytes, bytes_read)); + } + return buffer; + } catch (const std::exception &e) { + throw std::runtime_error( + fmt::format(FMT_STRING("{} normalization vector for {} at {} resolution is corrupted: {}"), + blk.type, _header.chromosomes.at(static_cast(blk.chrIdx)).name(), + blk.binSize, e.what())); + } +} + +inline void HiCFileWriter::read_offsets() { + try { + _fs.seekg(0, std::ios::beg); + const auto header_start = _fs.tellg(); + const auto header = HiCHeader::deserialize(_fs); + const auto header_end = _fs.tellg(); + + // read footer offsets + _fs.seekg(header.footerPosition); + const auto footer_start = _fs.tellg(); + const auto nBytesV5 = _fs.read(); + _fs.seekg(nBytesV5, std::ios::cur); + const auto footer_end = _fs.tellg(); + + // read norm expected values offsets + const auto norm_expected_values_start = _fs.tellg(); + const auto nNormExpectedValueVectors = _fs.read(); + for (std::int32_t i = 0; i < nNormExpectedValueVectors; ++i) { + std::ignore = NormalizationVectorIndexBlock::deserialize(_fs); + } + const auto norm_expected_values_end = _fs.tellg(); + + // compute norm vector index offsets + const auto norm_vector_index_start = header.normVectorIndexPosition; + const auto norm_vector_index_end = + header.normVectorIndexPosition + header.normVectorIndexLength; + + // set the offsets + 
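+  // Section boundaries recovered here: the header starts at offset 0, the footer starts at
+  // header.footerPosition and spans nBytesV5 bytes, the normalized expected value vectors
+  // follow the footer immediately, and the norm vector index position/length come straight
+  // from the header fields read above.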
_header_section = {header_start, header_end - header_start}; + _footer_section = {footer_start, footer_end - footer_start}; + _expected_values_norm_section = {norm_expected_values_start, + norm_expected_values_end - norm_expected_values_start}; + _norm_vector_index_section = {norm_vector_index_start, + norm_vector_index_end - norm_vector_index_start}; + + _fs.seekg(0, std::ios::end); + } catch (const std::exception &e) { + throw std::runtime_error(fmt::format( + FMT_STRING("an error occurred while reading section offsets from file \"{}\": {}"), path(), + e.what())); + } +} + +inline std::size_t HiCFileWriter::compute_block_column_count(const Chromosome &chrom1, + const Chromosome &chrom2, + std::uint32_t resolution) { + return HiCInteractionToBlockMapper::compute_block_column_count( + chrom1, chrom2, resolution, + chrom1 == chrom2 ? HiCInteractionToBlockMapper::DEFAULT_INTRA_CUTOFF + : HiCInteractionToBlockMapper::DEFAULT_INTER_CUTOFF); +} + +inline auto HiCFileWriter::merge_and_compress_blocks_thr( + HiCInteractionToBlockMapper &mapper, std::mutex &mapper_mtx, + std::queue &block_id_queue, std::mutex &block_id_queue_mtx, + moodycamel::BlockingConcurrentQueue &block_queue, + phmap::flat_hash_map &serialized_block_tank, + std::mutex &serialized_block_tank_mtx, std::atomic &early_return, + std::uint64_t stop_token) -> Stats { + SPDLOG_DEBUG(FMT_STRING("merge_and_compress_blocks thread: start-up...")); + try { + HiCInteractionToBlockMapper::BlockID buffer{}; + BinaryBuffer bbuffer{}; + std::string compression_buffer(16'000'000, '\0'); + std::unique_ptr libdeflate_compressor( + libdeflate_alloc_compressor(static_cast(_compression_lvl))); + std::unique_ptr zstd_dctx{ZSTD_createDCtx()}; + + Stats stats{}; + while (!early_return) { + // dequeue block + if (!block_queue.wait_dequeue_timed(buffer, std::chrono::milliseconds(500))) { + continue; + } + if (buffer.bid == stop_token) { + SPDLOG_DEBUG( + FMT_STRING("merge_and_compress_blocks thread: processed all blocks. Returning!")); + return stats; + } + + SPDLOG_DEBUG( + FMT_STRING("merge_and_compress_blocks thread: merging partial blocks for block #{}"), + buffer.bid); + // read and merge partial blocks + auto blk = mapper.merge_blocks(buffer, bbuffer, *zstd_dctx, compression_buffer, mapper_mtx); + stats.nnz += blk.size(); + stats.sum += blk.sum(); + + // compress and serialize block + std::ignore = blk.serialize(bbuffer, *libdeflate_compressor, compression_buffer); + + // enqueue serialized block + std::scoped_lock lck(serialized_block_tank_mtx, block_id_queue_mtx); + SPDLOG_DEBUG(FMT_STRING("merge_and_compress_blocks thread: done processing block #{}"), + buffer.bid); + serialized_block_tank.emplace(std::make_pair(buffer.bid, compression_buffer)); + block_id_queue.emplace(buffer.bid); + } + + return stats; + } catch (const std::exception &e) { + early_return = true; + throw std::runtime_error("an error occurred in merge_and_compress_blocks thread: " + + std::string{e.what()}); + + } catch (...) 
{ + early_return = true; + throw; + } +} + +inline void HiCFileWriter::write_compressed_blocks_thr( + const Chromosome &chrom1, const Chromosome &chrom2, std::uint32_t resolution, + std::queue &block_id_queue, std::mutex &block_id_queue_mtx, + phmap::flat_hash_map &serialized_block_tank, + std::mutex &serialized_block_tank_mtx, std::atomic &early_return, + std::uint64_t stop_token) { + SPDLOG_DEBUG(FMT_STRING("write_compressed_blocks thread: start-up...")); + try { + std::string buffer; + + while (!early_return) { + const auto do_sleep = [&]() { + std::scoped_lock lck(block_id_queue_mtx); + return block_id_queue.empty(); + }(); + + if (do_sleep) { + SPDLOG_DEBUG( + FMT_STRING("write_compressed_blocks thread: no blocks to consume. Sleeping...")); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + continue; + } + + const auto bid = [&]() { + std::scoped_lock lck(block_id_queue_mtx); + const auto n = block_id_queue.front(); + block_id_queue.pop(); + return n; + }(); + + if (bid == stop_token) { + SPDLOG_DEBUG(FMT_STRING( + "write_compressed_blocks thread: no more blocks to be processed. Returning!")); + return; + } + + SPDLOG_DEBUG(FMT_STRING("write_compressed_blocks thread: waiting for block #{}..."), bid); + while (!early_return) { + { + std::scoped_lock lck(serialized_block_tank_mtx); + auto match = serialized_block_tank.find(bid); + if (match != serialized_block_tank.end()) { + buffer = match->second; + serialized_block_tank.erase(match); + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + if (early_return) { + return; + } + + const auto offset = _fs.tellp(); + SPDLOG_DEBUG(FMT_STRING("writing block #{} for {}:{}:{} at {}:{}"), bid, chrom1.name(), + chrom2.name(), resolution, offset, buffer.size()); + _fs.write(buffer); + + MatrixBlockMetadata mm{static_cast(bid), static_cast(offset), + static_cast(buffer.size())}; + const BlockIndexKey key{chrom1, chrom2, resolution}; + auto idx = _block_index.find(key); + if (idx != _block_index.end()) { + idx->second.emplace(std::move(mm)); + } else { + _block_index.emplace(key, phmap::btree_set{std::move(mm)}); + } + } + } catch (const std::exception &e) { + early_return = true; + throw std::runtime_error("an error occurred in write_compressed_blocks thread: " + + std::string{e.what()}); + + } catch (...) 
{ + early_return = true; + throw; + } +} + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_zoomify_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_zoomify_impl.hpp new file mode 100644 index 00000000..8fcb4383 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/file_zoomify_impl.hpp @@ -0,0 +1,77 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "hictk/hic.hpp" +#include "hictk/hic/utils.hpp" +#include "hictk/version.hpp" + +namespace hictk::hic::internal { + +inline HiCFileZoomify::HiCFileZoomify(std::string_view input_hic, std::string_view output_hic, + const std::vector& resolutions, + std::size_t n_threads, std::size_t chunk_size, + const std::filesystem::path& tmpdir, + std::uint32_t compression_lvl) + : _path_to_input_hic(std::string{input_hic}), + _hfw(init_writer(input_hic, output_hic, resolutions, n_threads, chunk_size, tmpdir, + compression_lvl)) { + const auto avail_resolutions = hic::utils::list_resolutions(input_hic); + const auto base_resolution = avail_resolutions.front(); + for (const auto& res : resolutions) { + if (res % base_resolution != 0) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to generate resolution {} from base resolution {}"), res, + base_resolution)); + } + } + init(); +} + +inline void HiCFileZoomify::init() { + const auto avail_resolutions = hic::utils::list_resolutions(_path_to_input_hic); + + // TODO: check if .hic is version 9 + // if it is, copy blocks directly + // if it isn't copy pixels + for (const auto& res : _hfw.resolutions()) { + const auto res_avail = std::find(avail_resolutions.begin(), avail_resolutions.end(), res) != + avail_resolutions.end(); + if (res_avail) { + SPDLOG_INFO(FMT_STRING("[{} bp] ingesting interactions..."), res); + const File hf(_path_to_input_hic, res); + const auto sel = hf.fetch(); + _hfw.add_pixels(res, sel.begin(), sel.end()); + } + } +} + +inline void HiCFileZoomify::zoomify() { _hfw.serialize(); } + +inline HiCFileWriter HiCFileZoomify::init_writer(std::string_view input_hic, + std::string_view output_hic, + const std::vector& resolutions, + std::size_t n_threads, std::size_t chunk_size, + const std::filesystem::path& tmpdir, + std::uint32_t compression_lvl) { + auto resolutions_ = resolutions; + std::sort(resolutions_.begin(), resolutions_.end()); + + const auto avail_resolutions = hic::utils::list_resolutions(input_hic); + const File hf(std::string{input_hic}, avail_resolutions.back()); + + return HiCFileWriter{output_hic, hf.chromosomes(), resolutions_, hf.assembly(), + n_threads, chunk_size, tmpdir, compression_lvl}; +} + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp index 8bb9fb80..c190eaa5 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/filestream_impl.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -22,31 +23,57 @@ namespace hictk::hic::internal::filestream { -inline FileStream::FileStream(std::string path) - : path_(std::move(path)), - handle_(open_file(path_, std::ios::binary | std::ios::ate)), - file_size_(static_cast(handle_.tellg())) { - handle_.seekg(std::ios::beg); +inline FileStream::FileStream(std::string path, std::ios::openmode mode) + : _path(std::move(path)), + 
_ifs(open_file_read(_path, std::ios::in | std::ios::binary | std::ios::ate)), + _ofs(mode & std::ios::out + ? open_file_write(_path, std::ios::in | std::ios::out | std::ios::binary) + : std::ofstream{}), + _file_size(static_cast(_ifs.tellg())) { + _ifs.seekg(0, std::ios::beg); } -inline const std::string &FileStream::path() const noexcept { return path_; } -inline const std::string &FileStream::url() const noexcept { return path(); } +inline FileStream FileStream::create(std::string path) { + if (std::filesystem::exists(path)) { + throw std::runtime_error("file\"" + path + "\" already exists"); + } + FileStream fs{}; + fs._path = std::move(path); + fs._ofs = open_file_write(fs._path, std::ios::trunc | std::ios::binary); + fs._ifs = open_file_read(fs._path, std::ios::binary); + + return fs; +} -inline std::size_t FileStream::size() const { return file_size_; } +inline const std::string &FileStream::path() const noexcept { return _path; } + +inline std::size_t FileStream::size() const { return _file_size; } inline void FileStream::seekg(std::streamoff offset, std::ios::seekdir way) { - const auto new_pos = this->new_pos(offset, way); + const auto new_pos = new_posg(offset, way); if (new_pos < 0 || new_pos >= std::int64_t(size() + 1)) { throw std::runtime_error("caught an attempt of out-of-bound read"); } - handle_.seekg(new_pos, std::ios::beg); + _ifs.seekg(new_pos, std::ios::beg); } inline std::size_t FileStream::tellg() const noexcept { - return static_cast(handle_.tellg()); + assert(_ifs.tellg() >= 0); + return static_cast(_ifs.tellg()); } -inline bool FileStream::eof() const noexcept { return handle_.eof(); } +inline void FileStream::seekp(std::streamoff offset, std::ios::seekdir way) { + _ofs.seekp(new_posp(offset, way), std::ios::beg); +} + +inline std::size_t FileStream::tellp() const noexcept { + assert(_ofs.tellp() >= 0); + return static_cast(_ofs.tellp()); +} + +inline bool FileStream::eof() const noexcept { return _ifs.eof(); } + +inline void FileStream::flush() { _ofs.flush(); } inline void FileStream::read(std::string &buffer, std::size_t count) { buffer.resize(count); @@ -56,37 +83,46 @@ inline void FileStream::read(std::string &buffer, std::size_t count) { } inline void FileStream::read(char *buffer, std::size_t count) { - handle_.read(buffer, std::int64_t(count)); + _ifs.read(buffer, std::int64_t(count)); } -inline void FileStream::append(std::string &buffer, std::size_t count) { +inline void FileStream::read_append(std::string &buffer, std::size_t count) { if (count == 0) { return; } const auto buff_size = buffer.size(); buffer.resize(buffer.size() + count); - handle_.read(&(*buffer.begin()) + buff_size, std::int64_t(count)); + _ifs.read(&(*buffer.begin()) + buff_size, std::int64_t(count)); } inline bool FileStream::getline(std::string &buffer, char delim) { buffer.clear(); if (eof()) { - handle_.setstate(std::ios::badbit); + _ifs.setstate(std::ios::badbit); } try { - return !!std::getline(handle_, buffer, delim); + return !!std::getline(_ifs, buffer, delim); } catch (const std::exception &) { - if (handle_.eof() && !handle_.bad()) { - return !!handle_; + if (_ifs.eof() && !_ifs.bad()) { + return !!_ifs; } throw; } } +inline void FileStream::write(std::string_view buffer) { + return write(buffer.data(), buffer.size()); +} + +inline void FileStream::write(const char *buffer, std::size_t count) { + _ofs.write(buffer, std::int64_t(count)); + _file_size = std::max(static_cast(_ofs.tellp()), _file_size); +} + template ::value>::type *> inline void FileStream::read(T &buffer) { - 
static_assert(sizeof(char) == 1, ""); + static_assert(sizeof(char) == 1); return read(reinterpret_cast(&buffer), sizeof(T)); } @@ -97,6 +133,12 @@ inline T FileStream::read() { return buffer; } +template ::value>::type *> +inline void FileStream::write(T buffer) { + static_assert(sizeof(char) == 1); + return write(reinterpret_cast(&buffer), sizeof(T)); +} + template ::value>::type *> inline Tout FileStream::read_as_signed() { @@ -117,10 +159,16 @@ inline double FileStream::read_as_double() { template ::value>::type *> inline void FileStream::read(std::vector &buffer) { - static_assert(sizeof(char) == 1, ""); + static_assert(sizeof(char) == 1); return read(reinterpret_cast(&(*buffer.begin())), buffer.size() * sizeof(T)); } +template ::value>::type *> +inline void FileStream::write(const std::vector &buffer) { + static_assert(sizeof(char) == 1); + return write(reinterpret_cast(buffer.data()), buffer.size() * sizeof(T)); +} + template ::value>::type *> inline std::vector FileStream::read(std::size_t size) { assert(size != 0); @@ -135,24 +183,53 @@ inline std::string FileStream::getline(char delim) { return buffer; } -inline std::streampos FileStream::new_pos(std::streamoff offset, std::ios::seekdir way) { +inline std::streampos FileStream::new_posg(std::streamoff offset, std::ios::seekdir way) { switch (way) { case std::ios::beg: return static_cast(offset); case std::ios::cur: return std::int64_t(tellg()) + offset; case std::ios::end: - return std::int64_t(file_size_) + offset; + return std::int64_t(_file_size) + offset; + default: + HICTK_UNREACHABLE_CODE; + } +} + +inline std::streampos FileStream::new_posp(std::streamoff offset, std::ios::seekdir way) { + switch (way) { + case std::ios::beg: + return static_cast(offset); + case std::ios::cur: + return std::int64_t(tellp()) + offset; + case std::ios::end: + return std::int64_t(_file_size) + offset; default: HICTK_UNREACHABLE_CODE; } } -inline std::ifstream FileStream::open_file(const std::string &path, std::ifstream::openmode mode) { - std::ifstream ifs; - ifs.exceptions(ifs.exceptions() | std::ios::failbit | std::ios::badbit); - ifs.open(path, mode); - return ifs; +inline void FileStream::update_file_size() { + const auto offset = _ifs.tellg(); + _ifs.seekg(0, std::ios::end); + _file_size = std::max(static_cast(_ifs.tellg()), _file_size); + _ifs.seekg(offset, std::ios::beg); +} + +inline std::ifstream FileStream::open_file_read(const std::string &path, + std::ifstream::openmode mode) { + std::ifstream fs; + fs.exceptions(fs.exceptions() | std::ios::failbit | std::ios::badbit); + fs.open(path, mode); + return fs; +} + +inline std::ofstream FileStream::open_file_write(const std::string &path, + std::ofstream::openmode mode) { + std::ofstream fs; + fs.exceptions(fs.exceptions() | std::ios::failbit | std::ios::badbit); + fs.open(path, mode); + return fs; } } // namespace hictk::hic::internal::filestream diff --git a/src/libhictk/hic/include/hictk/hic/impl/footer_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/footer_impl.hpp index 47bd98ad..2335a659 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/footer_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/footer_impl.hpp @@ -21,7 +21,7 @@ namespace hictk::hic::internal { -constexpr HiCFooterMetadata::operator bool() const noexcept { return fileOffset >= 0; } +constexpr HiCFooterMetadata::operator bool() const noexcept { return matrixMetadataOffset >= 0; } inline bool HiCFooterMetadata::operator==(const HiCFooterMetadata &other) const noexcept { return url == other.url && matrix_type 
== other.matrix_type && @@ -53,7 +53,7 @@ inline bool HiCFooter::operator!=(const HiCFooter &other) const noexcept { constexpr const HiCFooterMetadata &HiCFooter::metadata() const noexcept { return _metadata; } constexpr HiCFooterMetadata &HiCFooter::metadata() noexcept { return _metadata; } inline const Index &HiCFooter::index() const noexcept { return _index; } -constexpr const std::string &HiCFooter::url() const noexcept { return metadata().url; } +constexpr const std::string &HiCFooter::path() const noexcept { return metadata().url; } constexpr MatrixType HiCFooter::matrix_type() const noexcept { return metadata().matrix_type; } inline balancing::Method HiCFooter::normalization() const noexcept { return metadata().normalization; @@ -62,7 +62,9 @@ constexpr MatrixUnit HiCFooter::unit() const noexcept { return metadata().unit; constexpr std::uint32_t HiCFooter::resolution() const noexcept { return metadata().resolution; } constexpr const Chromosome &HiCFooter::chrom1() const noexcept { return metadata().chrom1; } constexpr const Chromosome &HiCFooter::chrom2() const noexcept { return metadata().chrom2; } -constexpr std::int64_t HiCFooter::fileOffset() const noexcept { return metadata().fileOffset; } +constexpr std::int64_t HiCFooter::fileOffset() const noexcept { + return metadata().matrixMetadataOffset; +} constexpr const std::vector &HiCFooter::expectedValues() const noexcept { return _expectedValues; diff --git a/src/libhictk/hic/include/hictk/hic/impl/header_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/header_impl.hpp index 21423c37..15d30bfc 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/header_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/header_impl.hpp @@ -4,28 +4,163 @@ #pragma once +#include +#include + +#include #include +#include #include +#include +#include #include "hictk/hash.hpp" +#include "hictk/hic/filestream.hpp" namespace hictk::hic::internal { -constexpr HiCHeader::operator bool() const noexcept { return masterIndexOffset >= 0; } +constexpr HiCHeader::operator bool() const noexcept { return footerPosition >= 0; } inline bool HiCHeader::operator==(const HiCHeader &other) const noexcept { - return url == other.url && masterIndexOffset == other.masterIndexOffset; + return url == other.url && footerPosition == other.footerPosition; } inline bool HiCHeader::operator!=(const HiCHeader &other) const noexcept { return !(*this == other); } +inline std::string HiCHeader::serialize(BinaryBuffer &buffer, bool clear) const { + if (version != 9) { + throw std::runtime_error("serializing header for file version other than v9 is not supported."); + } + if (chromosomes.empty()) { + throw std::runtime_error("serializing a header without chromosomes is not supported."); + } + + if (clear) { + buffer.clear(); + } + + buffer.write("HIC\0", 4); + buffer.write(version); + buffer.write(footerPosition); + buffer.write(genomeID.c_str(), genomeID.size() + 1); + buffer.write(normVectorIndexPosition); + buffer.write(normVectorIndexLength); + + // Write attributes + const auto nAttributes = static_cast(attributes.size()); + buffer.write(nAttributes); + for (const auto &[k, v] : attributes) { + buffer.write(k.c_str(), k.size() + 1); + buffer.write(v.c_str(), v.size() + 1); + } + + // Write chromosomes + auto numChromosomes = static_cast(chromosomes.size()); + buffer.write(numChromosomes); + + for (const Chromosome &c : chromosomes) { + const auto name = std::string{c.name()}; + buffer.write(name.c_str(), name.size() + 1); + buffer.write(c.size()); + } + + // write resolutions 
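+  // The resolution list is stored as a count followed by one value per resolution;
+  // deserialize() below reads it back in the same order.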
+ buffer.write(static_cast(resolutions.size())); + const std::vector resolutions_(resolutions.begin(), resolutions.end()); + buffer.write(resolutions_); + + // write fragments: TODO + const std::int32_t nFragResolutions = 0; + buffer.write(nFragResolutions); + + return buffer.get(); +} + +inline HiCHeader HiCHeader::deserialize(filestream::FileStream &fs) { + fs.seekg(0, std::ios::beg); + const auto magic_string_found = fs.getline('\0') == "HIC"; + if (!magic_string_found) { + throw std::runtime_error( + fmt::format(FMT_STRING("Hi-C magic string is missing. {} does not appear to be a hic file"), + fs.path())); + } + + HiCHeader header{fs.path()}; + + fs.read(header.version); + if (header.version < 6) { + throw std::runtime_error(fmt::format( + FMT_STRING(".hic version 5 and older are no longer supported. Found version {}"), + header.version)); + } + fs.read(header.footerPosition); + if (header.footerPosition < 0 || header.footerPosition >= static_cast(fs.size())) { + throw std::runtime_error( + fmt::format(FMT_STRING("file appears to be corrupted: expected footerPosition to be " + "between 0 and {}, found {}"), + fs.size(), header.footerPosition)); + } + + fs.getline(header.genomeID, '\0'); + if (header.genomeID.empty()) { + header.genomeID = "unknown"; + } + + if (header.version > 8) { + fs.read(header.normVectorIndexPosition); + fs.read(header.normVectorIndexLength); + } + + const auto nAttributes = fs.read(); + + // reading attribute-value dictionary + for (std::int32_t i = 0; i < nAttributes; i++) { + auto key = fs.getline('\0'); // key + auto value = fs.getline('\0'); // value + header.attributes.emplace(std::move(key), std::move(value)); + } + + // Read chromosomes + auto numChromosomes = static_cast(fs.read()); + std::vector chrom_names(numChromosomes); + std::vector chrom_sizes(numChromosomes); + for (std::size_t i = 0; i < chrom_names.size(); ++i) { + fs.getline(chrom_names[i], '\0'); + chrom_sizes[i] = static_cast( + header.version > 8 ? fs.read() + : static_cast(fs.read())); + } + + if (chrom_names.empty()) { + throw std::runtime_error("unable to read chromosomes"); + } + + header.chromosomes = Reference(chrom_names.begin(), chrom_names.end(), chrom_sizes.begin()); + + // Read resolutions + const auto numResolutions = static_cast(fs.read()); + if (numResolutions == 0) { + throw std::runtime_error("unable to read the list of available resolutions"); + } + + // sometimes .hic files have duplicate resolutions for some obscure reason... 
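+  // Collecting the values into an ordered set both drops the duplicates and leaves the
+  // resolutions sorted in ascending order before they are copied into header.resolutions.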
+ phmap::btree_set resolutions{}; + for (std::size_t i = 0; i < numResolutions; ++i) { + const auto res = fs.read_as_unsigned(); + resolutions.emplace(res); + } + std::copy(resolutions.begin(), resolutions.end(), std::back_inserter(header.resolutions)); + + return header; +} + } // namespace hictk::hic::internal template <> struct std::hash { inline std::size_t operator()(hictk::hic::internal::HiCHeader const &h) const noexcept { - return hictk::internal::hash_combine(0, h.url, h.masterIndexOffset); + return hictk::internal::hash_combine(0, h.url, h.footerPosition); } }; diff --git a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp index 164ad6e1..c8feb143 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp @@ -39,7 +39,7 @@ inline File::File(std::string url_, std::uint32_t resolution_, MatrixType type_, _bins(std::make_shared(_fs->header().chromosomes, resolution_)) { if (!has_resolution(resolution())) { throw std::runtime_error(fmt::format( - FMT_STRING("file {} does not have interactions for resolution {}"), url(), resolution())); + FMT_STRING("file {} does not have interactions for resolution {}"), path(), resolution())); } if (block_cache_capacity == 0) { @@ -49,7 +49,7 @@ inline File::File(std::string url_, std::uint32_t resolution_, MatrixType type_, inline File& File::open(std::string url_, std::uint32_t resolution_, MatrixType type_, MatrixUnit unit_, std::uint64_t block_cache_capacity) { - if (_fs->url() == url_ && resolution() == resolution_ && _type == type_ && _unit == unit_) { + if (_fs->path() == url_ && resolution() == resolution_ && _type == type_ && _unit == unit_) { _block_cache->set_capacity(block_cache_capacity, false); return *this; } @@ -65,7 +65,7 @@ inline File& File::open(std::string url_, std::uint32_t resolution_, MatrixType inline File& File::open(std::uint32_t resolution_, MatrixType type_, MatrixUnit unit_, std::uint64_t block_cache_capacity) { - return open(url(), resolution_, type_, unit_, block_cache_capacity); + return open(path(), resolution_, type_, unit_, block_cache_capacity); } inline bool File::has_resolution(std::uint32_t resolution) const { @@ -73,9 +73,9 @@ inline bool File::has_resolution(std::uint32_t resolution) const { return match != avail_resolutions().end(); } -inline const std::string& File::url() const noexcept { return _fs->url(); } +inline const std::string& File::path() const noexcept { return _fs->path(); } -inline const std::string& File::name() const noexcept { return url(); } +inline const std::string& File::name() const noexcept { return path(); } inline std::int32_t File::version() const noexcept { return _fs->version(); } @@ -115,7 +115,7 @@ inline std::uint32_t File::resolution() const noexcept { return _bins->bin_size( inline std::shared_ptr File::get_footer( const Chromosome& chrom1, const Chromosome& chrom2, MatrixType matrix_type, balancing::Method norm, MatrixUnit unit, std::uint32_t resolution) const { - const internal::HiCFooterMetadata metadata{url(), matrix_type, norm, unit, + const internal::HiCFooterMetadata metadata{path(), matrix_type, norm, unit, resolution, chrom1, chrom2}; auto it = _footers.find(metadata); if (it != _footers.end()) { @@ -212,11 +212,6 @@ inline PixelSelector File::fetch(const Chromosome& chrom1, std::uint32_t start1, "Query overlaps the lower-triangle of the matrix. 
This is currently not supported."); } - if (_type == MatrixType::expected && norm != balancing::Method::NONE()) { - throw std::logic_error(fmt::format( - FMT_STRING("matrix type {} is incompatible with normalization method {}"), _type, norm)); - } - const PixelCoordinates coord1 = {_bins->at(chrom1, start1), _bins->at(chrom1, end1 - 1)}; const PixelCoordinates coord2 = {_bins->at(chrom2, start2), _bins->at(chrom2, end2 - 1)}; @@ -241,6 +236,66 @@ inline PixelSelector File::fetch(std::uint64_t first_bin1, std::uint64_t last_bi std::move(norm)); } +inline balancing::Weights File::normalization(balancing::Method norm, + const Chromosome& chrom) const { + std::vector weights_{}; + const auto expected_length = (chrom.size() + bins().bin_size() - 1) / bins().bin_size(); + try { + auto weights = fetch(chrom.name(), norm).weights1(); + if (!!weights && weights().size() != expected_length) { + throw std::runtime_error( + fmt::format(FMT_STRING("{} normalization vector for {} appears to be corrupted: " + "expected {} values, found {}"), + norm, chrom.name(), expected_length, weights().size())); + } + weights_ = weights(); + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + + const auto missing_interactions = + msg.find("unable to read file offset") != std::string_view::npos; + + const auto missing_norm_vect = + msg.find(fmt::format(FMT_STRING("unable to find {} normalization vector"), norm)) != + std::string_view::npos; + + if (!missing_interactions && !missing_norm_vect) { + throw; + } + } + + if (weights_.empty()) { + weights_.resize(expected_length, std::numeric_limits::quiet_NaN()); + } + + return {weights_, balancing::Weights::Type::DIVISIVE}; +} + +inline balancing::Weights File::normalization(std::string_view norm, + const Chromosome& chrom) const { + return normalization(balancing::Method{norm}, chrom); +} + +inline balancing::Weights File::normalization(balancing::Method norm) const { + std::vector weights{}; + weights.reserve(bins().size()); + for (const auto& chrom : chromosomes()) { + if (chrom.is_all()) { + continue; + } + + const auto chrom_weights = normalization(norm, chrom); + weights.insert(weights.end(), chrom_weights().begin(), chrom_weights().end()); + } + + assert(weights.size() == bins().size()); + return {weights, balancing::Weights::Type::DIVISIVE}; +} + +inline balancing::Weights File::normalization(std::string_view norm) const { + return normalization(balancing::Method{norm}); +} + inline std::size_t File::num_cached_footers() const noexcept { return _footers.size(); } inline void File::purge_footer_cache() { _footers.clear(); } diff --git a/src/libhictk/hic/include/hictk/hic/impl/interaction_block_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/interaction_block_impl.hpp new file mode 100644 index 00000000..26b837a3 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/interaction_block_impl.hpp @@ -0,0 +1,61 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +constexpr bool operator<(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return a < b._id; +} +constexpr bool operator==(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return a == b._id; +} +constexpr bool operator!=(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return !(a == b); +} + +constexpr bool operator<(const InteractionBlock &a, std::size_t b_id) noexcept { + return a._id < b_id; 
+} +constexpr bool operator==(const InteractionBlock &a, std::size_t b_id) noexcept { + return a._id == b_id; +} +constexpr bool operator!=(const InteractionBlock &a, std::size_t b_id) noexcept { + return !(a == b_id); +} + +constexpr bool operator<(std::size_t a_id, const InteractionBlock &b) noexcept { + return a_id < b._id; +} +constexpr bool operator==(std::size_t a_id, const InteractionBlock &b) noexcept { + return a_id == b._id; +} +constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept { + return !(a_id == b); +} + +inline InteractionBlock::InteractionBlock(std::size_t id_, + [[maybe_unused]] std::size_t block_bin_count, + std::vector> pixels) + : _id(id_), _interactions(std::move(pixels)) {} + +inline auto InteractionBlock::operator()() const noexcept -> const BuffT & { return _interactions; } + +inline auto InteractionBlock::begin() const noexcept -> const_iterator { + return _interactions.begin(); +} +inline auto InteractionBlock::end() const noexcept -> const_iterator { return _interactions.end(); } +inline auto InteractionBlock::cbegin() const noexcept -> const_iterator { return begin(); } +inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } + +inline std::size_t InteractionBlock::id() const noexcept { return _id; } + +inline std::size_t InteractionBlock::size() const noexcept { return _interactions.size(); } +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/interaction_to_block_mapper_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/interaction_to_block_mapper_impl.hpp new file mode 100644 index 00000000..203879fe --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/interaction_to_block_mapper_impl.hpp @@ -0,0 +1,573 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/bin_table.hpp" +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +template +inline void MatrixInteractionBlockFlat::emplace_back(Pixel &&p) { + emplace_back(p.to_thin()); +} + +template +inline void MatrixInteractionBlockFlat::emplace_back(ThinPixel &&p) { + bin1_ids.push_back(p.bin1_id); + bin2_ids.push_back(p.bin2_id); + counts.push_back(p.count); +} + +template +inline std::size_t MatrixInteractionBlockFlat::size() const noexcept { + return bin1_ids.size(); +} + +template +inline std::string MatrixInteractionBlockFlat::serialize(BinaryBuffer &buffer, + ZSTD_CCtx_s &compressor, + std::string &compression_buffer, + int compression_lvl, bool clear) const { + if (size() == 0) { + return ""; + } + + if (clear) { + buffer.clear(); + } + + buffer.write(bin1_ids); + buffer.write(bin2_ids); + buffer.write(counts); + + const auto buff_size = ZSTD_compressBound(buffer.get().size() * sizeof(char)); + compression_buffer.resize(buff_size); + + std::size_t compressed_size = ZSTD_compressCCtx( + &compressor, reinterpret_cast(compression_buffer.data()), + compression_buffer.size() * sizeof(char), reinterpret_cast(buffer.get().data()), + buffer.get().size() * sizeof(char), compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + compression_buffer.resize(compressed_size); + + buffer.clear(); + buffer.write(size()); + 
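+  // Resulting record layout in the temporary file: the pixel count, followed by the
+  // ZSTD-compressed concatenation of the bin1_ids, bin2_ids and counts written above.
+  // deserialize() reads the count first and then decompresses the remainder of the record.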
buffer.write(compression_buffer, false); + + return buffer.get(); +} + +template +[[nodiscard]] std::vector> MatrixInteractionBlockFlat::deserialize( + BinaryBuffer &buffer, ZSTD_DCtx_s &decompressor, std::string &decompression_buffer) { + const auto size_ = buffer.read(); + std::vector> pixels(size_); + + const auto decompressed_size = + size_ * (sizeof(std::uint64_t) + sizeof(std::uint64_t) + sizeof(N)); + decompression_buffer.resize(decompressed_size); + + const auto compressed_buffer = std::string_view{buffer.get()}.substr(sizeof(size_)); + + const auto status = ZSTD_decompressDCtx( + &decompressor, decompression_buffer.data(), decompression_buffer.size() * sizeof(char), + compressed_buffer.data(), compressed_buffer.size() * sizeof(char)); + + if (ZSTD_isError(status)) { + throw std::runtime_error(ZSTD_getErrorName(status)); + } + buffer.clear(); + buffer.write(decompression_buffer); + + for (auto &p : pixels) { + p.bin1_id = buffer.read(); + } + for (auto &p : pixels) { + p.bin2_id = buffer.read(); + } + for (auto &p : pixels) { + p.count = buffer.read(); + } + + return pixels; +} + +inline bool HiCInteractionToBlockMapper::BlockID::operator<(const BlockID &other) const noexcept { + if (chrom1_id != other.chrom1_id) { + return chrom1_id < other.chrom1_id; + } + if (chrom2_id != other.chrom2_id) { + return chrom2_id < other.chrom2_id; + } + return bid < other.bid; +} + +inline bool HiCInteractionToBlockMapper::BlockID::operator==(const BlockID &other) const noexcept { + return chrom1_id == other.chrom1_id && chrom2_id == other.chrom2_id && bid == other.bid; +} + +inline HiCInteractionToBlockMapper::HiCInteractionToBlockMapper( + std::filesystem::path path, std::shared_ptr bins, std::size_t chunk_size, + int compression_lvl) + : _path(std::move(path)), + _bin_table(std::move(bins)), + _chunk_size(chunk_size), + _compression_lvl(compression_lvl), + _zstd_cctx(ZSTD_createCCtx()), + _zstd_dctx(ZSTD_createDCtx()) { + assert(_chunk_size != 0); + init_block_mappers(); +} + +inline HiCInteractionToBlockMapper::~HiCInteractionToBlockMapper() noexcept { + try { + _fs = filestream::FileStream(); + std::filesystem::remove(_path); + } catch (...) 
{ + } +} + +inline const Reference &HiCInteractionToBlockMapper::chromosomes() const noexcept { + return _bin_table->chromosomes(); +} + +inline std::size_t HiCInteractionToBlockMapper::size() const noexcept { return _processed_pixels; } +inline bool HiCInteractionToBlockMapper::empty() const noexcept { return size() == 0; } +inline bool HiCInteractionToBlockMapper::empty(const Chromosome &chrom1, + const Chromosome &chrom2) const noexcept { + auto it = _chromosome_index.find(std::make_pair(chrom1, chrom2)); + return it == _chromosome_index.end(); +} + +template +inline void HiCInteractionToBlockMapper::append_pixels(PixelIt first_pixel, PixelIt last_pixel, + std::uint32_t update_frequency) { + using PixelT = remove_cvref_t; + static_assert(std::is_same_v> || std::is_same_v>); + + SPDLOG_DEBUG(FMT_STRING("mapping pixels to interaction blocks at resolution {}..."), + _bin_table->bin_size()); + + auto t0 = std::chrono::steady_clock::now(); + for (std::size_t i = 0; first_pixel != last_pixel; ++i) { + if (_pending_pixels >= _chunk_size) { + write_blocks(); + } + + add_pixel(*first_pixel); + std::ignore = ++first_pixel; + + if (i == update_frequency) { + const auto t1 = std::chrono::steady_clock::now(); + const auto delta = + static_cast( + std::chrono::duration_cast(t1 - t0).count()) / + 1000.0; + SPDLOG_INFO(FMT_STRING("ingesting pixels at {:.0f} pixels/s..."), + double(update_frequency) / delta); + t0 = t1; + i = 0; + } + } +} + +template +inline void HiCInteractionToBlockMapper::append_pixels(PixelIt first_pixel, PixelIt last_pixel, + BS::thread_pool &tpool, + std::uint32_t update_frequency) { + using PixelT = remove_cvref_t; + static_assert(std::is_same_v> || std::is_same_v>); + constexpr bool is_thin_pixel = std::is_same_v>; + + if (tpool.get_thread_count() < 2) { + return append_pixels(first_pixel, last_pixel); + } + + SPDLOG_DEBUG(FMT_STRING("mapping pixels to interaction blocks using 2 threads...")); + + std::atomic early_return = false; + moodycamel::BlockingReaderWriterQueue> queue(10'000); + + auto writer = tpool.submit([&]() { + try { + auto t0 = std::chrono::steady_clock::now(); + for (std::size_t i = 0; first_pixel != last_pixel && !early_return; ++i) { + const auto pixel = [&]() { + if constexpr (is_thin_pixel) { + return Pixel(*_bin_table, *first_pixel); + } else { + return *first_pixel; + } + }(); + std::ignore = ++first_pixel; + + while (!queue.try_enqueue(pixel)) { + if (early_return) { + return; + } + } + if (i == update_frequency) { + const auto t1 = std::chrono::steady_clock::now(); + const auto delta = + static_cast( + std::chrono::duration_cast(t1 - t0).count()) / + 1000.0; + SPDLOG_INFO(FMT_STRING("ingesting pixels at {:.0f} pixels/s..."), + double(update_frequency) / delta); + t0 = t1; + i = 0; + } + } + queue.enqueue(Pixel{}); + + } catch (...) { + early_return = true; + throw; + } + }); + + auto reader = tpool.submit([&]() { + try { + Pixel p{}; + while (!early_return) { + if (_pending_pixels >= _chunk_size) { + write_blocks(); + } + + queue.wait_dequeue(p); + if (p.count == 0) { + return; + } + + add_pixel(p); + ++_processed_pixels; + ++_pending_pixels; + } + } catch (...) 
{ + early_return = true; + throw; + } + }); + + writer.get(); + reader.get(); +} + +inline auto HiCInteractionToBlockMapper::block_index() const noexcept -> const BlockIndexMap & { + return _block_index; +} + +inline auto HiCInteractionToBlockMapper::chromosome_index() const noexcept + -> const ChromosomeIndexMap & { + return _chromosome_index; +} + +inline auto HiCInteractionToBlockMapper::merge_blocks(const BlockID &bid) + -> MatrixInteractionBlock { + std::mutex dummy_mtx{}; + return merge_blocks(bid, _bbuffer, *_zstd_dctx, _compression_buffer, dummy_mtx); +} + +inline auto HiCInteractionToBlockMapper::merge_blocks(const BlockID &bid, BinaryBuffer &bbuffer, + ZSTD_DCtx_s &zstd_dctx, + std::string &compression_buffer, + std::mutex &mtx) + -> MatrixInteractionBlock { + MatrixInteractionBlock blk{}; + for (auto &&pixel : fetch_pixels(bid, bbuffer, zstd_dctx, compression_buffer, mtx)) { + blk.emplace_back(std::move(pixel)); + } + blk.finalize(); + return blk; +} + +inline float HiCInteractionToBlockMapper::pixel_sum(const Chromosome &chrom1, + const Chromosome &chrom2) const { + auto match = _pixel_sums.find(std::make_pair(chrom1, chrom2)); + if (match == _pixel_sums.end()) { + return 0; + } + return match->second; +} + +inline float HiCInteractionToBlockMapper::pixel_sum() const { + return std::accumulate( + _pixel_sums.begin(), _pixel_sums.end(), 0.0F, + [](const float accumulator, const auto &kv) { return accumulator + kv.second; }); +} + +inline void HiCInteractionToBlockMapper::finalize() { + if (_processed_pixels > _pending_pixels) { + write_blocks(); + } +} + +inline void HiCInteractionToBlockMapper::clear() { + _blocks.clear(); + _block_index.clear(); + _chromosome_index.clear(); + _pixel_sums.clear(); + _processed_pixels = 0; + _pending_pixels = 0; + _bbuffer.reset().shrink_to_fit(); + _compression_buffer.clear(); + _compression_buffer.shrink_to_fit(); + std::error_code ec{}; + std::filesystem::remove(_path, ec); +} + +inline void HiCInteractionToBlockMapper::init_block_mappers() { + for (std::uint32_t chrom1_id = 0; chrom1_id < chromosomes().size(); ++chrom1_id) { + const auto &chrom1 = chromosomes().at(chrom1_id); + for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < chromosomes().size(); ++chrom2_id) { + const auto &chrom2 = chromosomes().at(chrom2_id); + + const auto num_bins = compute_num_bins(chrom1, chrom2, _bin_table->bin_size()); + const auto num_columns = compute_block_column_count( + chrom1, chrom2, _bin_table->bin_size(), + chrom1 == chrom2 ? 
DEFAULT_INTRA_CUTOFF : DEFAULT_INTER_CUTOFF); + const auto num_rows = num_bins / num_columns + 1; + + if (chrom1 == chrom2) { + _mappers_intra.emplace(chrom1, BlockMapperIntra{num_rows, num_columns}); + } else { + _mappers_inter.emplace(std::make_pair(chrom1, chrom2), + BlockMapperInter{num_rows, num_columns}); + } + } + } +} + +inline std::pair HiCInteractionToBlockMapper::write_block( + const MatrixInteractionBlockFlat &blk) { + const auto offset = _fs.tellp(); + _fs.write(blk.serialize(_bbuffer, *_zstd_cctx, _compression_buffer, _compression_lvl)); + const auto size = _fs.tellp() - offset; + return std::make_pair(offset, static_cast(size)); +} + +template +inline auto HiCInteractionToBlockMapper::map(const ThinPixel &p) const -> BlockID { + return map(Pixel(*_bin_table, p)); +} + +template +inline auto HiCInteractionToBlockMapper::map(const Pixel &p) const -> BlockID { + const auto &bin1 = p.coords.bin1; + const auto &bin2 = p.coords.bin2; + + const auto &chrom1 = bin1.chrom(); + const auto &chrom2 = bin2.chrom(); + + const auto bin1_id = bin1.rel_id(); + const auto bin2_id = bin2.rel_id(); + + const auto block_id = p.coords.is_intra() + ? _mappers_intra.at(chrom1)(bin1_id, bin2_id) + : _mappers_inter.at(std::make_pair(chrom1, chrom2))(bin1_id, bin2_id); + + return {chrom1.id(), chrom2.id(), block_id}; +} + +template +inline void HiCInteractionToBlockMapper::add_pixel(const ThinPixel &p) { + add_pixel(Pixel(*_bin_table, p)); +} + +template +inline void HiCInteractionToBlockMapper::add_pixel(const Pixel &p) { + auto bid = map(p); + + const auto &chrom1 = p.coords.bin1.chrom(); + const auto &chrom2 = p.coords.bin2.chrom(); + const auto chrom_pair = std::make_pair(chrom1, chrom2); + + auto match1 = _blocks.find(bid); + if (match1 != _blocks.end()) { + _pixel_sums.at(chrom_pair) += p.count; + match1->second.emplace_back(p.to_thin()); + } else { + _pixel_sums.emplace(chrom_pair, p.count); + auto [it, _] = _blocks.emplace(std::move(bid), MatrixInteractionBlockFlat{}); + it->second.emplace_back(p.to_thin()); + } + + auto match2 = _chromosome_index.find(chrom_pair); + if (match2 != _chromosome_index.end()) { + match2->second.emplace(bid); + } else { + _chromosome_index.emplace(chrom_pair, phmap::btree_set{bid}); + } + ++_processed_pixels; + ++_pending_pixels; +} + +inline std::vector> HiCInteractionToBlockMapper::fetch_pixels(const BlockID &bid) { + std::mutex dummy_mtx{}; + return fetch_pixels(bid, _bbuffer, *_zstd_dctx, _compression_buffer, dummy_mtx); +} + +inline std::vector> HiCInteractionToBlockMapper::fetch_pixels( + const BlockID &bid, BinaryBuffer &bbuffer, ZSTD_DCtx_s &zstd_dctx, + std::string &compression_buffer, std::mutex &mtx) { + std::vector> pixels{}; + auto match = _blocks.find(bid); + if (match != _blocks.end()) { + const auto &flat_pixels = match->second; + pixels.reserve(flat_pixels.size()); + for (std::size_t i = 0; i < flat_pixels.size(); ++i) { + pixels.emplace_back( + Pixel(*_bin_table, ThinPixel{flat_pixels.bin1_ids[i], flat_pixels.bin2_ids[i], + flat_pixels.counts[i]})); + } + return pixels; + } + + for (const auto &[pos, size] : _block_index.at(bid)) { + { + std::scoped_lock lck(mtx); + _fs.seekg(static_cast(pos)); + _fs.read(bbuffer.reset(), size); + } + const auto flat_pixels = + MatrixInteractionBlockFlat::deserialize(bbuffer, zstd_dctx, compression_buffer); + pixels.reserve(pixels.size() + flat_pixels.size()); + for (const auto &p : flat_pixels) { + pixels.emplace_back(Pixel(*_bin_table, p)); + } + } + return pixels; +} + +inline void 
HiCInteractionToBlockMapper::write_blocks() { + if (!std::filesystem::exists(_path)) { + _fs = filestream::FileStream::create(_path.string()); + } + SPDLOG_DEBUG(FMT_STRING("writing {} pixels to file {}..."), _pending_pixels, _path); + for (auto &[bid, blk] : _blocks) { + const auto [offset, size] = write_block(blk); + auto match = _block_index.find(bid); + if (match != _block_index.end()) { + match->second.emplace_back(BlockIndex{offset, size}); + } else { + _block_index.emplace(bid, std::vector{{offset, size}}); + } + } + _fs.flush(); + _blocks.clear(); + _pending_pixels = 0; +} + +inline std::size_t HiCInteractionToBlockMapper::compute_block_column_count( + const Chromosome &chrom1, const Chromosome &chrom2, std::uint32_t bin_size, + std::uint32_t cutoff, std::size_t block_capacity) { + const auto num_bins = compute_num_bins(chrom1, chrom2, bin_size); + auto num_columns = num_bins / block_capacity + 1; + if (bin_size < cutoff) { + const auto genome_size = num_bins * bin_size; + num_columns = genome_size / (block_capacity * cutoff); + } + + const auto max_sqrt = + static_cast(std::sqrt(std::numeric_limits::max())); + return std::clamp(num_columns, std::size_t(1), max_sqrt - 1); +} + +inline std::size_t HiCInteractionToBlockMapper::compute_num_bins(const Chromosome &chrom1, + const Chromosome &chrom2, + std::size_t bin_size) { + const auto max_size = std::max(chrom1.size(), chrom2.size()); + return (max_size + bin_size - 1) / bin_size; +} + +inline HiCInteractionToBlockMapper::BlockMapperInter::BlockMapperInter( + std::uint64_t block_bin_count, std::uint64_t block_column_count) + : _block_bin_count(block_bin_count), _block_column_count(block_column_count) { + assert(_block_bin_count != 0); + assert(_block_column_count != 0); +} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperInter::block_column_count() const { + return _block_column_count; +} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperInter::block_bin_count() const { + return _block_bin_count; +} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperInter::operator()( + std::uint64_t bin1_id, std::uint64_t bin2_id) const { + const auto i = bin1_id / block_bin_count(); + const auto j = bin2_id / block_bin_count(); + + return (block_column_count() * j) + i; +} + +inline HiCInteractionToBlockMapper::BlockMapperIntra::BlockMapperIntra( + std::uint64_t block_bin_count, std::uint64_t block_column_count, std::int64_t base_depth) + : _inter_mapper(block_bin_count, block_column_count), _base(init_base(base_depth)) {} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperIntra::block_column_count() const { + return _inter_mapper.block_column_count(); +} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperIntra::block_bin_count() const { + return _inter_mapper.block_bin_count(); +} + +inline bool HiCInteractionToBlockMapper::BlockMapperIntra::use_inter_mapper() const noexcept { + return _base == 0; +} + +inline std::uint64_t HiCInteractionToBlockMapper::BlockMapperIntra::operator()( + std::uint64_t bin1_id, std::uint64_t bin2_id) const { + if (use_inter_mapper()) { + return _inter_mapper(bin1_id, bin2_id); + } + const auto delta = bin1_id > bin2_id ? 
bin1_id - bin2_id : bin2_id - bin1_id; + const auto n = + static_cast(delta) / std::sqrt(2.0) / static_cast(block_bin_count()); + + const auto depth = static_cast(std::log(1.0 + n) / _base); + const auto position_along_diagonal = (bin1_id + bin2_id) / 2 / block_bin_count(); + + return depth * block_column_count() + position_along_diagonal; +} + +inline double HiCInteractionToBlockMapper::BlockMapperIntra::init_base( + std::int64_t base_depth) noexcept { + if (base_depth > 1) { + return std::log(static_cast(base_depth)); + } + if (base_depth < 0) { + return static_cast(-base_depth); + } + return std::log(2.0); +} + +} // namespace hictk::hic::internal diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index b5f87728..7580828c 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -713,6 +713,8 @@ inline ThinPixel PixelSelector::iterator::transform_pixel(ThinPixel inline PixelSelectorAll::PixelSelectorAll(std::vector selectors_) noexcept : _selectors(std::move(selectors_)) {} +inline bool PixelSelectorAll::empty() const noexcept { return begin() == end(); } + template inline auto PixelSelectorAll::begin(bool sorted) const -> iterator { return cbegin(sorted); diff --git a/src/libhictk/hic/include/hictk/hic/impl/utils_merge_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/utils_merge_impl.hpp new file mode 100644 index 00000000..6afaf434 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/impl/utils_merge_impl.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include + +#include "hictk/hic/file_reader.hpp" +#include "hictk/hic/file_writer.hpp" + +namespace hictk::hic::utils { + +namespace internal { + +inline void validate_chromosomes(const std::vector& files) { + assert(files.size() > 1); + const auto chromosomes = files.front().chromosomes(); + + for (std::size_t i = 1; i < files.size(); ++i) { + if (chromosomes != files[i].chromosomes()) { + throw std::runtime_error( + fmt::format(FMT_STRING("files \"{}\" and \"{}\" use different reference genomes"), + files.front().path(), files[i].path())); + } + } +} +} // namespace internal + +template +inline void merge(Str first_file, Str last_file, std::string_view dest_file, + std::uint32_t resolution, const std::filesystem::path& tmp_dir, + bool overwrite_if_exists, std::size_t chunk_size, std::size_t n_threads, + std::uint32_t compression_lvl) { + static_assert(std::is_constructible_v); + assert(chunk_size != 0); + try { + std::vector files{}; + std::transform(first_file, last_file, std::back_inserter(files), + [&](const std::string& path) { return hic::File(path, resolution); }); + if (files.size() < 2) { + throw std::runtime_error("cannot merge less than 2 coolers"); + } + + internal::validate_chromosomes(files); + + std::vector selectors; + std::vector> heads; + std::vector> tails; + + for (auto& f : files) { + auto sel = f.fetch(); + auto first = sel.begin(); + auto last = sel.end(); + if (first != last) { + selectors.emplace_back(std::move(sel)); + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + } + + merge(heads, tails, files.front().bins(), dest_file, files.front().assembly(), tmp_dir, + overwrite_if_exists, chunk_size, n_threads, compression_lvl); + } catch (const std::exception& e) { + throw 
std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), + std::distance(first_file, last_file), e.what())); + } +} + +template +inline void merge(const std::vector& heads, const std::vector& tails, + const BinTable& bins, std::string_view dest_uri, std::string_view assembly, + const std::filesystem::path& tmp_dir, bool overwrite_if_exists, + std::size_t chunk_size, std::size_t n_threads, std::uint32_t compression_lvl) { + using N = remove_cvref_tcount)>; + + hictk::transformers::PixelMerger merger{heads, tails}; + std::vector> buffer(chunk_size); + buffer.clear(); + + if (overwrite_if_exists) { + std::filesystem::remove(dest_uri); + } + + hic::internal::HiCFileWriter w(dest_uri, bins.chromosomes(), {bins.bin_size()}, assembly, + n_threads, chunk_size, tmp_dir, compression_lvl); + + w.add_pixels(bins.bin_size(), merger.begin(), merger.end()); + w.serialize(); +} + +} // namespace hictk::hic::utils diff --git a/src/libhictk/hic/include/hictk/hic/impl/validation_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/validation_impl.hpp index 7dc37b41..4e911251 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/validation_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/validation_impl.hpp @@ -13,7 +13,7 @@ namespace hictk::hic::utils { inline bool is_hic_file(const std::filesystem::path& path) { - return internal::HiCFileReader::checkMagicString(path.string()); + return hictk::hic::internal::HiCFileReader::checkMagicString(path.string()); } } // namespace hictk::hic::utils diff --git a/src/libhictk/hic/include/hictk/hic/interaction_block.hpp b/src/libhictk/hic/include/hictk/hic/interaction_block.hpp new file mode 100644 index 00000000..c28638e7 --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/interaction_block.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +class InteractionBlock { + public: + using Row = std::vector>; + + private: + using BuffT = std::vector>; + std::size_t _id{}; + BuffT _interactions{}; + + public: + using iterator = BuffT::iterator; + using const_iterator = BuffT::const_iterator; + + InteractionBlock() = default; + InteractionBlock(std::size_t id_, std::size_t block_bin_count, + std::vector> pixels); + + friend constexpr bool operator<(const InteractionBlock& a, const InteractionBlock& b) noexcept; + friend constexpr bool operator==(const InteractionBlock& a, const InteractionBlock& b) noexcept; + friend constexpr bool operator!=(const InteractionBlock& a, const InteractionBlock& b) noexcept; + + friend constexpr bool operator<(const InteractionBlock& a, std::size_t b_id) noexcept; + friend constexpr bool operator==(const InteractionBlock& a, std::size_t b_id) noexcept; + friend constexpr bool operator!=(const InteractionBlock& a, std::size_t b_id) noexcept; + + friend constexpr bool operator<(std::size_t a_id, const InteractionBlock& b) noexcept; + friend constexpr bool operator==(std::size_t a_id, const InteractionBlock& b) noexcept; + friend constexpr bool operator!=(std::size_t a_id, const InteractionBlock& b) noexcept; + + [[nodiscard]] auto operator()() const noexcept -> const BuffT&; + + [[nodiscard]] auto begin() const noexcept -> const_iterator; + [[nodiscard]] auto end() const noexcept -> const_iterator; + + [[nodiscard]] auto cbegin() const noexcept -> const_iterator; + [[nodiscard]] auto cend() const noexcept -> const_iterator; + + [[nodiscard]] std::size_t id() const 
noexcept; + + [[nodiscard]] std::size_t size() const noexcept; +}; + +} // namespace hictk::hic::internal + +#include "./impl/interaction_block_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/interaction_to_block_mapper.hpp b/src/libhictk/hic/include/hictk/hic/interaction_to_block_mapper.hpp new file mode 100644 index 00000000..b62016ba --- /dev/null +++ b/src/libhictk/hic/include/hictk/hic/interaction_to_block_mapper.hpp @@ -0,0 +1,224 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +// IWYU pragma: private, include "hictk/hic.hpp" + +#include +#include +#if __has_include() +#include +#else +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/default_delete.hpp" +#include "hictk/hic/binary_buffer.hpp" +#include "hictk/hic/file_writer_data_structures.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/pixel.hpp" +#include "hictk/reference.hpp" + +namespace hictk::hic::internal { + +template +struct MatrixInteractionBlockFlat { + std::vector bin1_ids{}; + std::vector bin2_ids{}; + std::vector counts{}; + + void emplace_back(ThinPixel&& p); + void emplace_back(Pixel&& p); + + [[nodiscard]] std::size_t size() const noexcept; + + [[nodiscard]] std::string serialize(BinaryBuffer& buffer, ZSTD_CCtx_s& compressor, + std::string& compression_buffer, int compression_lvl, + bool clear = true) const; + [[nodiscard]] static std::vector> deserialize(BinaryBuffer& buffer, + ZSTD_DCtx_s& decompressor, + std::string& decompression_buffer); +}; + +class HiCInteractionToBlockMapper { + public: + class BlockMapperIntra; + class BlockMapperInter; + + static constexpr std::uint32_t DEFAULT_INTRA_CUTOFF = 500; + static constexpr std::uint32_t DEFAULT_INTER_CUTOFF = 5'000; + static constexpr std::size_t DEFAULT_BLOCK_CAPACITY = 1'000; + + struct BlockID { + std::uint32_t chrom1_id; + std::uint32_t chrom2_id; + std::uint64_t bid; + + [[nodiscard]] bool operator<(const BlockID& other) const noexcept; + [[nodiscard]] bool operator==(const BlockID& other) const noexcept; + }; + + struct BlockIndex { + std::uint64_t offset; + std::uint32_t size; + }; + + private: + std::filesystem::path _path{}; + filestream::FileStream _fs{}; + std::shared_ptr _bin_table{}; + + using BlockIndexMap = phmap::btree_map>; + using ChromosomeIndexMap = + phmap::flat_hash_map, phmap::btree_set>; + BlockIndexMap _block_index{}; + ChromosomeIndexMap _chromosome_index{}; + + phmap::btree_map> _blocks{}; + phmap::flat_hash_map, float> _pixel_sums{}; + std::size_t _processed_pixels{}; + std::size_t _pending_pixels{}; + std::size_t _chunk_size{}; + + phmap::flat_hash_map _mappers_intra{}; + phmap::flat_hash_map, BlockMapperInter> _mappers_inter{}; + + BinaryBuffer _bbuffer{}; + int _compression_lvl{}; + std::unique_ptr _zstd_cctx{}; + std::unique_ptr _zstd_dctx{}; + std::string _compression_buffer{}; + + public: + HiCInteractionToBlockMapper() = default; + HiCInteractionToBlockMapper(std::filesystem::path path, std::shared_ptr bins, + std::size_t chunk_size, int compression_lvl); + + HiCInteractionToBlockMapper(const HiCInteractionToBlockMapper& other) = delete; +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ > 9 + HiCInteractionToBlockMapper(HiCInteractionToBlockMapper&& other) noexcept = default; +#else + HiCInteractionToBlockMapper(HiCInteractionToBlockMapper&& other) = default; +#endif + + ~HiCInteractionToBlockMapper() noexcept; + + 
HiCInteractionToBlockMapper& operator=(const HiCInteractionToBlockMapper& other) = delete; + +#if defined(__GNUC__) && defined(__clang__) && __clang_major__ > 8 + HiCInteractionToBlockMapper& operator=(HiCInteractionToBlockMapper&& other) noexcept = default; +#elif defined(__GNUC__) && __GNUC__ > 8 + HiCInteractionToBlockMapper& operator=(HiCInteractionToBlockMapper&& other) noexcept = default; +#else + HiCInteractionToBlockMapper& operator=(HiCInteractionToBlockMapper&& other) = default; +#endif + + const Reference& chromosomes() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] bool empty(const Chromosome& chrom1, const Chromosome& chrom2) const noexcept; + template >> + void append_pixels(PixelIt first_pixel, PixelIt last_pixel, + std::uint32_t update_frequency = 10'000'000); + template >> + void append_pixels(PixelIt first_pixel, PixelIt last_pixel, BS::thread_pool& tpool, + std::uint32_t update_frequency = 10'000'000); + + [[nodiscard]] auto block_index() const noexcept -> const BlockIndexMap&; + [[nodiscard]] auto chromosome_index() const noexcept -> const ChromosomeIndexMap&; + [[nodiscard]] auto merge_blocks(const BlockID& bid) -> MatrixInteractionBlock; + [[nodiscard]] auto merge_blocks(const BlockID& bid, BinaryBuffer& bbuffer, ZSTD_DCtx_s& zstd_dctx, + std::string& compression_buffer, std::mutex& mtx) + -> MatrixInteractionBlock; + [[nodiscard]] float pixel_sum(const Chromosome& chrom1, const Chromosome& chrom2) const; + [[nodiscard]] float pixel_sum() const; + + void finalize(); + void clear(); + + [[nodiscard]] static std::size_t compute_block_column_count( + const Chromosome& chrom1, const Chromosome& chrom2, std::uint32_t bin_size, + std::uint32_t cutoff, std::size_t block_capacity = DEFAULT_BLOCK_CAPACITY); + [[nodiscard]] static std::size_t compute_num_bins(const Chromosome& chrom1, + const Chromosome& chrom2, std::size_t bin_size); + + private: + void init_block_mappers(); + + template + [[nodiscard]] auto map(const ThinPixel& p) const -> BlockID; + template + [[nodiscard]] auto map(const Pixel& p) const -> BlockID; + + template + void add_pixel(const ThinPixel& p); + template + void add_pixel(const Pixel& p); + + [[nodiscard]] std::vector> fetch_pixels(const BlockID& bid); + [[nodiscard]] std::vector> fetch_pixels(const BlockID& bid, BinaryBuffer& bbuffer, + ZSTD_DCtx_s& zstd_dctx, + std::string& compression_buffer, + std::mutex& mtx); + + void write_blocks(); + std::pair write_block(const MatrixInteractionBlockFlat& blk); + + public: + class BlockMapperInter { + std::uint64_t _block_bin_count{}; + std::uint64_t _block_column_count{}; + + public: + BlockMapperInter(std::uint64_t block_bin_count, std::uint64_t block_column_count); + [[nodiscard]] std::uint64_t operator()(std::uint64_t bin1_id, std::uint64_t bin2_id) const; + + [[nodiscard]] std::uint64_t block_bin_count() const; + [[nodiscard]] std::uint64_t block_column_count() const; + }; + + class BlockMapperIntra { + BlockMapperInter _inter_mapper; + double _base{}; + + static constexpr std::int64_t DEFAULT_BASE_DEPTH = 2; + + public: + BlockMapperIntra(std::uint64_t block_bin_count, std::uint64_t block_column_count, + std::int64_t base_depth = DEFAULT_BASE_DEPTH); + [[nodiscard]] std::uint64_t operator()(std::uint64_t bin1_id, std::uint64_t bin2_id) const; + + [[nodiscard]] std::uint64_t block_bin_count() const; + [[nodiscard]] std::uint64_t block_column_count() const; + + private: + [[nodiscard]] bool use_inter_mapper() const noexcept; + 
[[nodiscard]] static double init_base(std::int64_t base_depth) noexcept; + }; +}; + +} // namespace hictk::hic::internal + +template <> +struct std::hash { + inline std::size_t operator()( + hictk::hic::internal::HiCInteractionToBlockMapper::BlockID const& bid) const noexcept { + return hictk::internal::hash_combine(0, bid.chrom1_id, bid.chrom2_id, bid.bid); + } +}; + +#include "./impl/interaction_to_block_mapper_impl.hpp" // NOLINT diff --git a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp index 8485b918..7eb992e5 100644 --- a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp @@ -194,6 +194,8 @@ class PixelSelectorAll { PixelSelectorAll() = default; explicit PixelSelectorAll(std::vector selectors_) noexcept; + [[nodiscard]] bool empty() const noexcept; + template [[nodiscard]] auto begin(bool sorted = true) const -> iterator; template diff --git a/src/libhictk/hic/include/hictk/hic/utils.hpp b/src/libhictk/hic/include/hictk/hic/utils.hpp index bba25718..8f45e0a9 100644 --- a/src/libhictk/hic/include/hictk/hic/utils.hpp +++ b/src/libhictk/hic/include/hictk/hic/utils.hpp @@ -9,8 +9,24 @@ #include namespace hictk::hic::utils { -[[nodiscard]] std::vector list_resolutions(const std::filesystem::path &path, + +/// Iterable of hictk::File or strings +template +void merge(Str first_file, Str last_file, std::string_view dest_file, std::uint32_t resolution, + const std::filesystem::path& tmp_dir = std::filesystem::temp_directory_path(), + bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, + std::size_t n_threads = 1, std::uint32_t compression_lvl = 12); + +template +void merge(const std::vector& heads, const std::vector& tails, + const BinTable& bins, std::string_view dest_file, std::string_view assembly = "unknown", + const std::filesystem::path& tmp_dir = std::filesystem::temp_directory_path(), + bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, + std::size_t n_threads = 1, std::uint32_t compression_lvl = 12); + +[[nodiscard]] std::vector list_resolutions(const std::filesystem::path& path, bool sorted = true); } // namespace hictk::hic::utils -#include "./impl/utils_impl.hpp" // NOLINT +#include "./impl/utils_impl.hpp" // NOLINT +#include "./impl/utils_merge_impl.hpp" // NOLINT diff --git a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp index 3d6bf30e..d1569ac0 100644 --- a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp +++ b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp @@ -181,6 +181,27 @@ inline const Chromosome& Reference::chromosome_with_longest_name() const { return _buff[_chrom_with_longest_name]; } +inline Reference Reference::remove_ALL() const { + std::vector chroms{}; + std::copy_if(begin(), end(), std::back_inserter(chroms), + [](const Chromosome& chrom) { return !chrom.is_all(); }); + + return {chroms.begin(), chroms.end()}; +} + +inline Reference Reference::add_ALL(std::uint32_t scaling_factor) const { + std::uint32_t all_size = 0; + for (const auto& chrom : *this) { + all_size += chrom.size() / scaling_factor; + } + + std::vector chroms{Chromosome{0, "All", all_size}}; + std::copy_if(begin(), end(), std::back_inserter(chroms), + [](const Chromosome& chrom) { return !chrom.is_all(); }); + + return {chroms.begin(), chroms.end()}; +} + inline void Reference::validate_chrom_id(std::uint32_t chrom_id) const { if 
(static_cast(chrom_id) >= size()) { throw std::out_of_range(fmt::format(FMT_STRING("chromosome with id {} not found"), chrom_id)); @@ -237,24 +258,40 @@ inline std::size_t Reference::find_longest_chromosome(const ChromBuff& chroms) n return Chromosome{}.id(); } - const auto match = std::max_element(chroms.begin(), chroms.end(), - [](const Chromosome& chrom1, const Chromosome& chrom2) { - return chrom1.size() < chrom2.size(); - }); + std::uint32_t max_length = 0; + std::size_t i = Chromosome{}.id(); + for (std::size_t j = 0; j < chroms.size(); ++j) { + const auto& chrom = chroms[j]; + if (chrom.is_all()) { + continue; + } + if (chrom.size() > max_length) { + max_length = chrom.size(); + i = j; + } + } - return static_cast(std::distance(chroms.begin(), match)); + return i; } inline std::size_t Reference::find_chromosome_with_longest_name(const ChromBuff& chroms) noexcept { if (chroms.empty()) { return Chromosome{}.id(); } - const auto match = std::max_element(chroms.begin(), chroms.end(), - [](const Chromosome& chrom1, const Chromosome& chrom2) { - return chrom1.name().size() < chrom2.name().size(); - }); + std::size_t max_length = 0; + std::size_t i = Chromosome{}.id(); + for (std::size_t j = 0; j < chroms.size(); ++j) { + const auto& chrom = chroms[j]; + if (chrom.is_all()) { + continue; + } + if (chrom.name().size() > max_length) { + max_length = chrom.name().size(); + i = j; + } + } - return static_cast(std::distance(chroms.begin(), match)); + return i; } inline std::vector Reference::compute_size_prefix_sum( diff --git a/src/libhictk/reference/include/hictk/reference.hpp b/src/libhictk/reference/include/hictk/reference.hpp index 7a4a62df..d157fcd1 100644 --- a/src/libhictk/reference/include/hictk/reference.hpp +++ b/src/libhictk/reference/include/hictk/reference.hpp @@ -92,6 +92,10 @@ class Reference { [[nodiscard]] const Chromosome& longest_chromosome() const; [[nodiscard]] const Chromosome& chromosome_with_longest_name() const; + // Add/remove ALL chromosome + [[nodiscard]] Reference remove_ALL() const; + [[nodiscard]] Reference add_ALL(std::uint32_t scaling_factor = 1) const; + private: void validate_chrom_id(std::uint32_t chrom_id) const; diff --git a/test/scripts/check_test_files_exist.sh b/test/scripts/check_test_files_exist.sh new file mode 100755 index 00000000..e051e2c6 --- /dev/null +++ b/test/scripts/check_test_files_exist.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini + +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if [ $# -lt 1 ]; then + 2>&1 echo "Usage: $0 files..." + exit 1 +fi + + +status=0 +for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi +done + +exit "$status" diff --git a/test/scripts/compare_cooler_files.sh b/test/scripts/compare_cooler_files.sh new file mode 100755 index 00000000..5d388b99 --- /dev/null +++ b/test/scripts/compare_cooler_files.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini + +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if [ $# -lt 3 ]; then + 2>&1 echo "Usage: $0 path_to_hictk file1 file2 [resolution]" + exit 1 +fi + +hictk="$1" +resolution="${4-0}" +f1="$2" +f2="$3" + + +function dump_table { + set -o pipefail + set -eu + + hictk="$1" + f="$2" + table="$3" + resolution="$4" + + args=() + + if [ "$table" = pixels ]; then + args+=(--join) + fi + + if [ !
"$resolution" = 0 ]; then + args+=(--resolution "$resolution") + fi + + "$hictk" dump "${args[@]}" --table "$table" "$f" +} + + +function compare_chromosomes { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + + diff <(dump_table "$hictk" "$f1" chroms 0) \ + <(dump_table "$hictk" "$f2" chroms 0); +} + +function compare_bins { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + resolution="$4" + + diff <(dump_table "$hictk" "$f1" bins "$resolution") \ + <(dump_table "$hictk" "$f2" bins "$resolution"); +} + +function compare_pixels { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + resolution="$4" + + diff <(dump_table "$hictk" "$f1" pixels "$resolution") \ + <(dump_table "$hictk" "$f2" pixels "$resolution"); +} + + +status=0 +2>&1 echo "Comparing $f1 with $f2..." +if ! compare_chromosomes "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if ! compare_bins "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if ! compare_pixels "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if [ "$status" -eq 0 ]; then + 2>&1 echo "Files are identical" + exit 0 +else + 2>&1 echo "Files differ" + exit 1 +fi diff --git a/test/scripts/compare_matrix_files.sh b/test/scripts/compare_matrix_files.sh new file mode 100755 index 00000000..58d9a0c9 --- /dev/null +++ b/test/scripts/compare_matrix_files.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if [ $# -lt 3 ]; then + 2>&1 echo "Usage: $0 path_to_hictk file1 file2 [resolution]" + exit 1 +fi + +hictk="$1" +resolution="${4-0}" +f1="$2" +f2="$3" + + +function dump_table { + set -o pipefail + set -eu + + hictk="$1" + f="$2" + table="$3" + resolution="$4" + + args=() + + if [ "$table" = pixels ]; then + args+=(--join) + fi + + if [ ! "$resolution" = 0 ]; then + args+=(--resolution "$resolution") + fi + + "$hictk" dump "${args[@]}" --table "$table" "$f" +} + + +function compare_chromosomes { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + + diff <(dump_table "$hictk" "$f1" chroms 0) \ + <(dump_table "$hictk" "$f2" chroms 0); +} + +function compare_bins { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + resolution="$4" + + diff <(dump_table "$hictk" "$f1" bins "$resolution") \ + <(dump_table "$hictk" "$f2" bins "$resolution"); +} + +function compare_pixels { + set -o pipefail + set -eu + + hictk="$1" + f1="$2" + f2="$3" + resolution="$4" + + diff <(dump_table "$hictk" "$f1" pixels "$resolution") \ + <(dump_table "$hictk" "$f2" pixels "$resolution"); +} + + +status=0 +2>&1 echo "Comparing $f1 with $f2..." +if ! compare_chromosomes "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if ! compare_bins "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if ! compare_pixels "$hictk" "$f1" "$f2" "$resolution"; then + status=1 +fi + +if [ "$status" -eq 0 ]; then + 2>&1 echo "Files are identical" + exit 0 +else + 2>&1 echo "Files differ" + exit 1 +fi diff --git a/test/scripts/compare_plain_files.sh b/test/scripts/compare_plain_files.sh new file mode 100755 index 00000000..746b7dba --- /dev/null +++ b/test/scripts/compare_plain_files.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if [ $# -ne 2 ]; then + 2>&1 echo "Usage: $0 file1 file2" + exit 1 +fi + +set -o pipefail +set -e + +2>&1 echo "Comparing $1 with $2..." 
+if diff "$1" "$2"; then + 2>&1 echo "Files are identical" + exit 0 +else + 2>&1 echo "Files differ" + exit 1 +fi diff --git a/test/scripts/hictk_balance.sh b/test/scripts/hictk_balance.sh index cf253b90..0ec83169 100755 --- a/test/scripts/hictk_balance.sh +++ b/test/scripts/hictk_balance.sh @@ -17,24 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function nproc_py { - set -eu - python3 -c 'import multiprocessing as mp; print(mp.cpu_count())' -} - -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - function dump_interactions { set -o pipefail set -eu @@ -43,16 +25,10 @@ function dump_interactions { resolution="$3" f="$2" - if [[ "$f" == *.hic ]]; then - weight=WEIGHT - else - weight=weight - fi - - "$hictk" dump "$f" \ - --balance="$weight" \ - --resolution \ - "$resolution" | + "$hictk" dump "$f" \ + --balance=weight \ + --resolution \ + "$resolution" | cut -f 3 } @@ -99,27 +75,24 @@ function compare_matrices { fi } -export function readlink_py - status=0 -if [ $# -ne 2 ]; then - 2>&1 echo "Usage: $0 path_to_hictk juicer_tools.jar" +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" status=1 fi hictk_bin="$1" -juicer_tools_jar="$2" data_dir="$(readlink_py "$(dirname "$0")/../data/")" script_dir="$(readlink_py "$(dirname "$0")")" ref_cool="$data_dir/cooler/ENCFF993FGR.2500000.cool" -ref_hic="$data_dir/hic/ENCFF993FGR.hic" +ref_hic="$data_dir/hic/ENCFF993FGR.2500000.hic" export PATH="$PATH:$script_dir" -if ! check_files_exist "$ref_cool" "$ref_hic" "$juicer_tools_jar"; then +if ! check_test_files_exist.sh "$ref_cool" "$ref_hic"; then exit 1 fi @@ -128,17 +101,33 @@ trap 'rm -rf -- "$outdir"' EXIT cp "$ref_cool" "$ref_hic" "$outdir" -"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --chunk-size=100 --mode=cis --force +"$hictk_bin" balance "$outdir/"*.cool \ + -t $(nproc.sh) \ + --chunk-size=100 \ + --mode=cis \ + --tmpdir="$outdir" \ + --force if ! compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then status=1 fi -"$hictk_bin" balance "$outdir/"*.hic -t $(nproc_py) --chunk-size=100 --mode=cis --force --juicer-tools-jar "$juicer_tools_jar" +"$hictk_bin" balance "$outdir/"*.hic \ + -t $(nproc.sh) \ + --chunk-size=100 \ + --mode=cis \ + --tmpdir="$outdir" \ + --name=weight \ + --force if ! compare_matrices "$hictk_bin" "$outdir/"*.hic "$ref_cool" 2500000; then status=1 fi -"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --in-memory --mode=cis --force +"$hictk_bin" balance "$outdir/"*.cool \ + -t $(nproc.sh) \ + --in-memory \ + --mode=cis \ + --tmpdir="$outdir" \ + --force if ! compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then status=1 fi diff --git a/test/scripts/hictk_convert_cool2hic.sh b/test/scripts/hictk_convert_cool2hic.sh index 7f5574a7..e10e2a55 100755 --- a/test/scripts/hictk_convert_cool2hic.sh +++ b/test/scripts/hictk_convert_cool2hic.sh @@ -17,52 +17,14 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -eu - - hictk="$1" - resolution="$4" - hic="$2" - clr="$3::/resolutions/$resolution" - - 2>&1 echo "Comparing $hic with $clr..." 
- if diff <("$hictk" dump --join "$hic" \ - --resolution \ - "$resolution") \ - <("$hictk" dump --join "$clr"); then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - -export function readlink_py - status=0 -if [ $# -ne 2 ]; then - 2>&1 echo "Usage: $0 path_to_hictk juicer_tools.jar" +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" status=1 fi hictk_bin="$1" -juicer_tools_jar="$2" data_dir="$(readlink_py "$(dirname "$0")/../data/")" script_dir="$(readlink_py "$(dirname "$0")")" @@ -71,7 +33,7 @@ ref_cool="$data_dir/integration_tests/4DNFIZ1ZVXC8.mcool" export PATH="$PATH:$script_dir" -if ! check_files_exist "$ref_cool" "$juicer_tools_jar"; then +if ! check_test_files_exist.sh "$ref_cool"; then exit 1 fi @@ -84,14 +46,22 @@ resolutions=(100000 2500000) "$ref_cool" \ "$outdir/out.hic" \ --resolutions ${resolutions[*]} \ - --juicer-tools-jar "$juicer_tools_jar" + --threads "$(nproc.sh)" \ + --chunk-size 100000 for resolution in "${resolutions[@]}"; do - if ! compare_coolers "$hictk_bin" "$outdir/out.hic" "$ref_cool" "$resolution"; then + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.hic" "$ref_cool" "$resolution"; then status=1 fi done +"$hictk_bin" dump -t normalizations "$ref_cool" | sed 's/weight/ICE/' | sort > "$outdir/normalizations.mcool" +"$hictk_bin" dump -t normalizations "$outdir/out.hic" | sort > "$outdir/normalizations.hic" + +if ! compare_plain_files.sh "$outdir/normalizations.mcool" "$outdir/normalizations.hic"; then + status=1 +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/hictk_convert_hic2cool.sh b/test/scripts/hictk_convert_hic2cool.sh index 6d3dcd43..74f91966 100755 --- a/test/scripts/hictk_convert_hic2cool.sh +++ b/test/scripts/hictk_convert_hic2cool.sh @@ -17,48 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -eu - - hictk="$1" - resolution="$4" - clr="$2::/resolutions/$resolution" - hic="$3" - - 2>&1 echo "Comparing $clr with $hic..." - if diff <("$hictk" dump -t chroms "$clr") \ - <("$hictk" dump -t chroms "$hic" \ - --resolution \ - "$resolution") \ - && diff <("$hictk" dump --join "$clr") \ - <("$hictk" dump --join "$hic" \ - --resolution \ - "$resolution"); \ - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - -export function readlink_py - status=0 if [ $# -ne 1 ]; then @@ -75,7 +33,7 @@ hic="$data_dir/hic/4DNFIZ1ZVXC8.hic9" export PATH="$PATH:$script_dir" -if ! check_files_exist "$hic"; then +if ! check_test_files_exist.sh "$hic"; then exit 1 fi @@ -90,11 +48,18 @@ resolutions=(50000 2500000) --resolutions ${resolutions[*]} for resolution in "${resolutions[@]}"; do - if ! compare_coolers "$hictk_bin" "$outdir/out.mcool" "$hic" "$resolution"; then + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.mcool" "$hic" "$resolution"; then status=1 fi done +"$hictk_bin" dump -t normalizations "$hic" > "$outdir/normalizations.hic" +"$hictk_bin" dump -t normalizations "$outdir/out.mcool" > "$outdir/normalizations.mcool" + +if ! 
compare_plain_files.sh "$outdir/normalizations.hic" "$outdir/normalizations.mcool"; then + status=1 +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/hictk_dump_balanced.sh b/test/scripts/hictk_dump_balanced.sh index de4d55b0..24ac738e 100755 --- a/test/scripts/hictk_dump_balanced.sh +++ b/test/scripts/hictk_dump_balanced.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - function truncate_counts { set -o pipefail set -e @@ -83,7 +56,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi @@ -93,7 +66,7 @@ trap 'rm -rf -- "$outdir"' EXIT cooler dump --balanced --na-rep nan --join "$ref_cooler::/resolutions/100000" -r chr2L | cut -f 1-6,8 | truncate_counts > "$outdir/expected.pixels" "$hictk_bin" dump --join --balance "weight" "$ref_cooler::/resolutions/100000" -r chr2L | truncate_counts > "$outdir/out.cooler.pixels" -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi diff --git a/test/scripts/hictk_dump_bins.sh b/test/scripts/hictk_dump_bins.sh index 6c4e4022..cb83fd7f 100755 --- a/test/scripts/hictk_dump_bins.sh +++ b/test/scripts/hictk_dump_bins.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -76,7 +49,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi @@ -87,11 +60,11 @@ cooler dump -t bins "$ref_cooler::/resolutions/100000" | cut -f 1-3 > "$outdir/e "$hictk_bin" dump -t bins "$ref_cooler::/resolutions/100000" > "$outdir/out.cooler.chrom.sizes" "$hictk_bin" dump -t bins --resolution 100000 "$ref_hic" > "$outdir/out.hic.chrom.sizes" -if ! compare_files "$outdir/expected.chrom.sizes" "$outdir/out.cooler.chrom.sizes"; then +if ! compare_plain_files.sh "$outdir/expected.chrom.sizes" "$outdir/out.cooler.chrom.sizes"; then status=1 fi -if ! compare_files "$outdir/expected.chrom.sizes" "$outdir/out.hic.chrom.sizes"; then +if ! 
compare_plain_files.sh "$outdir/expected.chrom.sizes" "$outdir/out.hic.chrom.sizes"; then status=1 fi diff --git a/test/scripts/hictk_dump_cells.sh b/test/scripts/hictk_dump_cells.sh index d3992362..78de46f0 100755 --- a/test/scripts/hictk_dump_cells.sh +++ b/test/scripts/hictk_dump_cells.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -72,7 +45,7 @@ expected_cells=( export PATH="$PATH:$script_dir" -if ! check_files_exist "$mclr" "$sclr" "$hic"; then +if ! check_test_files_exist.sh "$mclr" "$sclr" "$hic"; then exit 1 fi @@ -84,7 +57,7 @@ printf "%s\n" "${expected_cells[@]}" > "$outdir/expected.txt" "$hictk_bin" dump -t cells "$sclr" > "$outdir/scool.cells.txt" -if ! compare_files "$outdir/expected.txt" "$outdir/scool.cells.txt"; then +if ! compare_plain_files.sh "$outdir/expected.txt" "$outdir/scool.cells.txt"; then status=1 fi diff --git a/test/scripts/hictk_dump_chroms.sh b/test/scripts/hictk_dump_chroms.sh index cccd2f39..2c259d9f 100755 --- a/test/scripts/hictk_dump_chroms.sh +++ b/test/scripts/hictk_dump_chroms.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -76,7 +49,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi @@ -87,11 +60,11 @@ cooler dump -t chroms "$ref_cooler::/resolutions/100000" > "$outdir/expected.chr "$hictk_bin" dump -t chroms "$ref_cooler::/resolutions/100000" > "$outdir/out.cooler.chrom.sizes" "$hictk_bin" dump -t chroms "$ref_hic" > "$outdir/out.hic.chrom.sizes" -if ! compare_files "$outdir/expected.chrom.sizes" "$outdir/out.cooler.chrom.sizes"; then +if ! compare_plain_files.sh "$outdir/expected.chrom.sizes" "$outdir/out.cooler.chrom.sizes"; then status=1 fi -if ! compare_files "$outdir/expected.chrom.sizes" "$outdir/out.hic.chrom.sizes"; then +if ! compare_plain_files.sh "$outdir/expected.chrom.sizes" "$outdir/out.hic.chrom.sizes"; then status=1 fi diff --git a/test/scripts/hictk_dump_cis.sh b/test/scripts/hictk_dump_cis.sh index 5644c2a4..2759596e 100755 --- a/test/scripts/hictk_dump_cis.sh +++ b/test/scripts/hictk_dump_cis.sh @@ -18,33 +18,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." 
- if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -78,7 +51,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler" "$ref_hic8" "$ref_hic9"; then +if ! check_test_files_exist.sh "$ref_cooler" "$ref_hic8" "$ref_hic9"; then exit 1 fi @@ -91,15 +64,15 @@ cooler dump --join "$ref_cooler::/resolutions/100000" -r chr2L > "$outdir/expect "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" -r chr2L > "$outdir/out.hic8.pixels" "$hictk_bin" dump --join --resolution 100000 "$ref_hic9" -r chr2L > "$outdir/out.hic9.pixels" -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi @@ -109,15 +82,15 @@ cooler dump --join "$ref_cooler::/resolutions/100000" | awk -F '\t' '$1==$4' | t "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --cis-only | tee "$outdir/out.hic8.pixels" > /dev/null "$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --cis-only | tee "$outdir/out.hic9.pixels" > /dev/null -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi diff --git a/test/scripts/hictk_dump_gw.sh b/test/scripts/hictk_dump_gw.sh index 0ab3d5d7..dd451a96 100755 --- a/test/scripts/hictk_dump_gw.sh +++ b/test/scripts/hictk_dump_gw.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -77,7 +50,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler" "$ref_hic8" "$ref_hic9"; then +if ! check_test_files_exist.sh "$ref_cooler" "$ref_hic8" "$ref_hic9"; then exit 1 fi @@ -89,15 +62,15 @@ cooler dump --join "$ref_cooler::/resolutions/1000000" > "$outdir/expected.pixel "$hictk_bin" dump --join --resolution 1000000 "$ref_hic8" > "$outdir/out.hic8.pixels" "$hictk_bin" dump --join --resolution 1000000 "$ref_hic9" > "$outdir/out.hic9.pixels" -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! 
compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi diff --git a/test/scripts/hictk_dump_normalizations.sh b/test/scripts/hictk_dump_normalizations.sh index 09be2028..fc754238 100755 --- a/test/scripts/hictk_dump_normalizations.sh +++ b/test/scripts/hictk_dump_normalizations.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -79,7 +52,7 @@ expected_norms_cooler=( export PATH="$PATH:$script_dir" -if ! check_files_exist "$mclr" "$sclr" "$hic"; then +if ! check_test_files_exist.sh "$mclr" "$sclr" "$hic"; then exit 1 fi @@ -97,13 +70,13 @@ printf "%s\n" "${expected_norms_cooler[@]}" > "$outdir/expected.cool.txt" "$hictk_bin" dump -t normalizations "$hic" --resolution 100000 > "$outdir/hic.norms.txt" for f in "$outdir/"*cool*.norms.txt; do - if ! compare_files "$outdir/expected.cool.txt" "$f"; then + if ! compare_plain_files.sh "$outdir/expected.cool.txt" "$f"; then status=1 fi done for f in "$outdir/"*hic*.norms.txt; do - if ! compare_files "$outdir/expected.hic.txt" "$f"; then + if ! compare_plain_files.sh "$outdir/expected.hic.txt" "$f"; then status=1 fi done diff --git a/test/scripts/hictk_dump_resolutions.sh b/test/scripts/hictk_dump_resolutions.sh index 559702b0..b8de19fb 100755 --- a/test/scripts/hictk_dump_resolutions.sh +++ b/test/scripts/hictk_dump_resolutions.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -77,7 +50,7 @@ expected_res=( export PATH="$PATH:$script_dir" -if ! check_files_exist "$mclr" "$sclr" "$hic"; then +if ! check_test_files_exist.sh "$mclr" "$sclr" "$hic"; then exit 1 fi @@ -94,13 +67,13 @@ printf "%d\n" "100000" > "$outdir/expected.100000.txt" for f in "$outdir/"*.res.txt; do - if ! compare_files "$outdir/expected.txt" "$f"; then + if ! compare_plain_files.sh "$outdir/expected.txt" "$f"; then status=1 fi done for f in "$outdir/"*.res.100000.txt; do - if ! compare_files "$outdir/expected.100000.txt" "$f"; then + if ! 
compare_plain_files.sh "$outdir/expected.100000.txt" "$f"; then status=1 fi done diff --git a/test/scripts/hictk_dump_trans.sh b/test/scripts/hictk_dump_trans.sh index 40706f12..f1ba61f4 100755 --- a/test/scripts/hictk_dump_trans.sh +++ b/test/scripts/hictk_dump_trans.sh @@ -17,33 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_files { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff "$1" "$2"; then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -77,7 +50,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler" "$ref_hic8" "$ref_hic9"; then +if ! check_test_files_exist.sh "$ref_cooler" "$ref_hic8" "$ref_hic9"; then exit 1 fi @@ -90,15 +63,15 @@ cooler dump --join "$ref_cooler::/resolutions/100000" --range chr2L --range2 chr "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --range chr2L --range2 chrX > "$outdir/out.hic8.pixels" "$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --range chr2L --range2 chrX > "$outdir/out.hic9.pixels" -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi @@ -108,15 +81,15 @@ cooler dump --join "$ref_cooler::/resolutions/100000" | awk -F '\t' '$1!=$4' | t "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --trans-only | tee "$outdir/out.hic8.pixels" > /dev/null "$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --trans-only | tee "$outdir/out.hic9.pixels" > /dev/null -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi @@ -125,15 +98,15 @@ fi "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --trans-only --unsorted | sort -V | tee "$outdir/out.hic8.pixels" > /dev/null "$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --trans-only --unsorted | sort -V | tee "$outdir/out.hic9.pixels" > /dev/null -if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then +if ! compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then status=1 fi -if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then +if ! 
compare_plain_files.sh "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi diff --git a/test/scripts/hictk_fix_mcool.sh b/test/scripts/hictk_fix_mcool.sh index a27ce0c9..83993258 100755 --- a/test/scripts/hictk_fix_mcool.sh +++ b/test/scripts/hictk_fix_mcool.sh @@ -17,19 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - status=0 if [ $# -ne 1 ]; then @@ -50,7 +37,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$invalid_mcool"; then +if ! check_test_files_exist.sh "$invalid_mcool"; then exit 1 fi diff --git a/test/scripts/hictk_load_4dn.sh b/test/scripts/hictk_load_4dn.sh index d3b337a5..b04cb167 100755 --- a/test/scripts/hictk_load_4dn.sh +++ b/test/scripts/hictk_load_4dn.sh @@ -17,39 +17,7 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff <(cooler dump -t chroms "$1") \ - <(cooler dump -t chroms "$2") \ - && \ - diff <(cooler dump --join "$1") \ - <(cooler dump --join "$2"); - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - -export function readlink_py shuffle +export function readlink_py status=0 @@ -69,15 +37,6 @@ ref_cooler_variable_bins="$data_dir/4DNFIKNWM36K.subset.variable-bins.cool" export PATH="$PATH:$script_dir" -if ! command -v cooler &> /dev/null; then - 2>&1 echo "Unable to find cooler in your PATH" - status=1 -fi - -# Try to detect the error outlined below as early as possible: -# https://github.com/open2c/cooler/pull/298 -cooler --help > /dev/null - if ! command -v xz &> /dev/null; then 2>&1 echo "Unable to find xz in your PATH" status=1 @@ -87,26 +46,29 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$pairs" "$ref_cooler_fixed_bins" "$ref_cooler_variable_bins"; then +if ! check_test_files_exist.sh "$pairs" "$ref_cooler_fixed_bins" "$ref_cooler_variable_bins"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT +resolution=10000 +batch_size=999999 + cooler dump -t chroms "$ref_cooler_fixed_bins" > "$outdir/chrom.sizes" # Test cooler with fixed bin size xzcat "$pairs" | "$hictk_bin" load \ -f 4dn \ - --assume-sorted \ - --batch-size 1000000 \ - --bin-size 10000 \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" -if ! compare_coolers "$outdir/out.cool" "$ref_cooler_fixed_bins"; then +if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.cool" "$ref_cooler_fixed_bins" "$resolution"; then status=1 fi @@ -116,14 +78,29 @@ cooler dump -t bins "$ref_cooler_variable_bins" > "$outdir/bins.bed" xzcat "$pairs" | "$hictk_bin" load \ -f 4dn \ - --assume-sorted \ - --batch-size 1000000 \ + --chunk-size "$batch_size" \ --bin-table "$outdir/bins.bed" \ --force \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" -if ! compare_coolers "$outdir/out.cool" "$ref_cooler_variable_bins"; then +if ! 
compare_matrix_files.sh "$hictk_bin" "$outdir/out.cool" "$ref_cooler_variable_bins"; then + status=1 +fi + + +# Test hic with fixed bin size +xzcat "$pairs" | + "$hictk_bin" load \ + -f 4dn \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ + "$outdir/chrom.sizes" \ + "$outdir/out.hic" + +if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.hic" "$ref_cooler_fixed_bins" "$resolution"; then status=1 fi diff --git a/test/scripts/hictk_load_bg2.sh b/test/scripts/hictk_load_bg2.sh index 5bb7ea75..32adb7cf 100755 --- a/test/scripts/hictk_load_bg2.sh +++ b/test/scripts/hictk_load_bg2.sh @@ -17,53 +17,13 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff <(cooler dump -t chroms "$1") \ - <(cooler dump -t chroms "$2") \ - && \ - diff <(cooler dump --join "$1") \ - <(cooler dump --join "$2"); - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - -function shuffle { - if command -v shuf &> /dev/null; then - shuf - else - sort -R - fi -} - -export function readlink_py shuffle +export function readlink_py status=0 if [ $# -ne 2 ]; then 2>&1 echo "Usage: $0 path_to_hictk [un]sorted" - status=1 + exit 1 fi hictk_bin="$1" @@ -77,56 +37,70 @@ data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" +resolution=10000 +batch_size=999999 export PATH="$PATH:$script_dir" -if ! command -v cooler &> /dev/null; then - 2>&1 echo "Unable to find cooler in your PATH" - status=1 -fi - -# Try to detect the error outlined below as early as possible: -# https://github.com/open2c/cooler/pull/298 -cooler --help > /dev/null - if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT -cooler dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" +"$hictk_bin" dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" if [[ "$sorted" == true ]]; then - cooler dump -t pixels --join "$ref_cooler" | + "$hictk_bin" dump -t pixels --join "$ref_cooler" | "$hictk_bin" load \ -f bg2 \ --assume-sorted \ - --batch-size 1000000 \ - --bin-size 10000 \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" else - cooler dump -t pixels --join "$ref_cooler" | - shuffle | + "$hictk_bin" dump -t pixels --join "$ref_cooler" | + shuffle.sh | "$hictk_bin" load \ -f bg2 \ --assume-unsorted \ - --batch-size 1000000 \ - --bin-size 10000 \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" fi -if ! compare_coolers "$outdir/out.cool" "$ref_cooler"; then +if ! 
compare_matrix_files.sh "$hictk_bin" "$outdir/out.cool" "$ref_cooler" "$resolution"; then status=1 fi +if [[ "$sorted" == false ]]; then + "$hictk_bin" dump -t pixels --join "$ref_cooler" | + shuffle.sh | + "$hictk_bin" load \ + -f bg2 \ + --assume-unsorted \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ + "$outdir/chrom.sizes" \ + "$outdir/out.hic" + + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.hic" "$ref_cooler" "$resolution"; then + status=1 + fi +fi + + + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/hictk_load_coo.sh b/test/scripts/hictk_load_coo.sh index 0d46f9c7..3025d326 100755 --- a/test/scripts/hictk_load_coo.sh +++ b/test/scripts/hictk_load_coo.sh @@ -17,47 +17,7 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff <(cooler dump -t chroms "$1") \ - <(cooler dump -t chroms "$2") \ - && \ - diff <(cooler dump --join "$1") \ - <(cooler dump --join "$2"); - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - -function shuffle { - if command -v shuf &> /dev/null; then - shuf - else - sort -R - fi -} - -export function readlink_py shuffle +export function readlink_py status=0 @@ -77,56 +37,69 @@ data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" +resolution=10000 +batch_size=999999 export PATH="$PATH:$script_dir" -if ! command -v cooler &> /dev/null; then - 2>&1 echo "Unable to find cooler in your PATH" - status=1 -fi - -# Try to detect the error outlined below as early as possible: -# https://github.com/open2c/cooler/pull/298 -cooler --help > /dev/null - if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT -cooler dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" +"$hictk_bin" dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" if [[ "$sorted" == true ]]; then - cooler dump -t pixels "$ref_cooler" | + "$hictk_bin" dump -t pixels "$ref_cooler" | "$hictk_bin" load \ -f coo \ --assume-sorted \ - --batch-size 1000000 \ - --bin-size 10000 \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" else - cooler dump -t pixels "$ref_cooler" | - shuffle | + "$hictk_bin" dump -t pixels "$ref_cooler" | + shuffle.sh | "$hictk_bin" load \ -f coo \ --assume-unsorted \ - --batch-size 1000000 \ - --bin-size 10000 \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ "$outdir/chrom.sizes" \ "$outdir/out.cool" fi -if ! compare_coolers "$outdir/out.cool" "$ref_cooler"; then +if ! 
compare_matrix_files.sh "$hictk_bin" "$outdir/out.cool" "$ref_cooler" "$resolution"; then status=1 fi + +if [[ "$sorted" == false ]]; then + "$hictk_bin" dump -t pixels "$ref_cooler" | + shuffle.sh | + "$hictk_bin" load \ + -f coo \ + --assume-unsorted \ + --chunk-size "$batch_size" \ + --bin-size "$resolution" \ + --tmpdir "$outdir" \ + "$outdir/chrom.sizes" \ + "$outdir/out.hic" + + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.hic" "$ref_cooler" "$resolution"; then + status=1 + fi +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/hictk_merge.sh b/test/scripts/hictk_merge.sh index d2265dc8..78845686 100755 --- a/test/scripts/hictk_merge.sh +++ b/test/scripts/hictk_merge.sh @@ -18,38 +18,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff <(cooler dump -t chroms "$1") \ - <(cooler dump -t chroms "$2") \ - && \ - diff <(cooler dump --join "$1") \ - <(cooler dump --join "$2"); - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -65,6 +33,8 @@ data_dir="$(readlink_py "$(dirname "$0")/../data/")" script_dir="$(readlink_py "$(dirname "$0")")" input_cooler="$data_dir/integration_tests/4DNFIZ1ZVXC8.mcool" +input_hic="$data_dir/hic/4DNFIZ1ZVXC8.hic9" +resolution=100000 export PATH="$PATH:$script_dir" @@ -81,17 +51,31 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$input_cooler"; then +if ! check_test_files_exist.sh "$input_cooler"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT -cooler merge "$outdir/expected.cool" "$input_cooler::/resolutions/10000" "$input_cooler::/resolutions/10000" -"$hictk_bin" merge -o "$outdir/out.cool" "$input_cooler::/resolutions/10000" "$input_cooler::/resolutions/10000" +cooler merge "$outdir/expected.cool" "$input_cooler::/resolutions/$resolution" "$input_cooler::/resolutions/$resolution" + +# Test merrging coolers +"$hictk_bin" merge "$input_cooler::/resolutions/$resolution" \ + "$input_cooler::/resolutions/$resolution" \ + -o "$outdir/out.cool" \ + --chunk-size=9999 +if ! compare_matrix_files.sh "$hictk_bin" "$outdir/expected.cool" "$outdir/out.cool" "$resolution"; then + status=1 +fi -if ! compare_coolers "$outdir/expected.cool" "$outdir/out.cool"; then +# Test merging .hic +"$hictk_bin" merge "$input_hic" \ + "$input_hic" \ + -o "$outdir/out.hic" \ + --resolution "$resolution" \ + --chunk-size=9999 +if ! compare_matrix_files.sh "$hictk_bin" "$outdir/expected.cool" "$outdir/out.hic" "$resolution"; then status=1 fi diff --git a/test/scripts/hictk_rename_chromosomes.sh b/test/scripts/hictk_rename_chromosomes.sh index a727f059..9b2270cf 100755 --- a/test/scripts/hictk_rename_chromosomes.sh +++ b/test/scripts/hictk_rename_chromosomes.sh @@ -17,19 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! 
-f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - export function readlink_py status=0 @@ -54,7 +41,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$input_cooler" "$input_mcool" "$input_scool"; then +if ! check_test_files_exist.sh "$input_cooler" "$input_mcool" "$input_scool"; then exit 1 fi diff --git a/test/scripts/hictk_validate.sh b/test/scripts/hictk_validate.sh index 36ad2073..7647ff02 100755 --- a/test/scripts/hictk_validate.sh +++ b/test/scripts/hictk_validate.sh @@ -17,19 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - status=0 if [ $# -ne 1 ]; then @@ -54,7 +41,7 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$valid_hic" "$valid_cooler" "$valid_mcool" "$valid_scool" "$invalid_mcool"; then +if ! check_test_files_exist.sh "$valid_hic" "$valid_cooler" "$valid_mcool" "$valid_scool" "$invalid_mcool"; then exit 1 fi diff --git a/test/scripts/hictk_zoomify.sh b/test/scripts/hictk_zoomify.sh index 6e64e686..a3bc7577 100755 --- a/test/scripts/hictk_zoomify.sh +++ b/test/scripts/hictk_zoomify.sh @@ -17,38 +17,6 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } -function check_files_exist { - set -eu - status=0 - for f in "$@"; do - if [ ! -f "$f" ]; then - 2>&1 echo "Unable to find test file \"$f\"" - status=1 - fi - done - - return "$status" -} - -function compare_coolers { - set -o pipefail - set -e - - 2>&1 echo "Comparing $1 with $2..." - if diff <(cooler dump -t chroms "$1") \ - <(cooler dump -t chroms "$2") \ - && \ - diff <(cooler dump --join "$1") \ - <(cooler dump --join "$2"); - then - 2>&1 echo "Files are identical" - return 0 - else - 2>&1 echo "Files differ" - return 1 - fi -} - export function readlink_py status=0 @@ -64,57 +32,54 @@ data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" ref_cooler="$data_dir/4DNFIZ1ZVXC8.mcool" +ref_hic="$data_dir/4DNFIZ1ZVXC8.500000.hic" resolutions=(50000 100000 250000 2500000) export PATH="$PATH:$script_dir" -if ! command -v cooler &> /dev/null; then - 2>&1 echo "Unable to find cooler in your PATH" - status=1 -fi - -# Try to detect the error outlined below as early as possible: -# https://github.com/open2c/cooler/pull/298 -cooler --help > /dev/null - -if [ $status -ne 0 ]; then - exit $status -fi - -if ! check_files_exist "$ref_cooler"; then +if ! check_test_files_exist.sh "$ref_cooler"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT - +# Test cooler (multiple resolutions) "$hictk_bin" zoomify \ + -t $(nproc.sh) \ "$ref_cooler::/resolutions/${resolutions[0]}" \ "$outdir/out.mcool" for res in "${resolutions[@]}"; do - if ! compare_coolers \ - "$outdir/out.mcool::/resolutions/$res" \ - "$ref_cooler::/resolutions/$res"; - then + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.mcool" "$ref_cooler" "$res"; then status=1 fi done +# Test cooler (single resolution) "$hictk_bin" zoomify \ "$ref_cooler::/resolutions/${resolutions[0]}" \ "$outdir/out.cool" \ + -t $(nproc.sh) \ --no-copy-base-resolution \ --resolutions "${resolutions[1]}" -if ! 
compare_coolers \ - "$outdir/out.cool" \ - "$ref_cooler::/resolutions/${resolutions[1]}"; -then +if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.cool" "$ref_cooler" "${resolutions[1]}"; then status=1 fi +# Test hic (multiple resolutions) +"$hictk_bin" zoomify \ + -t $(nproc.sh) \ + "$ref_hic" \ + "$outdir/out.hic" + +for res in "${resolutions[@]}"; do + if ! compare_matrix_files.sh "$hictk_bin" "$outdir/out.hic" "$ref_cooler" "$res"; then + status=1 + fi +done + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/nproc.sh b/test/scripts/nproc.sh new file mode 100755 index 00000000..e87948a1 --- /dev/null +++ b/test/scripts/nproc.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +python3 -c 'import multiprocessing as mp; print(mp.cpu_count())' diff --git a/test/scripts/readlink.sh b/test/scripts/readlink.sh new file mode 100755 index 00000000..4275239e --- /dev/null +++ b/test/scripts/readlink.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 file" + exit 1 +fi + +python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" diff --git a/test/scripts/shuffle.sh b/test/scripts/shuffle.sh new file mode 100755 index 00000000..75758b3d --- /dev/null +++ b/test/scripts/shuffle.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +if command -v shuf &> /dev/null; then + shuf +else + sort -R +fi diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 1f75d0e2..2c9ade19 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -172,7 +172,7 @@ TEST_CASE("Balancing: SparseMatrixChunked", "[balancing][short]") { TEST_CASE("Balancing: ICE (intra)", "[balancing][short]") { const std::array, 2> files{ std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), - std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + std::make_pair("hic", datadir / "hic/ENCFF993FGR.2500000.hic")}; const auto tmpfile = testdir() / "balancing_ice_intra.tmp"; const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.cis.txt"; @@ -208,7 +208,7 @@ TEST_CASE("Balancing: ICE (intra)", "[balancing][short]") { TEST_CASE("Balancing: ICE (inter)", "[balancing][medium]") { const std::array, 2> files{ std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), - std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + std::make_pair("hic", datadir / "hic/ENCFF993FGR.2500000.hic")}; const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.trans.txt"; @@ -244,7 +244,7 @@ TEST_CASE("Balancing: ICE (inter)", "[balancing][medium]") { TEST_CASE("Balancing: ICE (gw)", "[balancing][medium]") { const std::array, 2> files{ std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), - std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + std::make_pair("hic", datadir / "hic/ENCFF993FGR.2500000.hic")}; const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.gw.txt"; diff --git a/test/units/cooler/file_pixels_test.cpp b/test/units/cooler/file_pixels_test.cpp 
index 70f2ad31..f1c5d72a 100644 --- a/test/units/cooler/file_pixels_test.cpp +++ b/test/units/cooler/file_pixels_test.cpp @@ -104,7 +104,7 @@ TEST_CASE("Cooler: read/write pixels", "[cooler][long]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Cooler: validate pixels before append", "[cooler][long]") { +TEST_CASE("Cooler: validate pixels before read_append", "[cooler][long]") { auto path1 = datadir / "cooler_test_file.cool"; auto path2 = testdir() / "cooler_test_validate_before_append.cool"; diff --git a/test/units/hic/CMakeLists.txt b/test/units/hic/CMakeLists.txt index 26f2d4b3..3acc43aa 100644 --- a/test/units/hic/CMakeLists.txt +++ b/test/units/hic/CMakeLists.txt @@ -13,8 +13,11 @@ target_sources( hictk_hic_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/filestream_test.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/file_reader_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/file_writer_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/file_zoomify_test.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/hic_file_test.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_test.cpp") + "${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/utils_merge_test.cpp") target_link_libraries( hictk_hic_tests diff --git a/test/units/hic/file_reader_test.cpp b/test/units/hic/file_reader_test.cpp index c7ffbb27..8d585ca0 100644 --- a/test/units/hic/file_reader_test.cpp +++ b/test/units/hic/file_reader_test.cpp @@ -15,33 +15,32 @@ #include "hictk/balancing/methods.hpp" #include "hictk/hic/common.hpp" +#include "tmpdir.hpp" using namespace hictk::hic; -namespace hictk::test { -inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) -} // namespace hictk::test +namespace hictk::hic::test::file_reader { // NOLINTNEXTLINE(cert-err58-cpp) -const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); +const auto pathV8 = (datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINTNEXTLINE(cert-err58-cpp) -const auto pathV9 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); +const auto pathV9 = (datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiC: read header (v8)", "[hic][v8][short]") { - constexpr std::array resolutions{2500000, 1000000, 500000, 250000, 100000, - 50000, 25000, 10000, 5000, 1000}; + constexpr std::array resolutions{ + 1'000, 5'000, 10'000, 25'000, 50'000, 100'000, 250'000, 500'000, 1'000'000, 2'500'000}; constexpr auto* genomeID = "dm6"; constexpr auto nChromosomes = 9; const auto header = internal::HiCFileReader(pathV8).header(); CHECK(header.url == pathV8); - CHECK(header.masterIndexOffset == 131515430); + CHECK(header.footerPosition == 131515430); CHECK(header.genomeID == genomeID); CHECK(header.chromosomes.size() == nChromosomes); CHECK(header.version == 8); - CHECK(header.nviPosition == -1); - CHECK(header.nviLength == -1); + CHECK(header.normVectorIndexPosition == -1); + CHECK(header.normVectorIndexLength == -1); REQUIRE(header.resolutions.size() == resolutions.size()); for (std::size_t i = 0; i < resolutions.size(); ++i) { @@ -51,20 +50,20 @@ TEST_CASE("HiC: read header (v8)", "[hic][v8][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiC: read header (v9)", "[hic][v9][short]") { - constexpr std::array resolutions{2500000, 1000000, 500000, 250000, 100000, - 50000, 25000, 10000, 5000, 1000}; + constexpr std::array resolutions{ + 1'000, 5'000, 10'000, 25'000, 50'000, 100'000, 250'000, 500'000, 1'000'000, 2'500'000}; constexpr auto* 
genomeID = "dm6"; constexpr auto nChromosomes = 9; const auto header = internal::HiCFileReader(pathV9).header(); CHECK(header.url == pathV9); - CHECK(header.masterIndexOffset == 130706734); + CHECK(header.footerPosition == 130706734); CHECK(header.genomeID == genomeID); CHECK(header.chromosomes.size() == nChromosomes); CHECK(header.version == 9); - CHECK(header.nviPosition == 131417220); - CHECK(header.nviLength == 6600); + CHECK(header.normVectorIndexPosition == 131417220); + CHECK(header.normVectorIndexLength == 6600); REQUIRE(header.resolutions.size() == resolutions.size()); for (std::size_t i = 0; i < resolutions.size(); ++i) { @@ -305,3 +304,5 @@ TEST_CASE("HiC: read footer (v9)", "[hic][v9][short]") { } } } + +} // namespace hictk::hic::test::file_reader diff --git a/test/units/hic/file_writer_test.cpp b/test/units/hic/file_writer_test.cpp new file mode 100644 index 00000000..51b079af --- /dev/null +++ b/test/units/hic/file_writer_test.cpp @@ -0,0 +1,229 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictk/hic/file_writer.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/hic.hpp" +#include "hictk/reference.hpp" +#include "hictk/transformers/join_genomic_coords.hpp" +#include "tmpdir.hpp" + +using namespace hictk::hic; + +namespace hictk::hic::test::file_writer { + +using namespace hictk::hic::internal; + +TEST_CASE("HiC: HiCInteractionToBlockMapper::BlockMapper", "[hic][v9][short]") { + SECTION("intra") { + // Test case based on blocks fetched by running an instrumented version of + // hictk dump test/data/hic/4DNFIZ1ZVXC8.hic9 --resolution 10000 --range chr3R:0-50000 + { + const HiCInteractionToBlockMapper::BlockMapperIntra mapper(803, 4); + CHECK(mapper(0, 0) == 0); + CHECK(mapper(0, 100) == 0); + CHECK(mapper(802, 802) == 0); + CHECK(mapper(803, 803) == 1); + CHECK(mapper(1038, 2137) == 1); + CHECK(mapper(235, 1376) == 5); + CHECK(mapper(8, 3203) == 5); + } + } + + SECTION("inter") { + // Test case based on blocks fetched by running an instrumented version of + // hictk dump test/data/hic/4DNFIZ1ZVXC8.hic9 --resolution 10000 --range chr3L:0-50000 --range2 + // chr3R:0-10000000 + { + const HiCInteractionToBlockMapper::BlockMapperInter mapper(803, 4); + CHECK(mapper(0, 0) == 0); + CHECK(mapper(0, 100) == 0); + CHECK(mapper(802, 802) == 0); + CHECK(mapper(7, 803) == 4); + CHECK(mapper(795, 1605) == 4); + } + + { + const HiCInteractionToBlockMapper::BlockMapperInter mapper(101, 1); + CHECK(mapper(0, 0) == 0); + CHECK(mapper(0, 99) == 0); + CHECK(mapper(99, 99) == 0); + } + } +} + +TEST_CASE("HiC: HiCInteractionToBlockMapper", "[hic][v9][short]") { + const auto path1 = (datadir / "4DNFIZ1ZVXC8.hic9").string(); + const auto path2 = (testdir() / "hic_block_partitioner.bin").string(); + const std::uint32_t resolution = 25'000; + + const hic::File f1(path1, resolution); + const auto sel1 = f1.fetch("chr2L"); + const auto sel2 = f1.fetch("chr2L", "chr2R"); + + const std::vector> pixels1(sel1.begin(), sel1.end()); + const std::vector> pixels2(sel2.begin(), sel2.end()); + + HiCInteractionToBlockMapper partitioner(path2, f1.bins_ptr(), 50'000, 3); + + partitioner.append_pixels(pixels1.begin(), pixels1.end()); + partitioner.append_pixels(pixels2.begin(), pixels2.end()); + partitioner.finalize(); + + std::size_t num_interactions = 0; + for (const auto& [bid, _] : partitioner.block_index()) { + const auto blk = partitioner.merge_blocks(bid); + num_interactions += 
static_cast(blk.nRecords); + } + + CHECK(num_interactions == pixels1.size() + pixels2.size()); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static void hic_file_writer_compare_pixels(const std::vector>& expected, + const std::vector>& found) { + REQUIRE(expected.size() == found.size()); + + for (std::size_t i = 0; i < expected.size(); ++i) { + CHECK(expected[i].coords == found[i].coords); + if (std::isnan(expected[i].count)) { + CHECK(std::isnan(found[i].count)); + } else { + CHECK_THAT(expected[i].count, Catch::Matchers::WithinRel(found[i].count)); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static void hic_file_writer_create_file_test(const std::string& path1, const std::string& path2, + const std::vector& resolutions, + std::size_t num_threads) { + { + const auto chromosomes = hic::File(path1, resolutions.front()).chromosomes(); + std::filesystem::remove(path2); + HiCFileWriter w(path2, chromosomes, resolutions, "dm6", num_threads); + for (std::size_t i = 0; i < resolutions.size(); ++i) { + if (i % 2 == 0) { + const auto resolution = resolutions[i]; + const hic::File f((datadir / "4DNFIZ1ZVXC8.hic9").string(), resolution); + const auto sel1 = f.fetch("chr3R"); + const auto sel2 = f.fetch("chr3R", "chr4"); + w.add_pixels(resolution, sel1.begin(), sel1.end()); + w.add_pixels(resolution, sel2.begin(), sel2.end()); + } + } + w.serialize(); + } + + for (const auto& resolution : resolutions) { + fmt::print(FMT_STRING("Comparing {}...\n"), resolution); + const hic::File f1(path1, resolution); + const hic::File f2(path2, resolution); + + const auto correct_pixels1 = f1.fetch("chr3R").read_all(); + const auto correct_pixels2 = f1.fetch("chr3R", "chr4").read_all(); + const auto pixels1 = f2.fetch("chr3R").read_all(); + const auto pixels2 = f2.fetch("chr3R", "chr4").read_all(); + + hic_file_writer_compare_pixels(correct_pixels1, pixels1); + hic_file_writer_compare_pixels(correct_pixels2, pixels2); + + const hic::File f3(path1, resolution, MatrixType::expected); + const hic::File f4(path2, resolution, MatrixType::expected); + + const auto correct_expected_pixels1 = f3.fetch("chr3R").read_all(); + const auto correct_expected_pixels2 = f4.fetch("chr3R", "chr4").read_all(); + const auto expected_pixels1 = f3.fetch("chr3R").read_all(); + const auto expected_pixels2 = f4.fetch("chr3R", "chr4").read_all(); + + // NOLINTNEXTLINE(*-suspicious-call-argument) + hic_file_writer_compare_pixels(correct_expected_pixels1, expected_pixels1); + // NOLINTNEXTLINE(*-suspicious-call-argument) + hic_file_writer_compare_pixels(correct_expected_pixels2, expected_pixels2); + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("HiC: HiCFileWriter", "[hic][v9][long]") { + const auto path1 = (datadir / "4DNFIZ1ZVXC8.hic9").string(); + const auto path2 = (testdir() / "hic_writer_001.hic").string(); + const auto path3 = (testdir() / "hic_writer_002.hic").string(); + + SECTION("create file (st)") { + const std::vector resolutions{250'000, 500'000, 2'500'000}; + hic_file_writer_create_file_test(path1, path2, resolutions, 1); + } + SECTION("create file (mt)") { + const std::vector resolutions{25'000, 1'000'000, 2'500'000}; + hic_file_writer_create_file_test(path1, path2, resolutions, 3); + } + + SECTION("add weights") { + const std::uint32_t resolution = 500'000; + const hic::File hf1(path1, resolution); + + { + // init file + HiCFileWriter w(path3, hf1.chromosomes(), {hf1.resolution()}, "dm6"); + const auto sel = hf1.fetch(); + 
w.add_pixels(resolution, sel.begin(), sel.end()); + w.serialize(); + } + + // add normalization weights + { + HiCFileWriter w(path3); + for (const auto& chrom : w.chromosomes()) { + if (chrom.is_all()) { + continue; + } + w.add_norm_vector("SCALE", chrom, "BP", hf1.resolution(), + hf1.normalization("SCALE", chrom)); + } + + CHECK_THROWS_WITH( + w.add_norm_vector("SCALE", hf1.chromosomes().at("chr2L"), "BP", hf1.resolution(), + hf1.normalization("SCALE", hf1.chromosomes().at("chr2L"))), + Catch::Matchers::ContainsSubstring("file already contains")); + + CHECK_THROWS_WITH(w.add_norm_vector("VC", w.chromosomes().at("chr2L"), "BP", hf1.resolution(), + std::vector{1, 2, 3}), + Catch::Matchers::ContainsSubstring("weight shape mismatch")); + + w.write_norm_vectors_and_norm_expected_values(); + } + + // compare + const hic::File hf2(path3, resolution); + + const auto avail_norms = hf2.avail_normalizations(); + REQUIRE(avail_norms.size() == 1); + CHECK(avail_norms.front() == balancing::Method::SCALE()); + + const auto correct_pixels = hf1.fetch(balancing::Method::SCALE()).read_all(); + const auto pixels = hf2.fetch(balancing::Method::SCALE()).read_all(); + + hic_file_writer_compare_pixels(correct_pixels, pixels); + + const hic::File f3(path1, resolution, MatrixType::expected); + const hic::File f4(path3, resolution, MatrixType::expected); + + const auto correct_expected_pixels = f3.fetch(balancing::Method::SCALE()).read_all(); + const auto expected_pixels = f4.fetch(balancing::Method::SCALE()).read_all(); + + // NOLINTNEXTLINE(*-suspicious-call-argument) + hic_file_writer_compare_pixels(correct_expected_pixels, expected_pixels); + } +} + +} // namespace hictk::hic::test::file_writer diff --git a/test/units/hic/file_zoomify_test.cpp b/test/units/hic/file_zoomify_test.cpp new file mode 100644 index 00000000..dcc2acf4 --- /dev/null +++ b/test/units/hic/file_zoomify_test.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictk/hic/file_zoomify.hpp" + +#include +#include +#include +#include +#include +#include + +#include "hictk/hic.hpp" +#include "hictk/hic/utils.hpp" +#include "tmpdir.hpp" + +using namespace hictk::hic; + +namespace hictk::hic::test::file_writer { + +using namespace hictk::hic::internal; + +TEST_CASE("HiC: HiCFileZoomify") { + const std::vector resolutions{100'000, 400'000, 1'000'000}; + + const auto path1 = (datadir / "4DNFIZ1ZVXC8.hic9").string(); + const auto path2 = (testdir() / "hic_file_zoomify.hic").string(); + + const auto avail_resolutions = hic::utils::list_resolutions(path1); + REQUIRE(std::find(avail_resolutions.begin(), avail_resolutions.end(), 400'000) == + avail_resolutions.end()); + + hic::internal::HiCFileZoomify{path1, path2, resolutions}.zoomify(); + + for (const auto& resolution : {100'000U, 1'000'000U}) { + const hic::File f1(path1, resolution); + const hic::File f2(path2, resolution); + + const auto expected_pixels = f1.fetch().read_all(); + const auto pixels = f2.fetch().read_all(); + + REQUIRE(expected_pixels.size() == pixels.size()); + for (std::size_t i = 0; i < pixels.size(); ++i) { + CHECK(expected_pixels[i] == pixels[i]); + } + } +} + +} // namespace hictk::hic::test::file_writer diff --git a/test/units/hic/filestream_test.cpp b/test/units/hic/filestream_test.cpp index da0788c3..28561c70 100644 --- a/test/units/hic/filestream_test.cpp +++ b/test/units/hic/filestream_test.cpp @@ -15,15 +15,15 @@ #include "catch2/catch_test_macros.hpp" #include "hictk/suppress_warnings.hpp" +#include 
"hictk/type_traits.hpp" +#include "tmpdir.hpp" -using namespace hictk::hic::internal::filestream; +namespace hictk::hic::test::filestream { -namespace hictk::test { -inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) -} // namespace hictk::test +using namespace hictk::hic::internal::filestream; -const auto path_plaintext = (hictk::test::datadir / "data.txt").string(); // NOLINT(cert-err58-cpp) -const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) +const auto path_plaintext = (datadir / "data.txt").string(); // NOLINT(cert-err58-cpp) +const auto path_binary = (datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) const auto& path = path_plaintext; static std::string read_file(const std::string& path_) { @@ -53,69 +53,110 @@ static std::vector read_file_by_line(const std::string& path_, char } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream ctor", "[hic][short]") { +TEST_CASE("HiC: filestream ctor", "[hic][filestream][short]") { SECTION("default") { const FileStream s{}; - CHECK(s.url().empty()); + CHECK(s.path().empty()); CHECK(s.size() == 0); } - SECTION("valid path") { + SECTION("valid path (read)") { const FileStream s(path_plaintext); - CHECK(s.url() == path_plaintext); + CHECK(s.path() == path_plaintext); CHECK(s.size() == 502941); CHECK(!s.eof()); } + SECTION("valid path (write)") { + const auto path1 = testdir() / "filestream_ctor_write.bin"; + const auto s = FileStream::create(path1.string()); + CHECK(s.path() == path1); + CHECK(s.size() == 0); + CHECK(!s.eof()); + } + SECTION("invalid path") { CHECK_THROWS(FileStream("not-a-path")); } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream seek", "[hic][short]") { - FileStream s(path_plaintext); - { - std::string buff; - s.read(buff, 1); - s.seekg(0); - } - SECTION("seek within chunk") { - s.seekg(5); - CHECK(s.tellg() == 5); +TEST_CASE("HiC: filestream seek", "[hic][filestream][short]") { + SECTION("read") { + FileStream s(path_plaintext); + SECTION("seek within chunk") { + s.seekg(5); + CHECK(s.tellg() == 5); + + s.seekg(10); + CHECK(s.tellg() == 10); + } - s.seekg(10); - CHECK(s.tellg() == 10); - } + SECTION("negative seek from beg") { CHECK_THROWS(s.seekg(-10)); } - SECTION("negative seek from beg") { CHECK_THROWS(s.seekg(-10)); } + SECTION("seek from current") { + s.seekg(10); + CHECK(s.tellg() == 10); - SECTION("seek from current") { - s.seekg(10); - CHECK(s.tellg() == 10); + s.seekg(10, std::ios::cur); + CHECK(s.tellg() == 20); - s.seekg(10, std::ios::cur); - CHECK(s.tellg() == 20); + s.seekg(-10, std::ios::cur); + CHECK(s.tellg() == 10); + } - s.seekg(-10, std::ios::cur); - CHECK(s.tellg() == 10); - } + SECTION("seek at end") { + s.seekg(0, std::ios::end); + CHECK(!s.eof()); + } - SECTION("seek at end") { - s.seekg(0, std::ios::end); - CHECK(!s.eof()); + SECTION("seek past end") { + s.seekg(0, std::ios::end); + CHECK_THROWS(s.seekg(1, std::ios::cur)); + + s.seekg(0); + CHECK_THROWS(s.seekg(1, std::ios::end)); + CHECK(s.tellg() == 0); + } } + SECTION("write") { + const auto path1 = testdir() / "filestream_seek.bin"; + std::filesystem::remove(path1); + auto s = FileStream::create(path1.string()); - SECTION("seek past end") { - s.seekg(0, std::ios::end); - CHECK_THROWS(s.seekg(1, std::ios::cur)); + SECTION("seek within chunk") { + s.seekp(5); + CHECK(s.tellp() == 5); - s.seekg(0); - CHECK_THROWS(s.seekg(1, std::ios::end)); - CHECK(s.tellg() == 0); + s.seekp(10); + 
CHECK(s.tellp() == 10); + } + + SECTION("negative seek from beg") { CHECK_THROWS(s.seekp(-10)); } + + SECTION("seek from current") { + s.seekp(10); + CHECK(s.tellp() == 10); + + s.seekp(10, std::ios::cur); + CHECK(s.tellp() == 20); + + s.seekp(-10, std::ios::cur); + CHECK(s.tellp() == 10); + } + + SECTION("seek at end") { + s.seekg(0, std::ios::end); + CHECK(!s.eof()); + } + + SECTION("seek past end") { + s.seekp(0, std::ios::end); + CHECK_NOTHROW(s.seekp(1, std::ios::cur)); + } } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream read", "[hic][short]") { +TEST_CASE("HiC: filestream read", "[hic][filestream][short]") { FileStream s(path_plaintext); std::string buffer{"garbage"}; @@ -160,41 +201,41 @@ TEST_CASE("HiC: filestream read", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream append", "[hic][short]") { +TEST_CASE("HiC: filestream read_append", "[hic][filestream][short]") { FileStream s(path_plaintext); std::string buffer; const auto expected = read_file(path); SECTION("append to empty buffer") { - s.append(buffer, 10); + s.read_append(buffer, 10); CHECK(buffer == expected.substr(0, 10)); } SECTION("append to dirty buffer") { buffer = "garbage"; - s.append(buffer, 10); + s.read_append(buffer, 10); CHECK(buffer == "garbage" + expected.substr(0, 10)); } SECTION("large append") { - s.append(buffer, s.size()); + s.read_append(buffer, s.size()); CHECK(buffer == expected); } SECTION("no-op append") { - s.append(buffer, 0); + s.read_append(buffer, 0); CHECK(buffer.empty()); } SECTION("out-of-bound read") { s.seekg(-1, std::ios::end); - CHECK_THROWS(s.append(buffer, 10)); + CHECK_THROWS(s.read_append(buffer, 10)); } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream getline", "[hic][short]") { +TEST_CASE("HiC: filestream getline", "[hic][filestream][short]") { FileStream s(path_plaintext); std::string buffer; @@ -237,7 +278,7 @@ TEST_CASE("HiC: filestream getline", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiC: filestream read binary", "[hic][short]") { +TEST_CASE("HiC: filestream read binary", "[hic][filestream][short]") { FileStream s(path_binary); s.seekg(10); @@ -281,3 +322,91 @@ TEST_CASE("HiC: filestream read binary", "[hic][short]") { } } } + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("HiC: filestream write", "[hic][filestream][short]") { + const auto tmpfile = testdir() / "filestream_write.bin"; + std::filesystem::remove(tmpfile); + auto s = FileStream::create(tmpfile.string()); + + SECTION("small write") { + constexpr std::string_view buffer{"test"}; + s.write(buffer); + CHECK(s.size() == buffer.size()); + } + + SECTION("large write") { + const auto buffer = read_file(path); + s.write(buffer); + CHECK(s.size() == buffer.size()); + } + + SECTION("no-op read") { + constexpr std::string_view buffer{}; + s.write(buffer); + CHECK(s.size() == 0); + } + + SECTION("seek and write") { + const std::size_t offset = 10; + s.seekp(std::int64_t(offset)); + constexpr std::string_view buffer{"test"}; + s.write(buffer); + CHECK(s.size() == buffer.size() + offset); + } +} + +template +static void write_and_compare(FileStream& s, const T& data) { + s.write(data); + s.flush(); + REQUIRE(s.size() == sizeof(T)); + CHECK(s.read() == data); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("HiC: filestream write binary", "[hic][filestream][short]") { + const auto 
tmpfile = testdir() / "filestream_write_binary.bin"; + std::filesystem::remove(tmpfile); + auto s = FileStream::create(tmpfile.string()); + + DISABLE_WARNING_PUSH + DISABLE_WARNING_USELESS_CAST + SECTION("uint8") { write_and_compare(s, std::uint8_t(162)); } + SECTION("uint16") { write_and_compare(s, std::uint16_t(42658)); } + SECTION("uint32") { write_and_compare(s, std::uint32_t(1433446050)); } + SECTION("uint64") { write_and_compare(s, std::uint64_t(18260117889181853346ULL)); } + + SECTION("int8") { write_and_compare(s, std::int8_t(-94)); } + SECTION("int16") { write_and_compare(s, std::int16_t(-22878)); } + SECTION("int32") { write_and_compare(s, std::int32_t(1433446050)); } + SECTION("int64") { write_and_compare(s, std::int64_t(-186626184527698270)); } + + SECTION("float") { write_and_compare(s, 16537405000000.0F); } + SECTION("double") { write_and_compare(s, -1.2758357206942371e+296); } + + SECTION("bool") { write_and_compare(s, false); } + SECTION("char") { write_and_compare(s, static_cast(162)); } + SECTION("unsigned char") { write_and_compare(s, static_cast(162)); } + DISABLE_WARNING_POP + + SECTION("vector") { + std::vector data{ + 67324752, 20, -1499332600, -126266000, 316472680, -71892991, 720898, + 926220316, 758592304, 2020879920, 156521844, 1067451136, 1101095797, 2020959093, + 67174411, 501, 5124, -1141015552, -1772542862, 787614245, 1386282978, + -1957338045, 1449544581, 1142046551, -518143477, -1249957234, 831590659, -732484307, + 1294996684, -1436898904, 1231094186, 1614771469}; + + s.write(data); + s.flush(); + REQUIRE(s.size() == sizeof(std::int32_t) * data.size()); + const auto buffer = s.read(data.size()); + REQUIRE(data.size() == buffer.size()); + for (std::size_t i = 0; i < data.size(); ++i) { + CHECK(data[i] == buffer[i]); + } + } +} + +} // namespace hictk::hic::test::filestream diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index c3c1c177..1f114d7b 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -34,15 +34,15 @@ TEST_CASE("HiC: utils is_hic_file", "[hic][short]") { TEST_CASE("HiC: file accessors", "[hic][short]") { File f(pathV8, 1'000); - CHECK(f.url() == pathV8); + CHECK(f.path() == pathV8); CHECK(f.name() == pathV8); CHECK(f.version() == 8); CHECK(f.chromosomes().size() == 9); CHECK(f.assembly() == "dm6"); CHECK(f.avail_resolutions().size() == 10); - CHECK(f.avail_resolutions().front() == 2'500'000); - CHECK(f.avail_resolutions().back() == 1000); + CHECK(f.avail_resolutions().front() == 1'000); + CHECK(f.avail_resolutions().back() == 2'500'000); CHECK(f.avail_normalizations().size() == 4); CHECK(f.avail_normalizations().front() == "KR"); @@ -122,8 +122,6 @@ TEST_CASE("HiC: fetch", "[hic][short]") { SECTION("malformed") { CHECK_THROWS(f.fetch(chrom2, chrom1, norm)); // NOLINT - CHECK_THROWS(File(pathV8, f.resolution(), MatrixType::expected, MatrixUnit::BP) - .fetch(chrom1, hictk::balancing::Method::VC())); // Matrix does not have contacts for fragments CHECK_THROWS( diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index e9356b9d..8d4b4d04 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -307,10 +307,6 @@ TEST_CASE("HiC: pixel selector fetch (observed NONE BP 10000)", "[hic][long]") { SECTION("invalid unit") { CHECK_THROWS(File(path, 10'000, MatrixType::observed, MatrixUnit::FRAG).fetch()); } - SECTION("expected + norm") { - const File hic(path, 10'000, MatrixType::expected, MatrixUnit::BP); - 
CHECK_THROWS(hic.fetch("chr2L", hictk::balancing::Method::VC())); - } } } } diff --git a/test/units/hic/utils_merge_test.cpp b/test/units/hic/utils_merge_test.cpp new file mode 100644 index 00000000..28773b35 --- /dev/null +++ b/test/units/hic/utils_merge_test.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "./tmpdir.hpp" +#include "hictk/hic.hpp" +#include "hictk/hic/utils.hpp" +#include "hictk/tmpdir.hpp" + +namespace hictk::hic::test::utils { + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("HiC: utils merge", "[merge][utils][long]") { + SECTION("merge gw") { + const auto src = datadir / "4DNFIZ1ZVXC8.hic9"; + const auto dest = testdir() / "hic_merge_test_001.hic"; + + const std::uint32_t resolution = 500'000; + const std::array sources{src.string(), src.string()}; + hic::utils::merge(sources.begin(), sources.end(), dest.string(), resolution, testdir(), true, + 1'000); + + const File f1(src.string(), resolution); + const File f2(dest.string(), resolution); + + const auto pixels1 = f1.fetch().read_all(); + const auto pixels2 = f2.fetch().read_all(); + + REQUIRE(pixels1.size() == pixels2.size()); + for (std::size_t i = 0; i < pixels1.size(); ++i) { + CHECK(pixels1[i].coords == pixels2[i].coords); + CHECK(pixels1[i].count * 2 == pixels2[i].count); + } + } // namespace hictk::hic::test::utils + + SECTION("merge chromosomes") { + const auto src = datadir / "4DNFIZ1ZVXC8.hic9"; + const auto dest = testdir() / "hic_merge_test_002.hic"; + std::vector sources{}; + const std::uint32_t resolution = 500'000; + { + spdlog::default_logger()->set_level(spdlog::level::warn); + const File f(src.string(), resolution); + + for (std::uint32_t chrom1_id = 0; chrom1_id < f.chromosomes().size(); ++chrom1_id) { + const auto& chrom1 = f.chromosomes().at(chrom1_id); + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < f.chromosomes().size(); ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + + const auto sel = f.fetch(chrom1.name(), chrom2.name()); + if (sel.empty()) { + continue; + } + + sources.emplace_back((testdir() / fmt::format(FMT_STRING("hic_merge_test_002.{}_{}.hic"), + chrom1.name(), chrom2.name())) + .string()); + + hic::internal::HiCFileWriter w(sources.back(), f.chromosomes(), {f.bin_size()}, "", 1, + 1'000, testdir()); + + w.add_pixels(resolution, sel.begin(), sel.end()); + w.serialize(); + } + } + } + + spdlog::default_logger()->set_level(spdlog::level::info); + hic::utils::merge(sources.begin(), sources.end(), dest.string(), resolution, testdir(), true, + 1'000); + + const File f1(src.string(), resolution); + const File f2(dest.string(), resolution); + + const auto pixels1 = f1.fetch().read_all(); + const auto pixels2 = f2.fetch().read_all(); + + REQUIRE(pixels1.size() == pixels2.size()); + for (std::size_t i = 0; i < pixels1.size(); ++i) { + CHECK(pixels1[i] == pixels2[i]); + } + } + + SECTION("merge - different reference") { + const auto src1 = datadir / "4DNFIZ1ZVXC8.hic9"; + const auto src2 = datadir / "ENCFF993FGR.2500000.hic"; + const auto dest = testdir() / "cooler_merge_test_003.cool"; + + const std::array sources{src1.string(), src2.string()}; + + CHECK_THROWS_WITH(hic::utils::merge(sources.begin(), sources.end(), dest.string(), 2'500'000), + Catch::Matchers::ContainsSubstring("use different reference genomes")); + } +} + +} // 
namespace hictk::hic::test::utils diff --git a/test/units/include/tmpdir.hpp b/test/units/include/tmpdir.hpp index 939951f2..506c6fde 100644 --- a/test/units/include/tmpdir.hpp +++ b/test/units/include/tmpdir.hpp @@ -51,4 +51,23 @@ inline const auto& testdir = hictk::test::testdir; inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace cooler::test::pixel_selector +namespace hic::test::filestream { +inline const auto& testdir = hictk::test::testdir; +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hic::test::filestream + +namespace hic::test::file_reader { +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hic::test::file_reader + +namespace hic::test::file_writer { +inline const auto& testdir = hictk::test::testdir; +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hic::test::file_writer + +namespace hic::test::utils { +inline const auto& testdir = hictk::test::testdir; +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hic::test::utils + } // namespace hictk diff --git a/utils/devel/test_docker_image.sh b/utils/devel/test_docker_image.sh index 524f821d..44e2c125 100755 --- a/utils/devel/test_docker_image.sh +++ b/utils/devel/test_docker_image.sh @@ -33,13 +33,10 @@ tar -xf test/data/hictk_test_data.tar.xz tmpdir="$(mktemp -d)" trap "rm -rf '$tmpdir'" EXIT -hic_tools_url='https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' -curl -L "$hic_tools_url" -o "$tmpdir/hic_tools.jar" - whereis -b hictk test/scripts/hictk_convert_hic2cool.sh "$(which hictk)" -test/scripts/hictk_convert_cool2hic.sh "$(which hictk)" "$tmpdir/hic_tools.jar" +test/scripts/hictk_convert_cool2hic.sh "$(which hictk)" EOM