Skip to content

Commit

Permalink
Merge pull request #102 from paulsengroup/feature/hic-write
Browse files Browse the repository at this point in the history
Add native support to write .hic files:
- Implement write operations on .hic files
- Implement zoomify operations on .hic files
- Implement balancing operations on .hic files
- Update hictk CLI tools to take advantage of the newly developed features
- Drop dependency on HiCTools/JuicerTools and the JVM
  • Loading branch information
robomics authored Jan 26, 2024
2 parents 6b087ea + 22494b4 commit a96d85e
Show file tree
Hide file tree
Showing 137 changed files with 7,577 additions and 2,304 deletions.
20 changes: 3 additions & 17 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ jobs:
--schedule-random \
--output-on-failure \
--no-tests=error \
--timeout 90 \
--timeout 240 \
-j $(nproc) |&
head -n 1000
Expand All @@ -174,26 +174,12 @@ jobs:
python3 -m pip install 'cooler>=0.9.2'
- name: Cache hic_tools
id: cache-hictools
uses: actions/cache@v4
with:
key: hic_tools
path: hic_tools.jar

- name: Download hic_tools
if: steps.cache-hictools.outputs.cache-hit != 'true'
run: |
apt-get install -y --no-install-recommends curl
curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \
-o hic_tools.jar
- name: Run integration tests
run: |
test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar
test/scripts/hictk_balance.sh build/src/hictk/hictk
test/scripts/hictk_convert_hic2cool.sh build/src/hictk/hictk
test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk hic_tools.jar
test/scripts/hictk_convert_cool2hic.sh build/src/hictk/hictk
test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk
test/scripts/hictk_dump_bins.sh build/src/hictk/hictk
Expand Down
19 changes: 3 additions & 16 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ jobs:
--schedule-random \
--output-on-failure \
--no-tests=error \
--timeout 180 2>&1 |
--timeout 240 2>&1 |
head -n 1000
run-integration-tests:
Expand Down Expand Up @@ -327,19 +327,6 @@ jobs:
- name: Install test dependencies
run: pip install -r requirements.txt

- name: Cache hic_tools
id: cache-hictools
uses: actions/cache@v4
with:
key: hic_tools
path: hic_tools.jar

- name: Download hic_tools
if: steps.cache-hictools.outputs.cache-hit != 'true'
run: |
curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \
-o hic_tools.jar
- name: Restore test dataset
uses: actions/cache/restore@v4
with:
Expand All @@ -364,15 +351,15 @@ jobs:
- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar
test/scripts/hictk_balance.sh bin/hictk
- name: Test hictk convert (hic2cool)
run: |
test/scripts/hictk_convert_hic2cool.sh bin/hictk
- name: Test hictk convert (cool2hic)
run: |
test/scripts/hictk_convert_cool2hic.sh bin/hictk hic_tools.jar
test/scripts/hictk_convert_cool2hic.sh bin/hictk
- name: Test hictk dump chroms
run: |
Expand Down
20 changes: 3 additions & 17 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ jobs:
--schedule-random \
--output-on-failure \
--no-tests=error \
--timeout 200 \
--timeout 360 \
-j $(nproc) |&
head -n 1000
Expand Down Expand Up @@ -383,20 +383,6 @@ jobs:
python3 -m pip install 'cooler>=0.9.2'
- name: Cache hic_tools
id: cache-hictools
uses: actions/cache@v4
with:
key: hic_tools
path: hic_tools.jar

- name: Download hic_tools
if: steps.cache-hictools.outputs.cache-hit != 'true'
run: |
apt-get install -y --no-install-recommends curl
curl -L 'https://github.com/aidenlab/HiCTools/releases/download/v3.30.00/hic_tools.3.30.00.jar' \
-o hic_tools.jar
- name: Restore test dataset
uses: actions/cache/restore@v4
with:
Expand All @@ -421,15 +407,15 @@ jobs:
- name: Test hictk balance
run: |
test/scripts/hictk_balance.sh bin/hictk hic_tools.jar
test/scripts/hictk_balance.sh bin/hictk
- name: Test hictk convert (hic2cool)
run: |
test/scripts/hictk_convert_hic2cool.sh bin/hictk
- name: Test hictk convert (cool2hic)
run: |
test/scripts/hictk_convert_cool2hic.sh bin/hictk hic_tools.jar
test/scripts/hictk_convert_cool2hic.sh bin/hictk
- name: Test hictk dump chroms
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/windows-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ jobs:
--schedule-random \
--output-on-failure \
--no-tests=error \
--timeout 180 \
--exclude-regex 'Cooler: dataset large read\/write.*' |&
--timeout 240 \
--exclude-regex '.*(dataset large read\/write|HiCFileWriter).*' |&
tail -n 1000
windows-ci-status-check:
Expand Down
8 changes: 0 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -113,14 +113,6 @@ RUN if [ -z "$BUILD_BASE_IMAGE" ]; then echo "Missing BUILD_BASE_IMAGE --build-a
&& if [ -z "$GIT_SHORT_HASH" ]; then echo "Missing GIT_SHORT_HASH --build-arg" && exit 1; fi \
&& if [ -z "$CREATION_DATE" ]; then echo "Missing CREATION_DATE --build-arg" && exit 1; fi

# Install runtime dependencies
RUN apt-get update \
&& apt-get install -y ca-certificates-java \
&& apt-get install -y \
openjdk-19-jre-headless \
pigz \
&& rm -rf /var/lib/apt/lists/*

# Export project binaries to the final build stage
COPY --from=builder "$staging_dir" "$install_dir"

Expand Down
4 changes: 2 additions & 2 deletions cmake/FetchTestDataset.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

# cmake-format: off
file(
DOWNLOAD https://zenodo.org/records/10289491/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=5e69dceb8789d923a38aed7add8fc18abfdfe531aea6effcdb7efe3c9bcf5246
DOWNLOAD https://zenodo.org/records/10552012/files/hictk_test_data.tar.xz?download=1
EXPECTED_HASH SHA256=f49c9fdd6e27f3d318cc0544510d89ed57ddd4d56b71ad32bb499895e94f396b
"${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz")
# cmake-format: on

Expand Down
1 change: 1 addition & 0 deletions conanfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
[requires]
boost/1.83.0#7825569cd0622461dec7bc87dfdf47ae
bshoshany-thread-pool/3.5.0#5aaea7ccf37107d3c1ba9ca9ab246e42
concurrentqueue/1.0.4#1e48e1c712bcfd892087c9c622a51502
cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7
eigen/3.4.0#2e192482a8acff96fe34766adca2b24c
fast_float/6.0.0#6c8a953c57879453982a6ce69c0ba87c
Expand Down
2 changes: 1 addition & 1 deletion docs/balancing_matrices.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ It should be noted that when using a large number of threads (e.g. more than 16)

When the ``--in-memory`` option is not used, ``hictk`` will create a temporary file under the default temporary folder. This file stores interactions using a layout and compression that are optimized for the access pattern used by ``hictk balance``. When balancing large matrices, this file can be quite large (sometimes tens of GBs). If this is the case, it may be appropriate to change the temporary folder using the ``--tmpdir`` option.

Finally, when balancing .hic files, ``hictk`` depends on `JuicerTools <https://github.com/aidenlab/Juicebox/releases/latest>`_ or `HiCTools <https://github.com/aidenlab/HiCTools/releases/latest>`_ to write balancing weights back to the file. Thus, when balancing .hic files, the JAR file to one of the tools should be specified through the ``--juicer-tools-jar`` option. You should use JuicerTools when balancing .hic files in .hic v8 format or older and HiCTools when balancing .hic v9 files.
Finally, when balancing .hic files, only .hic v9 files and newer are supported.
93 changes: 58 additions & 35 deletions docs/cli_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Subcommands
convert Convert HiC matrices to a different format.
dump Dump data from .hic and Cooler files to stdout.
fix-mcool Fix corrupted .mcool files.
load Build .cool files from interactions in various text formats.
merge Merge coolers.
load Build .cool and .hic files from interactions in various text formats.
merge Merge multiple Cooler or .hic files into a single file.
rename-chromosomes, rename-chroms
Rename chromosomes found in a Cooler file.
validate Validate .hic and Cooler files.
Expand Down Expand Up @@ -62,7 +62,10 @@ hictk balance
Maximum number of iterations.
--rescale-weights,--no-rescale-weights{false}
Rescale weights such that rows sum approximately to 2.
--name TEXT [weight] Name to use when writing weights to file.
--name TEXT Name to use when writing weights to file.
Defaults to ICE, INTER_ICE and GW_ICE when --mode is cis, trans and gw, respectively.
--create-weight-link Create a symbolic link to the balancing weights at clr::/bins/weight.
Ignored when balancing .hic files.
--in-memory Store all interactions in memory (greatly improves performance).
--stdout Write balancing weights to stdout instead of writing them to the input file.
--chunk-size UINT:POSITIVE [10000000]
Expand All @@ -71,12 +74,8 @@ hictk balance
Set verbosity of output to the console.
-t,--threads UINT:UINT in [1 - 16] [1]
Maximum number of parallel threads to spawn.
-l,--compression-level UINT:INT in [0 - 19] []
-l,--compression-lvl UINT:INT in [0 - 19] []
Compression level used to compress temporary files using ZSTD.
--juicer-tools-jar TEXT:FILE
Path to juicer_tools or hic_tools JAR.
--juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [256MB]
Max heap size used by juicer_tools.
-f,--force Overwrite existing files and datasets (if any).
hictk convert
Expand All @@ -98,8 +97,6 @@ hictk convert
- cool
- mcool
- hic
-j,--juicer-tools-jar TEXT:FILE
Path to juicer_tools or hic_tools JAR.
-r,--resolutions UINT:POSITIVE ...
One or more resolutions to be converted. By default all resolutions are converted.
--normalization-methods TEXT [ALL] ...
Expand All @@ -108,17 +105,17 @@ hictk convert
Pass NONE to avoid copying normalization vectors.
--fail-if-norm-not-found Fail if any of the requested normalization vectors are missing.
-g,--genome TEXT Genome assembly name. By default this is copied from the .hic file metadata.
--juicer-tools-memory UINT:SIZE [b, kb(=1000b), kib(=1024b), ...]:POSITIVE [32GB]
Max heap size used by juicer_tools. Only used when converting from cool to hic
--tmpdir TEXT Path where to store temporary files.
--chunk-size UINT:POSITIVE [10000000]
Batch size to use when converting .[m]cool to .hic.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
-t,--threads UINT:UINT in [2 - 16] [2]
Maximum number of parallel threads to spawn.
When converting from hic to cool, only two threads will be used.
-l,--compression-level UINT:INT in [0 - 9] []
Compression level used to compress temporary files.
Pass 0 to disable compression.
-l,--compression-lvl UINT:INT in [1 - 12] [6]
Compression level used to compress interactions.
Defaults to 6 and 10 for .cool and .hic files, respectively.
-f,--force Overwrite existing files (if any).
hictk dump
Expand All @@ -134,7 +131,7 @@ hictk dump
Options:
-h,--help Print this help message and exit
--resolution UINT:NONNEGATIVE
HiC matrix resolution (ignored when file is not in .hic format).
HiC matrix resolution (ignored when file is in .cool format).
--matrix-type ENUM:value in {expected->2,observed->0,oe->1} OR {2,0,1} [observed]
Matrix type (ignored when file is not in .hic format).
--matrix-unit ENUM:value in {BP->0,FRAG->1} OR {0,1} [BP]
Expand Down Expand Up @@ -181,7 +178,7 @@ hictk fix-mcool
Set verbosity of output to the console.
-t,--threads UINT:UINT in [1 - 16] [1]
Maximum number of parallel threads to spawn (only applies to the balancing stage).
-l,--compression-level UINT:INT in [0 - 19] []
-l,--compression-lvl UINT:INT in [0 - 19] []
Compression level used to compress temporary files using ZSTD (only applies to the balancing stage).
-f,--force Overwrite existing files (if any).
Expand All @@ -190,18 +187,18 @@ hictk load

.. code-block:: text
Build .cool files from interactions in various text formats.
Usage: hictk load [OPTIONS] chrom-sizes output-uri
Build .cool and .hic files from interactions in various text formats.
Usage: hictk load [OPTIONS] chrom-sizes output-path
Positionals:
chrom-sizes TEXT:FILE REQUIRED
Path to .chrom.sizes file.
output-uri TEXT REQUIRED Path to output Cooler (URI syntax supported).
output-path TEXT REQUIRED Path to output file.
Options:
-h,--help Print this help message and exit
-b,--bin-size UINT:POSITIVE Excludes: --bin-table
Bin size (bp).
Required when --bin-table is not used.
-t,--bin-table TEXT:FILE Excludes: --bin-size
--bin-table TEXT:FILE Excludes: --bin-size
Path to a BED3+ file with the bin table.
-f,--format TEXT:{4dn,validpairs,bg2,coo} REQUIRED
Input format.
Expand All @@ -214,29 +211,44 @@ hictk load
--count-as-float Interactions are floats.
--assume-sorted,--assume-unsorted{false}
Assume input files are already sorted.
--chunk-size UINT [10000000]
Number of pixels to buffer in memory.
-l,--compression-lvl UINT:INT bounded to [1 - 12]
Compression level used to compress interactions.
Defaults to 6 and 10 for .cool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
Maximum number of parallel threads to spawn.
When loading interactions into a .cool file, only a single thread will be used.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
--batch-size UINT [20000000]
Number of pixels to buffer in memory.
Only used when processing unsorted interactions or pairs.
hictk merge
-----------

.. code-block:: text
Merge coolers.
Usage: hictk merge [OPTIONS] input-coolers...
Merge multiple Cooler or .hic files into a single file.
Usage: hictk merge [OPTIONS] input-files...
Positionals:
input-coolers TEXT:Cooler x 2 REQUIRED
Path to two or more Cooler files to be merged (URI syntax supported).
input-files TEXT:(Cooler) OR (HiC) x 2 REQUIRED
Path to two or more Cooler or .hic files to be merged (Cooler URI syntax supported).
Options:
-h,--help Print this help message and exit
-o,--output-cooler TEXT Output Cooler (URI syntax supported).
When not specified, merged interactions will be printed to stdout.
-f,--force Force overwrite output cooler.
--chunk-size UINT [5000000]
-o,--output-file TEXT REQUIRED
Output Cooler or .hic file (Cooler URI syntax supported).
--resolution UINT:NONNEGATIVE
HiC matrix resolution (ignored when input files are in .cool format).
-f,--force Force overwrite output file.
--chunk-size UINT [10000000]
Number of pixels to store in memory before writing to disk.
-l,--compression-lvl UINT:INT bounded to [1 - 12]
Compression level used to compress interactions.
Defaults to 6 and 10 for .cool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
Maximum number of parallel threads to spawn.
When merging interactions in Cooler format, only a single thread will be used.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
Expand Down Expand Up @@ -282,10 +294,11 @@ hictk zoomify
.. code-block:: text
Convert single-resolution Cooler and .hic files to multi-resolution by coarsening.
Usage: hictk zoomify [OPTIONS] cooler [mcool]
Usage: hictk zoomify [OPTIONS] cooler/hic mcool/hic
Positionals:
cooler TEXT:Cooler REQUIRED Path to a .cool file (Cooler URI syntax supported).
mcool TEXT Output path.
cooler/hic TEXT:(Cooler) OR (HiC) REQUIRED
Path to a .cool or .hic file (Cooler URI syntax supported).
mcool/hic TEXT REQUIRED Output path.
Options:
-h,--help Print this help message and exit
--force Force overwrite existing output file(s).
Expand All @@ -298,5 +311,15 @@ hictk zoomify
Base resolution: 1000
Pow2: 1000, 2000, 4000, 8000...
Nice: 1000, 2000, 5000, 10000...
-l,--compression-lvl UINT:INT bounded to [1 - 12] [6]
Compression level used to compress interactions.
Defaults to 6 and 12 for .mcool and .hic files, respectively.
-t,--threads UINT:UINT in [1 - 16] [1]
Maximum number of parallel threads to spawn.
When zoomifying interactions from a .cool file, only a single thread will be used.
--chunk-size UINT [10000000]
Number of pixels to buffer in memory.
Only used when zoomifying .hic files.
--tmpdir TEXT [/tmp] Path to a folder where to store temporary data.
-v,--verbosity UINT:INT in [1 - 4] []
Set verbosity of output to the console.
Loading

0 comments on commit a96d85e

Please sign in to comment.