From 46af0f7ff8a4a64a99c03425340e7b78e245aaf3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Sep 2023 22:42:56 +0200 Subject: [PATCH 1/6] Update hictk dump to support dumping resolutions, cells and normalizations --- src/hictk/cli/cli_dump.cpp | 50 +++--- src/hictk/cli/cli_validate.cpp | 4 +- src/hictk/dump/dump.cpp | 166 +++++++++++++++--- src/hictk/include/hictk/tools/cli.hpp | 33 +++- src/libhictk/cooler/include/hictk/cooler.hpp | 1 + .../cooler/include/hictk/cooler/cooler.hpp | 3 +- .../hictk/cooler/impl/file_accessors_impl.hpp | 15 ++ .../hictk/cooler/impl/file_read_impl.hpp | 4 +- .../cooler/impl/singlecell_cooler_impl.hpp | 1 + .../hictk/cooler/singlecell_cooler.hpp | 1 + src/libhictk/file/include/hictk/file.hpp | 2 + .../file/include/hictk/impl/file_impl.hpp | 4 + 12 files changed, 234 insertions(+), 50 deletions(-) diff --git a/src/hictk/cli/cli_dump.cpp b/src/hictk/cli/cli_dump.cpp index 37385f24..923b0fb6 100644 --- a/src/hictk/cli/cli_dump.cpp +++ b/src/hictk/cli/cli_dump.cpp @@ -30,7 +30,10 @@ void Cli::make_dump_subcommand() { "uri", c.uri, "Path to a .hic, .cool or .mcool file (Cooler URI syntax supported).") - ->check(IsValidHiCFile | IsValidCoolerFile) + ->check(IsValidHiCFile | + IsValidCoolerFile | + IsValidMultiresCoolerFile | + IsValidSingleCellCoolerFile) ->required(); sc.add_option( @@ -57,7 +60,8 @@ void Cli::make_dump_subcommand() { "-t,--table", c.table, "Name of the table to dump.\n") - ->check(CLI::IsMember({"chroms", "bins", "pixels"})) + ->check(CLI::IsMember({"chroms", "bins", "pixels", "normalizations", + "resolutions", "cells"})) ->capture_default_str(); sc.add_option( @@ -106,6 +110,7 @@ void Cli::make_dump_subcommand() { // clang-format on + sc.get_option("--range2")->needs(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range2")); @@ -119,38 +124,40 @@ void Cli::validate_dump_subcommand() const { std::vector errors; const auto& c = std::get(_config); - if (!errors.empty()) { - throw std::runtime_error( - fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " - "arguments and input file(s):\n - {}"), - fmt::join(errors, "\n - "))); - } + const auto& subcmd = *_cli.get_subcommand("dump"); const auto is_hic = hic::utils::is_hic_file(c.uri); const auto is_cooler = cooler::utils::is_cooler(c.uri); const auto is_mcooler = cooler::utils::is_multires_file(c.uri); + const auto is_scool = cooler::utils::is_scool_file(c.uri); - if (is_hic && c.resolution == 0 && c.table != "chroms") { - errors.emplace_back("--resolution is mandatory when file is in .hic format."); + if ((is_hic || is_mcooler) && c.resolution == 0 && (c.table == "pixels" || c.table == "bins")) { + errors.emplace_back("--resolution is mandatory when file is in .hic or .mcool format."); } - const auto resolution_parsed = !_cli.get_subcommand("dump")->get_option("--resolution")->empty(); + const auto resolution_parsed = !subcmd.get_option("--resolution")->empty(); - if ((is_cooler || is_mcooler) && resolution_parsed) { - warnings.emplace_back("--resolution is ignored when file is in .cool or .mcool format."); + if ((is_cooler || is_scool) && resolution_parsed) { + warnings.emplace_back("--resolution is ignored when file is in .[s]cool format."); } - const auto weight_type_parsed = - !_cli.get_subcommand("dump")->get_option("--weight-type")->empty(); + const auto weight_type_parsed = !subcmd.get_option("--weight-type")->empty(); if (is_hic && weight_type_parsed) { warnings.emplace_back("--weight-type is ignored when file is in .hic format."); } - const auto matrix_type_parsed = - !_cli.get_subcommand("dump")->get_option("--matrix-type")->empty(); - const auto matrix_unit_parsed = - !_cli.get_subcommand("dump")->get_option("--matrix-unit")->empty(); + const auto range_parsed = !subcmd.get_option("--range")->empty(); + if (range_parsed && c.table != "bins" && c.table != "pixels") { + warnings.emplace_back("--range and --range2 are ignore when --table is not bins or pixels"); + } + const auto query_file_parsed = !subcmd.get_option("--query-file")->empty(); + if (query_file_parsed && c.table != "bins" && c.table != "pixels") { + warnings.emplace_back("--query-file is ignored when --table is not bins or pixels"); + } + + const auto matrix_type_parsed = !subcmd.get_option("--matrix-type")->empty(); + const auto matrix_unit_parsed = !subcmd.get_option("--matrix-unit")->empty(); if (!is_hic && (matrix_type_parsed || matrix_unit_parsed)) { warnings.emplace_back( @@ -181,9 +188,10 @@ void Cli::transform_args_dump_subcommand() { c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; c.format = infer_input_format(c.uri); - if (c.format == "hic" && c.resolution == 0) { - assert(c.table == "chroms"); + if (c.format == "hic" && c.resolution == 0 && c.table == "chroms") { c.resolution = hic::utils::list_resolutions(c.uri).back(); + } else if (c.format == "mcool" && c.resolution == 0 && c.table == "chroms") { + c.resolution = cooler::utils::list_resolutions(c.uri).back(); } if (_cli.get_subcommand("dump")->get_option("--range2")->empty()) { diff --git a/src/hictk/cli/cli_validate.cpp b/src/hictk/cli/cli_validate.cpp index 5beef57f..ecad1d9d 100644 --- a/src/hictk/cli/cli_validate.cpp +++ b/src/hictk/cli/cli_validate.cpp @@ -1,6 +1,6 @@ +// Copyright (C) 2023 Roberto Rossini // -// Created by roby on 7/13/23. -// +// SPDX-License-Identifier: MIT #include #include diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index fb74529b..014beb35 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -5,6 +5,7 @@ #include #include "hictk/balancing/methods.hpp" +#include "hictk/cooler.hpp" #include "hictk/file.hpp" #include "hictk/tools/config.hpp" #include "hictk/transformers.hpp" @@ -18,23 +19,6 @@ static void print(const ThinPixel& pixel) { fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count); } -static void dump_chroms(const File& f, std::string_view range) { - if (range == "all") { - for (const Chromosome& chrom : f.chromosomes()) { - if (!chrom.is_all()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); - } - } - return; - } - - const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range}); - auto it = f.chromosomes().find(coords.chrom()); - if (it != f.chromosomes().end()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), it->name(), it->size()); - } -} - template static void dump_bins(const File& f, std::string_view range) { if (range == "all") { @@ -127,9 +111,6 @@ static void dump_pixels(hic::File& f, std::string_view range1, std::string_view static void process_query(File& f, std::string_view table, std::string_view range1, std::string_view range2, std::string_view normalization, bool join, bool sorted) { - if (table == "chroms") { - return dump_chroms(f, range1); - } if (table == "bins") { return dump_bins(f, range1); } @@ -139,7 +120,128 @@ static void process_query(File& f, std::string_view table, std::string_view rang f.get()); } -int dump_subcmd(const DumpConfig& c) { +static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) { + Reference ref{}; + + if (format == "mcool") { + ref = cooler::MultiResFile{std::string{uri}}.chromosomes(); + } else if (format == "scool") { + ref = cooler::SingleCellFile{std::string{uri}}.chromosomes(); + } else { + ref = File{std::string{uri}, resolution}.chromosomes(); + } + + for (const Chromosome& chrom : ref) { + if (!chrom.is_all()) { + fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); + } + } + return 0; +} + +static phmap::btree_set get_normalizations(std::string_view uri, + std::string_view format, + std::uint32_t resolution) { + assert(format != "mcool"); + assert(format != "hic" || resolution != 0); + if (format == "scool") { + const auto cell_ids = cooler::SingleCellFile{uri}.cells(); + if (cell_ids.empty()) { + return {}; + } + + const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin()); + return get_normalizations(scool_uri, "cool", 0); + } + + phmap::btree_set norms{}; + if (uri == "hic" && resolution == 0) { + const hic::File hf{std::string{uri}, resolution}; + + for (const auto& norm : hf.avail_normalizations()) { + norms.emplace(std::string{norm.to_string()}); + } + return norms; + } + + const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations(); + std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()), + [](const auto& n) { return std::string{n.to_string()}; }); + + return norms; +} + +static int dump_normalizations(std::string_view uri, std::string_view format, + std::uint32_t resolution) { + phmap::btree_set norms{}; + std::vector resolutions{}; + if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + if (resolutions.empty()) { + return 0; + } + } else if (format == "hic" && resolution == 0) { + resolutions = hic::utils::list_resolutions(std::string{uri}); + if (resolutions.empty()) { + return 0; + } + } + + if (resolutions.empty()) { + norms = get_normalizations(uri, format, resolution); + } else { + format = format == "hic" ? "hic" : "cool"; + std::for_each(resolutions.begin(), resolutions.end(), + [&](const auto res) { norms.merge(get_normalizations(uri, format, res)); }); + } + + if (!norms.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n")); + } + return 0; +} + +static int dump_resolutions(std::string_view uri, std::string_view format, + std::uint32_t resolution) { + std::vector resolutions{}; + + if (format == "hic") { + resolutions = hic::utils::list_resolutions(uri); + if (resolution != 0) { + const auto res_found = + std::find(resolutions.begin(), resolutions.end(), resolution) != resolutions.end(); + resolutions.clear(); + if (res_found) { + resolutions.push_back(resolution); + } + } + } else if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + } else if (format == "scool") { + resolutions.push_back(cooler::SingleCellFile{uri}.bin_size()); + } else { + assert(format == "cool"); + resolutions.push_back(cooler::File{uri}.bin_size()); + } + + if (!resolutions.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(resolutions, "\n")); + } + return 0; +} + +static int dump_cells(std::string_view uri, std::string_view format) { + if (format != "scool") { + throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri)); + } + const auto cells = cooler::SingleCellFile{uri}.cells(); + if (!cells.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(cells, "\n")); + } + return 0; +} + +static int dump_tables(const DumpConfig& c) { hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit}; if (c.query_file.empty()) { @@ -164,4 +266,26 @@ int dump_subcmd(const DumpConfig& c) { return 0; } + +int dump_subcmd(const DumpConfig& c) { + if (c.table == "bins" || c.table == "pixels") { + return dump_tables(c); + } + + if (c.table == "chroms") { + return dump_chroms(c.uri, c.format, c.resolution); + } + + if (c.table == "resolutions") { + return dump_resolutions(c.uri, c.format, c.resolution); + } + + if (c.table == "normalizations") { + return dump_normalizations(c.uri, c.format, c.resolution); + } + + assert(c.table == "cells"); + + return dump_cells(c.uri, c.format); +} } // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 75eb97e3..6170ead7 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -10,6 +10,7 @@ #include #include "config.hpp" +#include "hictk/cooler.hpp" #include "hictk/cooler/utils.hpp" #include "hictk/hic/utils.hpp" @@ -23,6 +24,9 @@ class CoolerFileValidator : public CLI::Validator { if (hictk::cooler::utils::is_multires_file(uri)) { return "URI points to a .mcool file: " + uri; } + if (hictk::cooler::utils::is_scool_file(uri)) { + return "URI points to a .scool file: " + uri; + } const auto path = cooler::parse_cooler_uri(uri).file_path; if (!std::filesystem::exists(path)) { return "No such file: " + path; @@ -50,6 +54,22 @@ class MultiresCoolerFileValidator : public CLI::Validator { } }; +class SingleCellCoolerFileValidator : public CLI::Validator { + public: + inline SingleCellCoolerFileValidator() : Validator("Single-cell-cooler") { + func_ = [](std::string& uri) -> std::string { + const auto path = cooler::parse_cooler_uri(uri).file_path; + if (!std::filesystem::exists(path)) { + return "No such file: " + path; + } + if (!hictk::cooler::utils::is_scool_file(uri)) { + return "Not a valid single-cell cooler: " + uri; + } + return ""; + }; + } +}; + class HiCFileValidator : public CLI::Validator { public: inline HiCFileValidator() : Validator("HiC") { @@ -145,9 +165,10 @@ class Formatter : public CLI::Formatter { }; // clang-format off - inline const auto IsValidCoolerFile = CoolerFileValidator(); // NOLINT(cert-err58-cpp) - inline const auto IsValidMultiresCoolerFile = MultiresCoolerFileValidator(); // NOLINT(cert-err58-cpp) - inline const auto IsValidHiCFile = HiCFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidCoolerFile = CoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidMultiresCoolerFile = MultiresCoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidSingleCellCoolerFile = SingleCellCoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidHiCFile = HiCFileValidator(); // NOLINT(cert-err58-cpp) // clang-format on // clang-format off @@ -225,6 +246,9 @@ class Cli { if (cooler::utils::is_multires_file(p.string())) { return "mcool"; } + if (cooler::utils::is_scool_file(p.string())) { + return "scool"; + } assert(hic::utils::is_hic_file(p)); return "hic"; } @@ -250,6 +274,9 @@ class Cli { if (format == "cool") { return {cooler::File(p.string()).bin_size()}; } + if (format == "scool") { + return {cooler::SingleCellFile{p.string()}.bin_size()}; + } if (format == "mcool") { return cooler::utils::list_resolutions(p, true); } diff --git a/src/libhictk/cooler/include/hictk/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler.hpp index c363e068..c29e0f2a 100644 --- a/src/libhictk/cooler/include/hictk/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler.hpp @@ -6,3 +6,4 @@ #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/multires_cooler.hpp" +#include "hictk/cooler/singlecell_cooler.hpp" diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 9737b07a..104b19f5 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -250,7 +250,8 @@ class File { balancing::Weights::Type type, bool rescale = false) const; - bool has_weights(const balancing::Method &normalization) const; + [[nodiscard]] std::vector avail_normalizations() const; + [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; std::shared_ptr read_weights(const balancing::Method &normalization, bool rescale = false) const; std::shared_ptr read_weights(const balancing::Method &normalization, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index 990f77e6..d0ca7987 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -87,6 +87,21 @@ inline auto File::dataset(std::string_view dataset_name) const -> const Dataset } } +inline std::vector File::avail_normalizations() const { + const phmap::flat_hash_set bin_table_dsets{"chrom", "start", "end"}; + + std::vector norms{}; + for (const auto &dset : group("bins")().listObjectNames(HighFive::IndexType::NAME)) { + if (bin_table_dsets.contains(dset)) { + continue; + } + + norms.emplace_back(balancing::Method{dset}); + } + + return norms; +} + inline const hictk::internal::NumericVariant &File::pixel_variant() const noexcept { return _pixel_variant; } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index 795eec54..819bb657 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -227,7 +227,7 @@ inline PixelSelector File::fetch(PixelCoordinates coord1, PixelCoordinates coord } inline bool File::has_weights(std::string_view normalization) const { - return has_weights(balancing::Method{normalization}); + return has_normalization(balancing::Method{normalization}); } inline std::shared_ptr File::read_weights(std::string_view normalization, bool rescale) const { @@ -239,7 +239,7 @@ inline std::shared_ptr File::read_weights(std::string_ return read_weights(balancing::Method{normalization}, type, rescale); } -inline bool File::has_weights(const balancing::Method &normalization) const { +inline bool File::has_normalization(const balancing::Method &normalization) const { const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), normalization.to_string()); if (_weights.contains(dset_path)) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index a4ae20f3..7d116960 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -163,6 +163,7 @@ inline SingleCellFile::operator bool() const noexcept { return !!_root_grp; } inline std::string SingleCellFile::path() const { return (*_root_grp)().getFile().getName(); } inline auto SingleCellFile::bins() const noexcept -> const BinTable& { return *_bins; } +inline std::uint32_t SingleCellFile::bin_size() const noexcept { return bins().bin_size(); } inline auto SingleCellFile::chromosomes() const noexcept -> const Reference& { return bins().chromosomes(); } diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 93e68863..86228e5b 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -74,6 +74,7 @@ class SingleCellFile { [[nodiscard]] std::string path() const; [[nodiscard]] auto chromosomes() const noexcept -> const Reference&; [[nodiscard]] auto bins() const noexcept -> const BinTable&; + [[nodiscard]] std::uint32_t bin_size() const noexcept; template File aggregate(std::string_view uri, bool overwrite_if_exists = false, diff --git a/src/libhictk/file/include/hictk/file.hpp b/src/libhictk/file/include/hictk/file.hpp index 36daf89a..c8e05983 100644 --- a/src/libhictk/file/include/hictk/file.hpp +++ b/src/libhictk/file/include/hictk/file.hpp @@ -132,6 +132,8 @@ class File { std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + [[nodiscard]] std::vector avail_normalizations() const; + template [[nodiscard]] constexpr const FileT &get() const noexcept; template diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index 0c29474d..e8adf527 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -277,6 +277,10 @@ inline PixelSelector File::fetch(std::string_view chrom1_name, std::uint32_t sta _fp); } +inline std::vector File::avail_normalizations() const { + return std::visit([](const auto& fp) { return fp.avail_normalizations(); }, _fp); +} + template constexpr const FileT& File::get() const noexcept { return std::get(_fp); From 7243f42ffacbda4935d99896fdd71dc183af04b8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:00:03 +0200 Subject: [PATCH 2/6] Add integration tests for new hictk dump tables --- .github/workflows/codecov.yml | 4 + .github/workflows/macos-ci.yml | 12 ++ .github/workflows/ubuntu-ci.yml | 12 ++ test/scripts/hictk_dump_cells.sh | 119 ++++++++++++++++++++ test/scripts/hictk_dump_normalizations.sh | 127 ++++++++++++++++++++++ test/scripts/hictk_dump_resolutions.sh | 114 +++++++++++++++++++ 6 files changed, 388 insertions(+) create mode 100755 test/scripts/hictk_dump_cells.sh create mode 100755 test/scripts/hictk_dump_normalizations.sh create mode 100755 test/scripts/hictk_dump_resolutions.sh diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 5490c6b0..72536ba7 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -192,6 +192,10 @@ jobs: run: | test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk + test/scripts/hictk_resolutions.sh build/src/hictk/hictk + test/scripts/hictk_normalizations.sh build/src/hictk/hictk + test/scripts/hictk_cells.sh build/src/hictk/hictk + test/scripts/hictk_dump_gw.sh build/src/hictk/hictk test/scripts/hictk_dump_cis.sh build/src/hictk/hictk test/scripts/hictk_dump_trans.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index 82cd09fe..c7202bc3 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -370,6 +370,18 @@ jobs: run: | test/scripts/hictk_dump_bins.sh bin/hictk + - name: Test hictk dump resolutions + run: | + test/scripts/hictk_resolutions.sh bin/hictk + + - name: Test hictk dump normalizations + run: | + test/scripts/hictk_normalizations.sh bin/hictk + + - name: Test hictk dump cells + run: | + test/scripts/hictk_cells.sh bin/hictk + - name: Test hictk dump genome-wide run: | test/scripts/hictk_dump_gw.sh bin/hictk diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 35dcfc05..35169032 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -423,6 +423,18 @@ jobs: run: | test/scripts/hictk_dump_bins.sh bin/hictk + - name: Test hictk dump resolutions + run: | + test/scripts/hictk_resolutions.sh bin/hictk + + - name: Test hictk dump normalizations + run: | + test/scripts/hictk_normalizations.sh bin/hictk + + - name: Test hictk dump cells + run: | + test/scripts/hictk_cells.sh bin/hictk + - name: Test hictk dump genome-wide run: | test/scripts/hictk_dump_gw.sh bin/hictk diff --git a/test/scripts/hictk_dump_cells.sh b/test/scripts/hictk_dump_cells.sh new file mode 100755 index 00000000..d3992362 --- /dev/null +++ b/test/scripts/hictk_dump_cells.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "###########################" +echo "#### hictk dump (cells) ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +function compare_files { + set -o pipefail + set -e + + 2>&1 echo "Comparing $1 with $2..." + if diff "$1" "$2"; then + 2>&1 echo "Files are identical" + return 0 + else + 2>&1 echo "Files differ" + return 1 + fi +} + +export function readlink_py + +status=0 + +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" + status=1 +fi + +hictk_bin="$1" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +mclr="$data_dir/integration_tests/4DNFIZ1ZVXC8.mcool" +sclr="$data_dir/cooler/single_cell_cooler_test_file.scool" +hic="$data_dir/hic/4DNFIZ1ZVXC8.hic8" + +expected_cells=( + GSM2687248_41669_ACAGTG-R1-DpnII.100000.cool + GSM2687249_41670_GGCTAC-R1-DpnII.100000.cool + GSM2687250_41671_TTAGGC-R1-DpnII.100000.cool + GSM2687251_41672_AGTTCC-R1-DpnII.100000.cool + GSM2687252_41673_CCGTCC-R1-DpnII.100000.cool +) + +export PATH="$PATH:$script_dir" + +if ! check_files_exist "$mclr" "$sclr" "$hic"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +printf "%s\n" "${expected_cells[@]}" > "$outdir/expected.txt" + + +"$hictk_bin" dump -t cells "$sclr" > "$outdir/scool.cells.txt" + +if ! compare_files "$outdir/expected.txt" "$outdir/scool.cells.txt"; then + status=1 +fi + +if ! "$hictk_bin" dump -t cells "$mclr" &> /dev/null; then + 2>&1 echo "hictk dump -t resolution $mclr: OK" +else + 2>&1 echo "hictk dump -t resolution $mclr: FAIL" + status=1 +fi + +if ! "$hictk_bin" dump -t cells "$hic" &> /dev/null; then + 2>&1 echo "hictk dump -t resolution $hic: OK" +else + 2>&1 echo "hictk dump -t resolution $hic: FAIL" + status=1 +fi + +if ! "$hictk_bin" dump -t cells "$hic" --resolution 100000 &> /dev/null; then + 2>&1 echo "hictk dump -t resolution $hic --resolution 1000000: OK" +else + 2>&1 echo "hictk dump -t resolution $hic --resolution 1000000: FAIL" + status=1 +fi + + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" diff --git a/test/scripts/hictk_dump_normalizations.sh b/test/scripts/hictk_dump_normalizations.sh new file mode 100755 index 00000000..09be2028 --- /dev/null +++ b/test/scripts/hictk_dump_normalizations.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "###########################" +echo "#### hictk dump (normalizations) ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +function compare_files { + set -o pipefail + set -e + + 2>&1 echo "Comparing $1 with $2..." + if diff "$1" "$2"; then + 2>&1 echo "Files are identical" + return 0 + else + 2>&1 echo "Files differ" + return 1 + fi +} + +export function readlink_py + +status=0 + +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" + status=1 +fi + +hictk_bin="$1" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +mclr="$data_dir/integration_tests/4DNFIZ1ZVXC8.mcool" +sclr="$data_dir/cooler/single_cell_cooler_test_file.scool" +hic="$data_dir/hic/4DNFIZ1ZVXC8.hic8" + +expected_norms_hic=( + KR + SCALE + VC + VC_SQRT +) + +expected_norms_cooler=( + KR + SCALE + VC + VC_SQRT + weight +) + +export PATH="$PATH:$script_dir" + +if ! check_files_exist "$mclr" "$sclr" "$hic"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +printf "%s\n" "${expected_norms_hic[@]}" > "$outdir/expected.hic.txt" +printf "%s\n" "${expected_norms_cooler[@]}" > "$outdir/expected.cool.txt" + +"$hictk_bin" dump -t normalizations "$mclr" > "$outdir/mcool.norms.txt" +"$hictk_bin" dump -t normalizations "$sclr" > "$outdir/scool.norms.empty.txt" +"$hictk_bin" dump -t normalizations "$mclr::/resolutions/100000" > "$outdir/cool.norms.txt" + +"$hictk_bin" dump -t normalizations "$hic" > "$outdir/hic.norms.txt" +"$hictk_bin" dump -t normalizations "$hic" --resolution 100000 > "$outdir/hic.norms.txt" + +for f in "$outdir/"*cool*.norms.txt; do + if ! compare_files "$outdir/expected.cool.txt" "$f"; then + status=1 + fi +done + +for f in "$outdir/"*hic*.norms.txt; do + if ! compare_files "$outdir/expected.hic.txt" "$f"; then + status=1 + fi +done + +for f in "$outdir/"*.norms.empty.txt; do + 2>&1 echo "Checking that $f is empty..." + if [ -s "$f" ]; then + 2>&1 echo "File is NOT empty" + status=1 + else + 2>&1 echo "File is empty" + fi +done + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" diff --git a/test/scripts/hictk_dump_resolutions.sh b/test/scripts/hictk_dump_resolutions.sh new file mode 100755 index 00000000..559702b0 --- /dev/null +++ b/test/scripts/hictk_dump_resolutions.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "###########################" +echo "#### hictk dump (resolutions) ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +function compare_files { + set -o pipefail + set -e + + 2>&1 echo "Comparing $1 with $2..." + if diff "$1" "$2"; then + 2>&1 echo "Files are identical" + return 0 + else + 2>&1 echo "Files differ" + return 1 + fi +} + +export function readlink_py + +status=0 + +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" + status=1 +fi + +hictk_bin="$1" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +mclr="$data_dir/integration_tests/4DNFIZ1ZVXC8.mcool" +sclr="$data_dir/cooler/single_cell_cooler_test_file.scool" +hic="$data_dir/hic/4DNFIZ1ZVXC8.hic8" + +expected_res=( + 1000 + 5000 + 10000 + 25000 + 50000 + 100000 + 250000 + 500000 + 1000000 + 2500000 +) + +export PATH="$PATH:$script_dir" + +if ! check_files_exist "$mclr" "$sclr" "$hic"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +printf "%s\n" "${expected_res[@]}" > "$outdir/expected.txt" +printf "%d\n" "100000" > "$outdir/expected.100000.txt" +"$hictk_bin" dump -t resolutions "$mclr" > "$outdir/mcool.res.txt" +"$hictk_bin" dump -t resolutions "$sclr" > "$outdir/scool.res.100000.txt" +"$hictk_bin" dump -t resolutions "$mclr::/resolutions/100000" > "$outdir/cool.res.100000.txt" +"$hictk_bin" dump -t resolutions "$hic" > "$outdir/hic.res.txt" +"$hictk_bin" dump -t resolutions "$hic" --resolution 100000 > "$outdir/hic.res.100000.txt" + + +for f in "$outdir/"*.res.txt; do + if ! compare_files "$outdir/expected.txt" "$f"; then + status=1 + fi +done + +for f in "$outdir/"*.res.100000.txt; do + if ! compare_files "$outdir/expected.100000.txt" "$f"; then + status=1 + fi +done + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" From 193f6f7a56f625d2d24b22d8ee4b8efd7b6aa716 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:12:53 +0200 Subject: [PATCH 3/6] Add unit tests --- test/units/cooler/file_weights_test.cpp | 21 ++++++++++++++++++--- test/units/hic/hic_file_test.cpp | 4 ++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/test/units/cooler/file_weights_test.cpp b/test/units/cooler/file_weights_test.cpp index 0c7e684a..5218c0a8 100644 --- a/test/units/cooler/file_weights_test.cpp +++ b/test/units/cooler/file_weights_test.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: MIT -#include - #include #include #include @@ -14,9 +12,26 @@ namespace hictk::cooler::test::cooler_file { +TEST_CASE("Cooler: read weights", "[cooler][short]") { + const auto path1 = datadir / "cooler_test_file.cool"; + const auto path2 = datadir / "ENCFF993FGR.2500000.cool"; + + const cooler::File clr1{path1.string()}; + const cooler::File clr2{path2.string()}; + + SECTION("wo/ weights") { CHECK(clr1.avail_normalizations().empty()); } + SECTION("w/ weights") { + CHECK(clr2.avail_normalizations().size() == 6); + CHECK(clr2.has_weights("SCALE")); + CHECK(!clr2.has_weights("FOOBAR")); + + CHECK(clr2.read_weights("SCALE")->type() == hictk::balancing::Weights::Type::DIVISIVE); + } +} + // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Cooler: write weights", "[cooler][short]") { - auto path1 = datadir / "cooler_test_file.cool"; + const auto path1 = datadir / "cooler_test_file.cool"; auto path2 = testdir() / "cooler_test_write_weights1.cool"; auto path3 = testdir() / "cooler_test_write_weights2.cool"; diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 79e992c9..099728be 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -42,6 +42,10 @@ TEST_CASE("HiC: file accessors", "[hic][short]") { CHECK(f.avail_resolutions().front() == 2'500'000); CHECK(f.avail_resolutions().back() == 1000); + CHECK(f.avail_normalizations().size() == 4); + CHECK(f.avail_normalizations().front() == "VC_SQRT"); + CHECK(f.avail_normalizations().back() == "VC"); + CHECK(f.open(2'500'000).resolution() == 2'500'000); SECTION("invalid") { From 491de189dc02309665b8a4750ae8ec6e4c6a5c9c Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 08:16:18 +0200 Subject: [PATCH 4/6] Fix typo --- .github/workflows/codecov.yml | 6 +++--- .github/workflows/macos-ci.yml | 6 +++--- .github/workflows/ubuntu-ci.yml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 72536ba7..e4ef6830 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -192,9 +192,9 @@ jobs: run: | test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk - test/scripts/hictk_resolutions.sh build/src/hictk/hictk - test/scripts/hictk_normalizations.sh build/src/hictk/hictk - test/scripts/hictk_cells.sh build/src/hictk/hictk + test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk + test/scripts/hictk_dump_normalizations.sh build/src/hictk/hictk + test/scripts/hictk_dump_cells.sh build/src/hictk/hictk test/scripts/hictk_dump_gw.sh build/src/hictk/hictk test/scripts/hictk_dump_cis.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index c7202bc3..baf850eb 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -372,15 +372,15 @@ jobs: - name: Test hictk dump resolutions run: | - test/scripts/hictk_resolutions.sh bin/hictk + test/scripts/hictk_dump_resolutions.sh bin/hictk - name: Test hictk dump normalizations run: | - test/scripts/hictk_normalizations.sh bin/hictk + test/scripts/hictk_dump_normalizations.sh bin/hictk - name: Test hictk dump cells run: | - test/scripts/hictk_cells.sh bin/hictk + test/scripts/hictk_dump_cells.sh bin/hictk - name: Test hictk dump genome-wide run: | diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 35169032..19bf42eb 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -425,15 +425,15 @@ jobs: - name: Test hictk dump resolutions run: | - test/scripts/hictk_resolutions.sh bin/hictk + test/scripts/hictk_dump_resolutions.sh bin/hictk - name: Test hictk dump normalizations run: | - test/scripts/hictk_normalizations.sh bin/hictk + test/scripts/hictk_dump_normalizations.sh bin/hictk - name: Test hictk dump cells run: | - test/scripts/hictk_cells.sh bin/hictk + test/scripts/hictk_dump_cells.sh bin/hictk - name: Test hictk dump genome-wide run: | From 7c05341123b50a211ffaa73e771dbaa1dcfedfe7 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 09:13:40 +0200 Subject: [PATCH 5/6] Make order of names returned by avail_normalizations() deterministic --- src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp | 2 ++ test/units/hic/hic_file_test.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp index 89a0d9ac..eb49bcec 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/file_reader_impl.hpp @@ -607,6 +607,8 @@ inline std::vector HiCFileReader::list_avail_normalizations( std::vector methods_{methods.size()}; std::copy(methods.begin(), methods.end(), methods_.begin()); + std::sort(methods_.begin(), methods_.end(), + [&](const auto &m1, const auto &m2) { return m1.to_string() < m2.to_string(); }); return methods_; } diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 099728be..10648624 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -43,8 +43,8 @@ TEST_CASE("HiC: file accessors", "[hic][short]") { CHECK(f.avail_resolutions().back() == 1000); CHECK(f.avail_normalizations().size() == 4); - CHECK(f.avail_normalizations().front() == "VC_SQRT"); - CHECK(f.avail_normalizations().back() == "VC"); + CHECK(f.avail_normalizations().front() == "KR"); + CHECK(f.avail_normalizations().back() == "VC_SQRT"); CHECK(f.open(2'500'000).resolution() == 2'500'000); From 9bc752562aa710b5a3a315a68d11675b30c6336e Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:31:22 +0200 Subject: [PATCH 6/6] Make clang-tidy happy --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index 46677610..9ec34b36 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -16,6 +16,7 @@ Checks: > -cppcoreguidelines-pro-bounds-array-to-pointer-decay, -cppcoreguidelines-pro-bounds-constant-array-index, -hicpp-no-array-decay, + -misc-no-recursion, -modernize-use-trailing-return-type, -readability-identifier-length, -readability-magic-numbers