From 46af0f7ff8a4a64a99c03425340e7b78e245aaf3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 17 Sep 2023 22:42:56 +0200 Subject: [PATCH] Update hictk dump to support dumping resolutions, cells and normalizations --- src/hictk/cli/cli_dump.cpp | 50 +++--- src/hictk/cli/cli_validate.cpp | 4 +- src/hictk/dump/dump.cpp | 166 +++++++++++++++--- src/hictk/include/hictk/tools/cli.hpp | 33 +++- src/libhictk/cooler/include/hictk/cooler.hpp | 1 + .../cooler/include/hictk/cooler/cooler.hpp | 3 +- .../hictk/cooler/impl/file_accessors_impl.hpp | 15 ++ .../hictk/cooler/impl/file_read_impl.hpp | 4 +- .../cooler/impl/singlecell_cooler_impl.hpp | 1 + .../hictk/cooler/singlecell_cooler.hpp | 1 + src/libhictk/file/include/hictk/file.hpp | 2 + .../file/include/hictk/impl/file_impl.hpp | 4 + 12 files changed, 234 insertions(+), 50 deletions(-) diff --git a/src/hictk/cli/cli_dump.cpp b/src/hictk/cli/cli_dump.cpp index 37385f24..923b0fb6 100644 --- a/src/hictk/cli/cli_dump.cpp +++ b/src/hictk/cli/cli_dump.cpp @@ -30,7 +30,10 @@ void Cli::make_dump_subcommand() { "uri", c.uri, "Path to a .hic, .cool or .mcool file (Cooler URI syntax supported).") - ->check(IsValidHiCFile | IsValidCoolerFile) + ->check(IsValidHiCFile | + IsValidCoolerFile | + IsValidMultiresCoolerFile | + IsValidSingleCellCoolerFile) ->required(); sc.add_option( @@ -57,7 +60,8 @@ void Cli::make_dump_subcommand() { "-t,--table", c.table, "Name of the table to dump.\n") - ->check(CLI::IsMember({"chroms", "bins", "pixels"})) + ->check(CLI::IsMember({"chroms", "bins", "pixels", "normalizations", + "resolutions", "cells"})) ->capture_default_str(); sc.add_option( @@ -106,6 +110,7 @@ void Cli::make_dump_subcommand() { // clang-format on + sc.get_option("--range2")->needs(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range2")); @@ -119,38 +124,40 @@ void Cli::validate_dump_subcommand() const { std::vector errors; const auto& c = std::get(_config); - if (!errors.empty()) { - throw std::runtime_error( - fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " - "arguments and input file(s):\n - {}"), - fmt::join(errors, "\n - "))); - } + const auto& subcmd = *_cli.get_subcommand("dump"); const auto is_hic = hic::utils::is_hic_file(c.uri); const auto is_cooler = cooler::utils::is_cooler(c.uri); const auto is_mcooler = cooler::utils::is_multires_file(c.uri); + const auto is_scool = cooler::utils::is_scool_file(c.uri); - if (is_hic && c.resolution == 0 && c.table != "chroms") { - errors.emplace_back("--resolution is mandatory when file is in .hic format."); + if ((is_hic || is_mcooler) && c.resolution == 0 && (c.table == "pixels" || c.table == "bins")) { + errors.emplace_back("--resolution is mandatory when file is in .hic or .mcool format."); } - const auto resolution_parsed = !_cli.get_subcommand("dump")->get_option("--resolution")->empty(); + const auto resolution_parsed = !subcmd.get_option("--resolution")->empty(); - if ((is_cooler || is_mcooler) && resolution_parsed) { - warnings.emplace_back("--resolution is ignored when file is in .cool or .mcool format."); + if ((is_cooler || is_scool) && resolution_parsed) { + warnings.emplace_back("--resolution is ignored when file is in .[s]cool format."); } - const auto weight_type_parsed = - !_cli.get_subcommand("dump")->get_option("--weight-type")->empty(); + const auto weight_type_parsed = !subcmd.get_option("--weight-type")->empty(); if (is_hic && weight_type_parsed) { warnings.emplace_back("--weight-type is ignored when file is in .hic format."); } - const auto matrix_type_parsed = - !_cli.get_subcommand("dump")->get_option("--matrix-type")->empty(); - const auto matrix_unit_parsed = - !_cli.get_subcommand("dump")->get_option("--matrix-unit")->empty(); + const auto range_parsed = !subcmd.get_option("--range")->empty(); + if (range_parsed && c.table != "bins" && c.table != "pixels") { + warnings.emplace_back("--range and --range2 are ignore when --table is not bins or pixels"); + } + const auto query_file_parsed = !subcmd.get_option("--query-file")->empty(); + if (query_file_parsed && c.table != "bins" && c.table != "pixels") { + warnings.emplace_back("--query-file is ignored when --table is not bins or pixels"); + } + + const auto matrix_type_parsed = !subcmd.get_option("--matrix-type")->empty(); + const auto matrix_unit_parsed = !subcmd.get_option("--matrix-unit")->empty(); if (!is_hic && (matrix_type_parsed || matrix_unit_parsed)) { warnings.emplace_back( @@ -181,9 +188,10 @@ void Cli::transform_args_dump_subcommand() { c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; c.format = infer_input_format(c.uri); - if (c.format == "hic" && c.resolution == 0) { - assert(c.table == "chroms"); + if (c.format == "hic" && c.resolution == 0 && c.table == "chroms") { c.resolution = hic::utils::list_resolutions(c.uri).back(); + } else if (c.format == "mcool" && c.resolution == 0 && c.table == "chroms") { + c.resolution = cooler::utils::list_resolutions(c.uri).back(); } if (_cli.get_subcommand("dump")->get_option("--range2")->empty()) { diff --git a/src/hictk/cli/cli_validate.cpp b/src/hictk/cli/cli_validate.cpp index 5beef57f..ecad1d9d 100644 --- a/src/hictk/cli/cli_validate.cpp +++ b/src/hictk/cli/cli_validate.cpp @@ -1,6 +1,6 @@ +// Copyright (C) 2023 Roberto Rossini // -// Created by roby on 7/13/23. -// +// SPDX-License-Identifier: MIT #include #include diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index fb74529b..014beb35 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -5,6 +5,7 @@ #include #include "hictk/balancing/methods.hpp" +#include "hictk/cooler.hpp" #include "hictk/file.hpp" #include "hictk/tools/config.hpp" #include "hictk/transformers.hpp" @@ -18,23 +19,6 @@ static void print(const ThinPixel& pixel) { fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count); } -static void dump_chroms(const File& f, std::string_view range) { - if (range == "all") { - for (const Chromosome& chrom : f.chromosomes()) { - if (!chrom.is_all()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); - } - } - return; - } - - const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range}); - auto it = f.chromosomes().find(coords.chrom()); - if (it != f.chromosomes().end()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), it->name(), it->size()); - } -} - template static void dump_bins(const File& f, std::string_view range) { if (range == "all") { @@ -127,9 +111,6 @@ static void dump_pixels(hic::File& f, std::string_view range1, std::string_view static void process_query(File& f, std::string_view table, std::string_view range1, std::string_view range2, std::string_view normalization, bool join, bool sorted) { - if (table == "chroms") { - return dump_chroms(f, range1); - } if (table == "bins") { return dump_bins(f, range1); } @@ -139,7 +120,128 @@ static void process_query(File& f, std::string_view table, std::string_view rang f.get()); } -int dump_subcmd(const DumpConfig& c) { +static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) { + Reference ref{}; + + if (format == "mcool") { + ref = cooler::MultiResFile{std::string{uri}}.chromosomes(); + } else if (format == "scool") { + ref = cooler::SingleCellFile{std::string{uri}}.chromosomes(); + } else { + ref = File{std::string{uri}, resolution}.chromosomes(); + } + + for (const Chromosome& chrom : ref) { + if (!chrom.is_all()) { + fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); + } + } + return 0; +} + +static phmap::btree_set get_normalizations(std::string_view uri, + std::string_view format, + std::uint32_t resolution) { + assert(format != "mcool"); + assert(format != "hic" || resolution != 0); + if (format == "scool") { + const auto cell_ids = cooler::SingleCellFile{uri}.cells(); + if (cell_ids.empty()) { + return {}; + } + + const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin()); + return get_normalizations(scool_uri, "cool", 0); + } + + phmap::btree_set norms{}; + if (uri == "hic" && resolution == 0) { + const hic::File hf{std::string{uri}, resolution}; + + for (const auto& norm : hf.avail_normalizations()) { + norms.emplace(std::string{norm.to_string()}); + } + return norms; + } + + const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations(); + std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()), + [](const auto& n) { return std::string{n.to_string()}; }); + + return norms; +} + +static int dump_normalizations(std::string_view uri, std::string_view format, + std::uint32_t resolution) { + phmap::btree_set norms{}; + std::vector resolutions{}; + if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + if (resolutions.empty()) { + return 0; + } + } else if (format == "hic" && resolution == 0) { + resolutions = hic::utils::list_resolutions(std::string{uri}); + if (resolutions.empty()) { + return 0; + } + } + + if (resolutions.empty()) { + norms = get_normalizations(uri, format, resolution); + } else { + format = format == "hic" ? "hic" : "cool"; + std::for_each(resolutions.begin(), resolutions.end(), + [&](const auto res) { norms.merge(get_normalizations(uri, format, res)); }); + } + + if (!norms.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n")); + } + return 0; +} + +static int dump_resolutions(std::string_view uri, std::string_view format, + std::uint32_t resolution) { + std::vector resolutions{}; + + if (format == "hic") { + resolutions = hic::utils::list_resolutions(uri); + if (resolution != 0) { + const auto res_found = + std::find(resolutions.begin(), resolutions.end(), resolution) != resolutions.end(); + resolutions.clear(); + if (res_found) { + resolutions.push_back(resolution); + } + } + } else if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + } else if (format == "scool") { + resolutions.push_back(cooler::SingleCellFile{uri}.bin_size()); + } else { + assert(format == "cool"); + resolutions.push_back(cooler::File{uri}.bin_size()); + } + + if (!resolutions.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(resolutions, "\n")); + } + return 0; +} + +static int dump_cells(std::string_view uri, std::string_view format) { + if (format != "scool") { + throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri)); + } + const auto cells = cooler::SingleCellFile{uri}.cells(); + if (!cells.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(cells, "\n")); + } + return 0; +} + +static int dump_tables(const DumpConfig& c) { hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit}; if (c.query_file.empty()) { @@ -164,4 +266,26 @@ int dump_subcmd(const DumpConfig& c) { return 0; } + +int dump_subcmd(const DumpConfig& c) { + if (c.table == "bins" || c.table == "pixels") { + return dump_tables(c); + } + + if (c.table == "chroms") { + return dump_chroms(c.uri, c.format, c.resolution); + } + + if (c.table == "resolutions") { + return dump_resolutions(c.uri, c.format, c.resolution); + } + + if (c.table == "normalizations") { + return dump_normalizations(c.uri, c.format, c.resolution); + } + + assert(c.table == "cells"); + + return dump_cells(c.uri, c.format); +} } // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 75eb97e3..6170ead7 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -10,6 +10,7 @@ #include #include "config.hpp" +#include "hictk/cooler.hpp" #include "hictk/cooler/utils.hpp" #include "hictk/hic/utils.hpp" @@ -23,6 +24,9 @@ class CoolerFileValidator : public CLI::Validator { if (hictk::cooler::utils::is_multires_file(uri)) { return "URI points to a .mcool file: " + uri; } + if (hictk::cooler::utils::is_scool_file(uri)) { + return "URI points to a .scool file: " + uri; + } const auto path = cooler::parse_cooler_uri(uri).file_path; if (!std::filesystem::exists(path)) { return "No such file: " + path; @@ -50,6 +54,22 @@ class MultiresCoolerFileValidator : public CLI::Validator { } }; +class SingleCellCoolerFileValidator : public CLI::Validator { + public: + inline SingleCellCoolerFileValidator() : Validator("Single-cell-cooler") { + func_ = [](std::string& uri) -> std::string { + const auto path = cooler::parse_cooler_uri(uri).file_path; + if (!std::filesystem::exists(path)) { + return "No such file: " + path; + } + if (!hictk::cooler::utils::is_scool_file(uri)) { + return "Not a valid single-cell cooler: " + uri; + } + return ""; + }; + } +}; + class HiCFileValidator : public CLI::Validator { public: inline HiCFileValidator() : Validator("HiC") { @@ -145,9 +165,10 @@ class Formatter : public CLI::Formatter { }; // clang-format off - inline const auto IsValidCoolerFile = CoolerFileValidator(); // NOLINT(cert-err58-cpp) - inline const auto IsValidMultiresCoolerFile = MultiresCoolerFileValidator(); // NOLINT(cert-err58-cpp) - inline const auto IsValidHiCFile = HiCFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidCoolerFile = CoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidMultiresCoolerFile = MultiresCoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidSingleCellCoolerFile = SingleCellCoolerFileValidator(); // NOLINT(cert-err58-cpp) + inline const auto IsValidHiCFile = HiCFileValidator(); // NOLINT(cert-err58-cpp) // clang-format on // clang-format off @@ -225,6 +246,9 @@ class Cli { if (cooler::utils::is_multires_file(p.string())) { return "mcool"; } + if (cooler::utils::is_scool_file(p.string())) { + return "scool"; + } assert(hic::utils::is_hic_file(p)); return "hic"; } @@ -250,6 +274,9 @@ class Cli { if (format == "cool") { return {cooler::File(p.string()).bin_size()}; } + if (format == "scool") { + return {cooler::SingleCellFile{p.string()}.bin_size()}; + } if (format == "mcool") { return cooler::utils::list_resolutions(p, true); } diff --git a/src/libhictk/cooler/include/hictk/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler.hpp index c363e068..c29e0f2a 100644 --- a/src/libhictk/cooler/include/hictk/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler.hpp @@ -6,3 +6,4 @@ #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/multires_cooler.hpp" +#include "hictk/cooler/singlecell_cooler.hpp" diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 9737b07a..104b19f5 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -250,7 +250,8 @@ class File { balancing::Weights::Type type, bool rescale = false) const; - bool has_weights(const balancing::Method &normalization) const; + [[nodiscard]] std::vector avail_normalizations() const; + [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; std::shared_ptr read_weights(const balancing::Method &normalization, bool rescale = false) const; std::shared_ptr read_weights(const balancing::Method &normalization, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index 990f77e6..d0ca7987 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -87,6 +87,21 @@ inline auto File::dataset(std::string_view dataset_name) const -> const Dataset } } +inline std::vector File::avail_normalizations() const { + const phmap::flat_hash_set bin_table_dsets{"chrom", "start", "end"}; + + std::vector norms{}; + for (const auto &dset : group("bins")().listObjectNames(HighFive::IndexType::NAME)) { + if (bin_table_dsets.contains(dset)) { + continue; + } + + norms.emplace_back(balancing::Method{dset}); + } + + return norms; +} + inline const hictk::internal::NumericVariant &File::pixel_variant() const noexcept { return _pixel_variant; } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index 795eec54..819bb657 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -227,7 +227,7 @@ inline PixelSelector File::fetch(PixelCoordinates coord1, PixelCoordinates coord } inline bool File::has_weights(std::string_view normalization) const { - return has_weights(balancing::Method{normalization}); + return has_normalization(balancing::Method{normalization}); } inline std::shared_ptr File::read_weights(std::string_view normalization, bool rescale) const { @@ -239,7 +239,7 @@ inline std::shared_ptr File::read_weights(std::string_ return read_weights(balancing::Method{normalization}, type, rescale); } -inline bool File::has_weights(const balancing::Method &normalization) const { +inline bool File::has_normalization(const balancing::Method &normalization) const { const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), normalization.to_string()); if (_weights.contains(dset_path)) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index a4ae20f3..7d116960 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -163,6 +163,7 @@ inline SingleCellFile::operator bool() const noexcept { return !!_root_grp; } inline std::string SingleCellFile::path() const { return (*_root_grp)().getFile().getName(); } inline auto SingleCellFile::bins() const noexcept -> const BinTable& { return *_bins; } +inline std::uint32_t SingleCellFile::bin_size() const noexcept { return bins().bin_size(); } inline auto SingleCellFile::chromosomes() const noexcept -> const Reference& { return bins().chromosomes(); } diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 93e68863..86228e5b 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -74,6 +74,7 @@ class SingleCellFile { [[nodiscard]] std::string path() const; [[nodiscard]] auto chromosomes() const noexcept -> const Reference&; [[nodiscard]] auto bins() const noexcept -> const BinTable&; + [[nodiscard]] std::uint32_t bin_size() const noexcept; template File aggregate(std::string_view uri, bool overwrite_if_exists = false, diff --git a/src/libhictk/file/include/hictk/file.hpp b/src/libhictk/file/include/hictk/file.hpp index 36daf89a..c8e05983 100644 --- a/src/libhictk/file/include/hictk/file.hpp +++ b/src/libhictk/file/include/hictk/file.hpp @@ -132,6 +132,8 @@ class File { std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + [[nodiscard]] std::vector avail_normalizations() const; + template [[nodiscard]] constexpr const FileT &get() const noexcept; template diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index 0c29474d..e8adf527 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -277,6 +277,10 @@ inline PixelSelector File::fetch(std::string_view chrom1_name, std::uint32_t sta _fp); } +inline std::vector File::avail_normalizations() const { + return std::visit([](const auto& fp) { return fp.avail_normalizations(); }, _fp); +} + template constexpr const FileT& File::get() const noexcept { return std::get(_fp);