From 7109d59a3c48e408de3b0a8a02cb390fefe2ca15 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:41:19 +0100 Subject: [PATCH 01/13] Initial implementation of cooler::utils::rename_chromosomes() --- .../cooler/impl/multires_cooler_impl.hpp | 4 +- .../cooler/impl/utils_rename_chroms_impl.hpp | 87 +++++++++++++++++++ .../cooler/include/hictk/cooler/utils.hpp | 7 ++ test/units/cooler/CMakeLists.txt | 1 + .../cooler/utils_rename_chromosomes_test.cpp | 46 ++++++++++ 5 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp create mode 100644 test/units/cooler/utils_rename_chromosomes_test.cpp diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp index 544cba29..d4c6d458 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp @@ -162,9 +162,7 @@ inline MultiResFile::operator bool() const noexcept { return !!_root_grp; } inline std::string MultiResFile::path() const { return (*_root_grp)().getFile().getName(); } -inline auto MultiResFile::chromosomes() const noexcept -> const Reference& { - return _chroms; -} +inline auto MultiResFile::chromosomes() const noexcept -> const Reference& { return _chroms; } [[nodiscard]] inline std::uint32_t MultiResFile::compute_base_resolution( const std::vector& resolutions, std::uint32_t target_res) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp new file mode 100644 index 00000000..c11c9fb0 --- /dev/null +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/cooler/cooler.hpp" + +namespace hictk::cooler::utils { + +namespace internal { +[[nodiscard]] inline std::vector get_chrom_names(const cooler::File& clr) { + std::vector names(clr.chromosomes().size()); + + std::transform(clr.chromosomes().begin(), clr.chromosomes().end(), names.begin(), + [](const hictk::Chromosome& chrom) { return std::string{chrom.name()}; }); + + return names; +} + +[[nodiscard]] inline std::vector& rename_chromosomes( + std::vector&& names, + const phmap::flat_hash_map& mappings) { + for (auto& name : names) { + auto it = mappings.find(name); + if (it != mappings.end()) { + name = it->second; + } + } + + return names; +} + +[[nodiscard]] inline std::string find_chrom_with_longest_name( + const std::vector& names) { + assert(!names.empty()); + return *std::max_element(names.begin(), names.end(), [&](const auto& name1, const auto& name2) { + return name1.size() < name2.size(); + }); +} + +} // namespace internal + +template +inline void rename_chromosomes(std::string_view uri, It first_mapping, It last_mapping) { + return rename_chromosomes(uri, {first_mapping, last_mapping}); +} + +inline void rename_chromosomes(std::string_view uri, + const phmap::flat_hash_map& mappings) { + cooler::File clr(uri); + auto names = internal::get_chrom_names(clr); + const auto file_path = clr.path(); + const auto chrom_dset = fmt::format(FMT_STRING("{}/chroms/name"), clr.hdf5_path()); + clr.close(); + + names = internal::rename_chromosomes(std::move(names), mappings); + + HighFive::File h5f(file_path, HighFive::File::ReadWrite); + const cooler::RootGroup root_grp{h5f.getGroup("/")}; + const auto aprop = h5f.getDataSet(chrom_dset).getAccessPropertyList(); + + h5f.unlink(chrom_dset); + cooler::Dataset dset{root_grp, chrom_dset, internal::find_chrom_with_longest_name(names), + HighFive::DataSpace::UNLIMITED, aprop}; + + try { + dset.write(names.begin(), names.end(), 0, true, [&](const auto& name) { return name; }); + } catch (const HighFive::Exception& e) { + throw std::runtime_error( + fmt::format(FMT_STRING("Failed to write {} chromosome name(s) to \"{}\": {}"), names.size(), + dset.uri(), e.what())); + } + assert(dset.size() == names.size()); +} +} // namespace hictk::cooler::utils diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index fe2d6075..5b52bcc2 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -37,9 +37,16 @@ void merge(const std::vector& heads, const std::vector& tails, void copy(std::string_view uri1, std::string_view uri2, bool force_overwrite); void copy(std::string_view uri1, RootGroup dest); +template +void rename_chromosomes(std::string_view uri, It first_mapping, It last_mapping); + +void rename_chromosomes(std::string_view uri, + const phmap::flat_hash_map& mappings); + } // namespace hictk::cooler::utils #include "./impl/utils_copy_impl.hpp" #include "./impl/utils_equal_impl.hpp" #include "./impl/utils_impl.hpp" #include "./impl/utils_merge_impl.hpp" +#include "./impl/utils_rename_chroms_impl.hpp" diff --git a/test/units/cooler/CMakeLists.txt b/test/units/cooler/CMakeLists.txt index 0e7fb90e..6e10422a 100644 --- a/test/units/cooler/CMakeLists.txt +++ b/test/units/cooler/CMakeLists.txt @@ -37,6 +37,7 @@ target_sources( "${CMAKE_CURRENT_SOURCE_DIR}/utils_copy_test.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/utils_equal_test.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/utils_merge_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/utils_rename_chromosomes_test.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/utils_validation_test.cpp") target_link_libraries( diff --git a/test/units/cooler/utils_rename_chromosomes_test.cpp b/test/units/cooler/utils_rename_chromosomes_test.cpp new file mode 100644 index 00000000..2915ee45 --- /dev/null +++ b/test/units/cooler/utils_rename_chromosomes_test.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include + +#include "hictk/cooler/cooler.hpp" +#include "hictk/cooler/utils.hpp" +#include "tmpdir.hpp" + +namespace hictk::cooler::test::cooler_file { + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Cooler: rename chromosomes", "[cooler][short]") { + const Reference ref2{{0, "1", 10}, {1, "2", 10}}; + const Reference ref3{{0, "chr1", 10}, {1, "2", 10}}; + + const auto path = testdir() / "rename_chromosomes.cool"; + + const Reference ref{{0, "chr1", 10}, {1, "chr2", 10}}; + std::ignore = cooler::File::create(path.string(), ref, 1, true); + cooler::utils::rename_chromosomes(path.string(), {{"chr1", "1"}}); + + { + const auto chroms = cooler::File(path.string()).chromosomes(); + CHECK(chroms.size() == 2); + CHECK(!chroms.contains("chr1")); + CHECK(chroms.contains("chr2")); + CHECK(chroms.contains("1")); + } + + const std::vector> mappings{{"1", "abc12345"}}; + cooler::utils::rename_chromosomes(path.string(), mappings.begin(), mappings.end()); + + { + const auto chroms = cooler::File(path.string()).chromosomes(); + CHECK(chroms.size() == 2); + CHECK(!chroms.contains("1")); + CHECK(chroms.contains("chr2")); + CHECK(chroms.contains("abc12345")); + } +} + +} // namespace hictk::cooler::test::cooler_file From e4da38b2c184e11066f08043c162aa1daf0849d1 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:43:02 +0100 Subject: [PATCH 02/13] Initial implementation of hictk rename-chromosomes --- src/hictk/CMakeLists.txt | 2 + src/hictk/cli/cli.cpp | 10 ++ src/hictk/cli/cli_rename_chromosomes.cpp | 99 +++++++++++++++ src/hictk/include/hictk/tools/cli.hpp | 4 + src/hictk/include/hictk/tools/config.hpp | 9 ++ src/hictk/include/hictk/tools/tools.hpp | 1 + src/hictk/main.cpp | 2 + .../rename_chromosomes/rename_chromosomes.cpp | 114 ++++++++++++++++++ 8 files changed, 241 insertions(+) create mode 100644 src/hictk/cli/cli_rename_chromosomes.cpp create mode 100644 src/hictk/rename_chromosomes/rename_chromosomes.cpp diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index ea4e57f9..b6e3bec5 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -22,6 +22,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_rename_chromosomes.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp ${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp @@ -32,6 +33,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/rename_chromosomes/rename_chromosomes.cpp ${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/zoomify/zoomify.cpp) diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index 580da7a1..0c2fdcfd 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -37,6 +37,8 @@ auto Cli::parse_arguments() -> Config { _subcommand = subcommand::load; } else if (_cli.get_subcommand("merge")->parsed()) { _subcommand = subcommand::merge; + } else if (_cli.get_subcommand("rename-chromosomes")->parsed()) { + _subcommand = subcommand::rename_chromosomes; } else if (_cli.get_subcommand("validate")->parsed()) { _subcommand = subcommand::validate; } else if (_cli.get_subcommand("zoomify")->parsed()) { @@ -85,6 +87,8 @@ std::string_view Cli::subcommand_to_str(subcommand s) noexcept { return "load"; case merge: return "merge"; + case rename_chromosomes: + return "rename-chromosomes"; case validate: return "validate"; case zoomify: @@ -107,6 +111,7 @@ void Cli::make_cli() { make_fix_mcool_subcommand(); make_load_subcommand(); make_merge_subcommand(); + make_rename_chromosomes_subcommand(); make_validate_subcommand(); make_zoomify_subcommand(); } @@ -131,6 +136,9 @@ void Cli::validate_args() const { case merge: validate_merge_subcommand(); break; + case rename_chromosomes: + validate_rename_chromosomes_subcommand(); + break; case validate: break; case zoomify: @@ -161,6 +169,8 @@ void Cli::transform_args() { case merge: transform_args_merge_subcommand(); break; + case rename_chromosomes: + break; case validate: break; case zoomify: diff --git a/src/hictk/cli/cli_rename_chromosomes.cpp b/src/hictk/cli/cli_rename_chromosomes.cpp new file mode 100644 index 00000000..77a86d2c --- /dev/null +++ b/src/hictk/cli/cli_rename_chromosomes.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include +#include +#include + +#include "hictk/tools/cli.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { +void Cli::make_rename_chromosomes_subcommand() { + auto& sc = + *_cli.add_subcommand("rename-chromosomes", "Rename chromosomes found in a Cooler file.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = RenameChromosomesConfig{}; + }); + + _config = RenameChromosomesConfig{}; + auto& c = std::get(_config); + + // clang-format off + sc.add_option( + "uri", + c.uri, + "Path to a or .[ms]cool file (Cooler URI syntax supported).") + ->required(); + + sc.add_option( + "--name-mappings", + c.path_to_name_mappings, + "Path to a two column TSV with pairs of chromosomes to be renamed.\n" + "The first column should contain the original chromosome name,\n" + "while the second column should contain the destination name to use when renaming." + ); + + sc.add_flag( + "--add-chr-prefix", + c.add_chr_prefix, + "Prefix chromosome names with \"chr\".") + ->capture_default_str(); + + sc.add_flag( + "--remove-chr-prefix", + c.remove_chr_prefix, + "Remove prefix \"chr\" from chromosome names.") + ->capture_default_str(); + // clang-format on + + sc.get_option("--name-mappings")->excludes(sc.get_option("--add-chr-prefix")); + sc.get_option("--name-mappings")->excludes(sc.get_option("--remove-chr-prefix")); + sc.get_option("--add-chr-prefix")->excludes(sc.get_option("--remove-chr-prefix")); + + _config = std::monostate{}; +} + +void Cli::validate_rename_chromosomes_subcommand() const { + assert(_cli.get_subcommand("rename-chromosomes")->parsed()); + + const auto& c = std::get(_config); + + std::vector errors; + + if (!cooler::utils::is_cooler(c.uri) && !cooler::utils::is_multires_file(c.uri) && + !cooler::utils::is_scool_file(c.uri)) { + errors.emplace_back( + fmt::format(FMT_STRING("File \"{}\" does not appear to be a Cooler file."), c.uri)); + } + + const auto& sc = *_cli.get_subcommand("rename-chromosomes"); + if (sc.get_option("--name-mappings")->empty() && sc.get_option("--add-chr-prefix")->empty() && + sc.get_option("--remove-chr-prefix")->empty()) { + errors.emplace_back( + "please specify exactly one of --name-mappings, --add-chr-prefix, --remove-chr-prefix"); + } + + if (!errors.empty()) { + throw std::runtime_error( + fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " + "arguments and input file(s):\n - {}\n"), + fmt::join(errors, "\n - "))); + } +} + +void Cli::transform_args_rename_chromosomes_subcommand() { + assert(_cli.get_subcommand("rename-chromosomes")->parsed()); + auto& c = std::get(_config); + + // in spdlog, high numbers correspond to low log levels + assert(c.verbosity > 0 && c.verbosity < 5); + c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; +} + +} // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 4043c423..42e8615a 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -198,6 +198,7 @@ class Cli { fix_mcool, load, merge, + rename_chromosomes, validate, zoomify, }; @@ -224,6 +225,7 @@ class Cli { void make_fix_mcool_subcommand(); void make_load_subcommand(); void make_merge_subcommand(); + void make_rename_chromosomes_subcommand(); void make_validate_subcommand(); void make_zoomify_subcommand(); void make_cli(); @@ -234,6 +236,7 @@ class Cli { void validate_fix_mcool_subcommand() const; void validate_load_subcommand() const; void validate_merge_subcommand() const; + void validate_rename_chromosomes_subcommand() const; void validate_zoomify_subcommand() const; void validate_args() const; @@ -243,6 +246,7 @@ class Cli { void transform_args_fix_mcool_subcommand(); void transform_args_load_subcommand(); void transform_args_merge_subcommand(); + void transform_args_rename_chromosomes_subcommand(); void transform_args_zoomify_subcommand(); void transform_args(); }; diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..1f9fbdec 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -126,6 +126,14 @@ struct MergeConfig { std::uint8_t verbosity{4}; }; +struct RenameChromosomesConfig { + std::string uri{}; + std::filesystem::path path_to_name_mappings{}; + bool add_chr_prefix{false}; + bool remove_chr_prefix{false}; + std::uint8_t verbosity{4}; +}; + struct ValidateConfig { std::string uri{}; bool validate_index{false}; @@ -153,6 +161,7 @@ using Config = std::variant; // clang-format on diff --git a/src/hictk/include/hictk/tools/tools.hpp b/src/hictk/include/hictk/tools/tools.hpp index d29295ae..e9ecb672 100644 --- a/src/hictk/include/hictk/tools/tools.hpp +++ b/src/hictk/include/hictk/tools/tools.hpp @@ -14,6 +14,7 @@ namespace hictk::tools { [[nodiscard]] int fix_mcool_subcmd(const FixMcoolConfig& c); [[nodiscard]] int load_subcmd(const LoadConfig& c); [[nodiscard]] int merge_subcmd(const MergeConfig& c); +[[nodiscard]] int rename_chromosomes_subcmd(const RenameChromosomesConfig& c); [[nodiscard]] int validate_subcmd(const ValidateConfig& c); [[nodiscard]] int zoomify_subcmd(const ZoomifyConfig& c); diff --git a/src/hictk/main.cpp b/src/hictk/main.cpp index aefd6ed7..3ad68c77 100644 --- a/src/hictk/main.cpp +++ b/src/hictk/main.cpp @@ -111,6 +111,8 @@ int main(int argc, char** argv) noexcept { return load_subcmd(std::get(config)); case sc::merge: return merge_subcmd(std::get(config)); + case sc::rename_chromosomes: + return rename_chromosomes_subcmd(std::get(config)); case sc::validate: return validate_subcmd(std::get(config)); case sc::zoomify: diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp new file mode 100644 index 00000000..1078ccb8 --- /dev/null +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include "hictk/cooler/multires_cooler.hpp" +#include "hictk/cooler/singlecell_cooler.hpp" +#include "hictk/cooler/utils.hpp" +#include "hictk/hic/utils.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +[[nodiscard]] static phmap::flat_hash_map +generate_mappings_add_chr_prefix_prefix(std::string_view uri) { + const auto chroms = cooler::File{uri}.chromosomes(); + phmap::flat_hash_map mappings(chroms.size()); + for (const auto& chrom : chroms) { + mappings.emplace(std::string{chrom.name()}, "chr" + std::string{chrom.name()}); + } + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map +generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { + const auto chroms = cooler::File{uri}.chromosomes(); + phmap::flat_hash_map mappings(chroms.size()); + for (const auto& chrom : chroms) { + const auto match = chrom.name().find("chr") == 0; + if (match) { + mappings.emplace(std::string{chrom.name()}, std::string{chrom.name().substr(3)}); + } + } + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map read_mappings_from_file( + const std::filesystem::path& path) { + if (path.empty()) { + return {}; + } + + std::ifstream ifs; + ifs.exceptions(std::ios::badbit); + ifs.open(path); + + phmap::flat_hash_map mappings{}; + std::string buff{}; + + for (std::size_t i = 0; std::getline(ifs, buff); ++i) { + if (buff.empty()) { + continue; + } + const auto sep_pos = buff.find('\t'); + if (sep_pos == std::string::npos) { + throw std::runtime_error(fmt::format( + FMT_STRING("Found invalid record \"{}\" in file {} at line {}"), buff, path, i)); + } + auto old_name = buff.substr(0, sep_pos); + auto new_name = buff.substr(sep_pos + 1); + + if (old_name.empty() || new_name.empty()) { + throw std::runtime_error(fmt::format( + FMT_STRING("Found invalid record \"{}\" in file {} at line {}"), buff, path, i)); + } + + mappings.emplace(std::move(old_name), std::move(new_name)); + } + + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map generate_name_mappings( + std::string_view uri, const std::filesystem::path& name_mappings_path, bool add_chr_prefix, + bool remove_chr_prefix) { + if (!name_mappings_path.empty()) { + return read_mappings_from_file(name_mappings_path); + } + if (add_chr_prefix) { + return generate_mappings_add_chr_prefix_prefix(uri); + } + + assert(remove_chr_prefix); + return generate_mappings_remove_chr_prefix_prefix(uri); +} + +int rename_chromosomes_subcmd(const RenameChromosomesConfig& c) { + if (cooler::utils::is_cooler(c.uri)) { + const auto mappings = generate_name_mappings(c.uri, c.path_to_name_mappings, c.add_chr_prefix, + c.remove_chr_prefix); + cooler::utils::rename_chromosomes(c.uri, mappings); + } else if (cooler::utils::is_multires_file(c.uri)) { + const auto resolutions = cooler::MultiResFile(c.uri).resolutions(); + const auto mappings = generate_name_mappings( + fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, resolutions.front()), + c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + for (const auto& res : resolutions) { + cooler::utils::rename_chromosomes(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, res), + mappings); + } + } else { + assert(cooler::utils::is_scool_file(c.uri)); + const auto cell_id = *cooler::SingleCellFile(c.uri).cells().begin(); + const auto uri = fmt::format(FMT_STRING("{}::/cells/{}"), c.uri, cell_id); + const auto mappings = + generate_name_mappings(uri, c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + cooler::utils::rename_chromosomes(uri, mappings); + } + + return 1; +} + +} // namespace hictk::tools From 1f3255d7c5b88ceacf6b08dc2f793e979fbdc935 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 22 Nov 2023 09:00:36 +0100 Subject: [PATCH 03/13] Address compiler warnings --- src/hictk/cli/cli.cpp | 2 +- src/hictk/rename_chromosomes/rename_chromosomes.cpp | 2 +- .../include/hictk/cooler/impl/utils_rename_chroms_impl.hpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index 0c2fdcfd..a423376b 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -169,7 +169,7 @@ void Cli::transform_args() { case merge: transform_args_merge_subcommand(); break; - case rename_chromosomes: + case rename_chromosomes: // NOLINT break; case validate: break; diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp index 1078ccb8..347d3a25 100644 --- a/src/hictk/rename_chromosomes/rename_chromosomes.cpp +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -73,7 +73,7 @@ generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { [[nodiscard]] static phmap::flat_hash_map generate_name_mappings( std::string_view uri, const std::filesystem::path& name_mappings_path, bool add_chr_prefix, - bool remove_chr_prefix) { + [[maybe_unused]] bool remove_chr_prefix) { if (!name_mappings_path.empty()) { return read_mappings_from_file(name_mappings_path); } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp index c11c9fb0..842e2310 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -67,6 +67,7 @@ inline void rename_chromosomes(std::string_view uri, names = internal::rename_chromosomes(std::move(names), mappings); + // NOLINTNEXTLINE(misc-const-correctness) HighFive::File h5f(file_path, HighFive::File::ReadWrite); const cooler::RootGroup root_grp{h5f.getGroup("/")}; const auto aprop = h5f.getDataSet(chrom_dset).getAccessPropertyList(); From d338afa52c3836d257b8cce596a45ce6ead46ad6 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 15:41:18 +0100 Subject: [PATCH 04/13] Fix typo --- src/hictk/cli/cli.cpp | 3 ++- src/hictk/rename_chromosomes/rename_chromosomes.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index a423376b..fd02096f 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -169,7 +169,8 @@ void Cli::transform_args() { case merge: transform_args_merge_subcommand(); break; - case rename_chromosomes: // NOLINT + case rename_chromosomes: + transform_args_rename_chromosomes_subcommand(); break; case validate: break; diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp index 347d3a25..e2ae0b0f 100644 --- a/src/hictk/rename_chromosomes/rename_chromosomes.cpp +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -108,7 +108,7 @@ int rename_chromosomes_subcmd(const RenameChromosomesConfig& c) { cooler::utils::rename_chromosomes(uri, mappings); } - return 1; + return 0; } } // namespace hictk::tools From edee37279df7c2112c4c9510f4563711ccaf7345 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 16:41:23 +0100 Subject: [PATCH 05/13] Add type trait to detect map-like containers --- src/libhictk/common/include/hictk/type_traits.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/libhictk/common/include/hictk/type_traits.hpp b/src/libhictk/common/include/hictk/type_traits.hpp index 5b2e1209..3b902612 100644 --- a/src/libhictk/common/include/hictk/type_traits.hpp +++ b/src/libhictk/common/include/hictk/type_traits.hpp @@ -28,6 +28,15 @@ struct is_string template constexpr bool is_string_v = is_string::value; +template +struct is_map : std::false_type {}; + +template +struct is_map> : std::true_type {}; + +template +constexpr bool is_map_v = is_map::value; + template struct is_unary_operation : public std::is_invocable {}; From 93a8a9e988eef95b8b40256527a6485365d634b5 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 16:42:14 +0100 Subject: [PATCH 06/13] Make cooler::utils::rename_chromosomes more generic --- .../hictk/cooler/impl/utils_rename_chroms_impl.hpp | 13 ++++++++----- src/libhictk/cooler/include/hictk/cooler/utils.hpp | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp index 842e2310..57dd7d0e 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -29,9 +29,9 @@ namespace internal { return names; } -[[nodiscard]] inline std::vector& rename_chromosomes( - std::vector&& names, - const phmap::flat_hash_map& mappings) { +template +[[nodiscard]] inline std::vector& rename_chromosomes(std::vector&& names, + const NameMap& mappings) { for (auto& name : names) { auto it = mappings.find(name); if (it != mappings.end()) { @@ -57,8 +57,11 @@ inline void rename_chromosomes(std::string_view uri, It first_mapping, It last_m return rename_chromosomes(uri, {first_mapping, last_mapping}); } -inline void rename_chromosomes(std::string_view uri, - const phmap::flat_hash_map& mappings) { +template +inline void rename_chromosomes(std::string_view uri, const NameMap& mappings) { + if (mappings.empty()) { + return; + } cooler::File clr(uri); auto names = internal::get_chrom_names(clr); const auto file_path = clr.path(); diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index 5b52bcc2..1965b6fc 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -40,8 +40,8 @@ void copy(std::string_view uri1, RootGroup dest); template void rename_chromosomes(std::string_view uri, It first_mapping, It last_mapping); -void rename_chromosomes(std::string_view uri, - const phmap::flat_hash_map& mappings); +template >> +void rename_chromosomes(std::string_view uri, const NameMap& mappings); } // namespace hictk::cooler::utils From 01013ff24123ff19128183e728a54bbd74666c52 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 16:42:38 +0100 Subject: [PATCH 07/13] Improve hictk rename-chroms --- src/hictk/cli/cli_rename_chromosomes.cpp | 8 ++++ .../rename_chromosomes/rename_chromosomes.cpp | 38 +++++++++++++------ 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/hictk/cli/cli_rename_chromosomes.cpp b/src/hictk/cli/cli_rename_chromosomes.cpp index 77a86d2c..128fa70e 100644 --- a/src/hictk/cli/cli_rename_chromosomes.cpp +++ b/src/hictk/cli/cli_rename_chromosomes.cpp @@ -50,11 +50,19 @@ void Cli::make_rename_chromosomes_subcommand() { c.remove_chr_prefix, "Remove prefix \"chr\" from chromosome names.") ->capture_default_str(); + + sc.add_option( + "-v,--verbosity", + c.verbosity, + "Set verbosity of output to the console.") + ->check(CLI::Range(1, 4)) + ->capture_default_str(); // clang-format on sc.get_option("--name-mappings")->excludes(sc.get_option("--add-chr-prefix")); sc.get_option("--name-mappings")->excludes(sc.get_option("--remove-chr-prefix")); sc.get_option("--add-chr-prefix")->excludes(sc.get_option("--remove-chr-prefix")); + sc.alias("rename-chroms"); _config = std::monostate{}; } diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp index e2ae0b0f..5baa9f3e 100644 --- a/src/hictk/rename_chromosomes/rename_chromosomes.cpp +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: MIT #include +#include #include "hictk/cooler/multires_cooler.hpp" #include "hictk/cooler/singlecell_cooler.hpp" @@ -12,20 +13,20 @@ namespace hictk::tools { -[[nodiscard]] static phmap::flat_hash_map +[[nodiscard]] static phmap::btree_map generate_mappings_add_chr_prefix_prefix(std::string_view uri) { + phmap::btree_map mappings{}; const auto chroms = cooler::File{uri}.chromosomes(); - phmap::flat_hash_map mappings(chroms.size()); for (const auto& chrom : chroms) { mappings.emplace(std::string{chrom.name()}, "chr" + std::string{chrom.name()}); } return mappings; } -[[nodiscard]] static phmap::flat_hash_map +[[nodiscard]] static phmap::btree_map generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { + phmap::btree_map mappings{}; const auto chroms = cooler::File{uri}.chromosomes(); - phmap::flat_hash_map mappings(chroms.size()); for (const auto& chrom : chroms) { const auto match = chrom.name().find("chr") == 0; if (match) { @@ -35,7 +36,7 @@ generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { return mappings; } -[[nodiscard]] static phmap::flat_hash_map read_mappings_from_file( +[[nodiscard]] static phmap::btree_map read_mappings_from_file( const std::filesystem::path& path) { if (path.empty()) { return {}; @@ -45,7 +46,7 @@ generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { ifs.exceptions(std::ios::badbit); ifs.open(path); - phmap::flat_hash_map mappings{}; + phmap::btree_map mappings{}; std::string buff{}; for (std::size_t i = 0; std::getline(ifs, buff); ++i) { @@ -71,18 +72,33 @@ generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { return mappings; } -[[nodiscard]] static phmap::flat_hash_map generate_name_mappings( +[[nodiscard]] static phmap::btree_map generate_name_mappings( std::string_view uri, const std::filesystem::path& name_mappings_path, bool add_chr_prefix, [[maybe_unused]] bool remove_chr_prefix) { + phmap::btree_map mappings{}; if (!name_mappings_path.empty()) { - return read_mappings_from_file(name_mappings_path); + mappings = read_mappings_from_file(name_mappings_path); } if (add_chr_prefix) { - return generate_mappings_add_chr_prefix_prefix(uri); + mappings = generate_mappings_add_chr_prefix_prefix(uri); } - assert(remove_chr_prefix); - return generate_mappings_remove_chr_prefix_prefix(uri); + if (remove_chr_prefix) { + mappings = generate_mappings_remove_chr_prefix_prefix(uri); + } + + [[maybe_unused]] std::string mappings_str{}; + std::for_each(mappings.begin(), mappings.end(), [&](const auto& m) { + mappings_str += fmt::format(FMT_STRING("\n - {} -> {}"), m.first, m.second); + }); + + if (mappings.empty()) { + SPDLOG_WARN("Chromosome name map is empty: no chromosomes will be renamed!"); + } else { + SPDLOG_INFO(FMT_STRING("Renaming chromosomes as follows:{}"), mappings_str); + } + + return mappings; } int rename_chromosomes_subcmd(const RenameChromosomesConfig& c) { From e2141b70f4a7909c4b192d140062bfd9ddd9ba5b Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 16:44:20 +0100 Subject: [PATCH 08/13] Add integration tests for hictk rename-chroms --- .github/workflows/codecov.yml | 2 + .github/workflows/macos-ci.yml | 4 + .github/workflows/ubuntu-ci.yml | 4 + test/scripts/hictk_rename_chromosomes.sh | 110 +++++++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100755 test/scripts/hictk_rename_chromosomes.sh diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 6b7c9cff..70d6f7eb 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -217,6 +217,8 @@ jobs: test/scripts/hictk_merge.sh build/src/hictk/hictk + test/scripts/hictk_rename_chroms.sh build/src/hictk/hictk + test/scripts/hictk_validate.sh build/src/hictk/hictk test/scripts/hictk_zoomify.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index 9dcbcc06..5fb1aad5 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -442,6 +442,10 @@ jobs: run: | test/scripts/hictk_merge.sh bin/hictk + - name: Test hictk rename-chroms + run: | + test/scripts/hictk_rename_chromosomes.sh bin/hictk + - name: Test hictk validate run: | test/scripts/hictk_validate.sh bin/hictk diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 329d7ecb..8822ea91 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -495,6 +495,10 @@ jobs: run: | test/scripts/hictk_merge.sh bin/hictk + - name: Test hictk rename-chroms + run: | + test/scripts/hictk_rename_chromosomes.sh bin/hictk + - name: Test hictk validate run: | test/scripts/hictk_validate.sh bin/hictk diff --git a/test/scripts/hictk_rename_chromosomes.sh b/test/scripts/hictk_rename_chromosomes.sh new file mode 100755 index 00000000..e64e0f60 --- /dev/null +++ b/test/scripts/hictk_rename_chromosomes.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "##################################" +echo "#### hictk rename-chromosomes ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +export function readlink_py + +status=0 + +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" + status=1 +fi + +hictk_bin="$1" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +input_cooler="$data_dir/cooler/cooler_test_file.cool" +input_mcool="$data_dir/cooler/multires_cooler_test_file.mcool" +input_scool="$data_dir/cooler/single_cell_cooler_test_file.scool" + +export PATH="$PATH:$script_dir" + +if [ $status -ne 0 ]; then + exit $status +fi + +if ! check_files_exist "$input_cooler" "$input_mcool" "$input_scool"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +# Test adding chr prefix +cp "$input_cooler" "$outdir/out1.cool" +"$hictk_bin" rename-chroms "$outdir/out1.cool" --add-chr-prefix +if ! "$hictk_bin" dump -t chroms "$outdir/out1.cool" | grep -q chr ; then + status=1 +fi + +# Test removing chr prefix +cp "$outdir/out1.cool" "$outdir/out2.cool" +"$hictk_bin" rename-chroms "$outdir/out2.cool" --remove-chr-prefix +if ! "$hictk_bin" dump -t chroms "$outdir/out2.cool" | grep -vq chr ; then + status=1 +fi + + +# Test renaming using custom name mappings +printf '1\tABC\n' > "$outdir/mappings.txt" +cp "$input_cooler" "$outdir/out3.cool" +"$hictk_bin" rename-chroms "$outdir/out3.cool" --name-mappings "$outdir/mappings.txt" +if ! "$hictk_bin" dump -t chroms "$outdir/out3.cool" | grep -q ABC ; then + status=1 +fi + + +# Test mcool +cp "$input_mcool" "$outdir/out4.mcool" +"$hictk_bin" rename-chroms "$outdir/out4.mcool" --name-mappings "$outdir/mappings.txt" +if ! "$hictk_bin" dump -t chroms "$outdir/out4.mcool" | grep -q ABC ; then + status=1 +fi + + +# Test scool +cp "$input_mcool" "$outdir/out5.scool" +"$hictk_bin" rename-chroms "$outdir/out5.scool" --name-mappings "$outdir/mappings.txt" +if ! "$hictk_bin" dump -t chroms "$outdir/out5.scool" | grep -q ABC ; then + status=1 +fi + + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" From 3474d252cf614bf6709f5621ccfb5b9a19704f53 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 17:03:55 +0100 Subject: [PATCH 09/13] Bugfix --- .../include/hictk/cooler/impl/utils_rename_chroms_impl.hpp | 5 +++-- test/units/cooler/utils_rename_chromosomes_test.cpp | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp index 57dd7d0e..8e59f667 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -5,7 +5,7 @@ #pragma once #include -#include +#include #include #include @@ -54,7 +54,8 @@ template template inline void rename_chromosomes(std::string_view uri, It first_mapping, It last_mapping) { - return rename_chromosomes(uri, {first_mapping, last_mapping}); + return rename_chromosomes( + uri, phmap::btree_map{first_mapping, last_mapping}); } template diff --git a/test/units/cooler/utils_rename_chromosomes_test.cpp b/test/units/cooler/utils_rename_chromosomes_test.cpp index 2915ee45..12209d85 100644 --- a/test/units/cooler/utils_rename_chromosomes_test.cpp +++ b/test/units/cooler/utils_rename_chromosomes_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "hictk/cooler/cooler.hpp" #include "hictk/cooler/utils.hpp" @@ -13,7 +14,7 @@ namespace hictk::cooler::test::cooler_file { // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Cooler: rename chromosomes", "[cooler][short]") { +TEST_CASE("Cooler: utils rename chromosomes", "[cooler][short]") { const Reference ref2{{0, "1", 10}, {1, "2", 10}}; const Reference ref3{{0, "chr1", 10}, {1, "2", 10}}; @@ -21,7 +22,8 @@ TEST_CASE("Cooler: rename chromosomes", "[cooler][short]") { const Reference ref{{0, "chr1", 10}, {1, "chr2", 10}}; std::ignore = cooler::File::create(path.string(), ref, 1, true); - cooler::utils::rename_chromosomes(path.string(), {{"chr1", "1"}}); + cooler::utils::rename_chromosomes(path.string(), + std::map{{"chr1", "1"}}); { const auto chroms = cooler::File(path.string()).chromosomes(); From 21f5012d654338b8325cc16615354fbd279030bc Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 17:27:39 +0100 Subject: [PATCH 10/13] Fix typo --- .github/workflows/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 70d6f7eb..631ba2a4 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -217,7 +217,7 @@ jobs: test/scripts/hictk_merge.sh build/src/hictk/hictk - test/scripts/hictk_rename_chroms.sh build/src/hictk/hictk + test/scripts/hictk_rename_chromosomes.sh build/src/hictk/hictk test/scripts/hictk_validate.sh build/src/hictk/hictk From cf24654bf15f51e264d866bc19cdc4695b007b52 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 20:30:18 +0100 Subject: [PATCH 11/13] Properly rename chroms in .scool files --- conanfile.txt | 2 +- .../rename_chromosomes/rename_chromosomes.cpp | 82 ++++++++++++++----- .../cooler/include/hictk/cooler/cooler.hpp | 2 + .../hictk/cooler/impl/file_accessors_impl.hpp | 4 + .../cooler/impl/multires_cooler_impl.hpp | 5 ++ .../cooler/impl/singlecell_cooler_impl.hpp | 5 ++ .../cooler/impl/utils_rename_chroms_impl.hpp | 27 ++++-- .../include/hictk/cooler/multires_cooler.hpp | 3 + .../hictk/cooler/singlecell_cooler.hpp | 3 + .../cooler/include/hictk/cooler/utils.hpp | 4 + test/scripts/hictk_rename_chromosomes.sh | 2 +- 11 files changed, 112 insertions(+), 27 deletions(-) diff --git a/conanfile.txt b/conanfile.txt index 31d9e7f3..b6454b96 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -10,7 +10,7 @@ eigen/3.4.0#2e192482a8acff96fe34766adca2b24c fast_float/5.2.0#9bf1a3fac625789f2b571d43efb8013b fmt/10.1.0#1fae165cded07416f64960a5b6140317 hdf5/1.14.1#16047e4faf70ba5488d7525063c5cb2b -highfive/2.7.1#b1e846aa63f7b3ab0368faae2f004fbd +highfive/2.8.0#6116c5f03407679faf7aab5f3bbf503f libdeflate/1.19#3ea74a4549efc14d4b1202dc4bfbf602 parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83 diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp index 5baa9f3e..67ee18dd 100644 --- a/src/hictk/rename_chromosomes/rename_chromosomes.cpp +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -101,30 +101,72 @@ generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { return mappings; } +static void remove_hardlinks_scool(HighFive::File& h5f, + const phmap::btree_set& cells) { + for (const auto& cell : cells) { + h5f.unlink(fmt::format(FMT_STRING("/cells/{}/chroms"), cell)); + } +} + +static void create_hardlinks_scool(HighFive::File& h5f, + const phmap::btree_set& cells) { + const auto chrom_grp = h5f.getGroup("/chroms"); + for (const auto& cell : cells) { + h5f.createHardLink(fmt::format(FMT_STRING("/cells/{}/chroms"), cell), chrom_grp); + } +} + +[[nodiscard]] static int rename_chromosomes_cooler(const RenameChromosomesConfig& c) { + const auto mappings = + generate_name_mappings(c.uri, c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + cooler::utils::rename_chromosomes(c.uri, mappings); + return 0; +} + +[[nodiscard]] static int rename_chromosomes_multires_cooler(const RenameChromosomesConfig& c) { + const auto resolutions = cooler::MultiResFile(c.uri).resolutions(); + const auto mappings = generate_name_mappings( + fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, resolutions.front()), + c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + for (const auto& res : resolutions) { + cooler::utils::rename_chromosomes(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, res), + mappings); + } + return 0; +} + +[[nodiscard]] static int rename_chromosomes_single_cell_cooler(const RenameChromosomesConfig& c) { + assert(cooler::utils::is_scool_file(c.uri)); + const auto cells = cooler::SingleCellFile(c.uri).cells(); + const auto uri = fmt::format(FMT_STRING("{}::/cells/{}"), c.uri, *cells.begin()); + + const auto mappings = + generate_name_mappings(uri, c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + + // NOLINTNEXTLINE(misc-const-correctness) + HighFive::File h5f(c.uri, HighFive::File::ReadWrite); + + remove_hardlinks_scool(h5f, cells); + + const cooler::RootGroup root_grp{h5f.getGroup("/")}; + cooler::Dataset dset{root_grp, "/chroms/name"}; + cooler::utils::rename_chromosomes(dset, mappings); + + create_hardlinks_scool(h5f, cells); + assert(cooler::utils::is_scool_file(c.uri)); + + return 0; +} + int rename_chromosomes_subcmd(const RenameChromosomesConfig& c) { if (cooler::utils::is_cooler(c.uri)) { - const auto mappings = generate_name_mappings(c.uri, c.path_to_name_mappings, c.add_chr_prefix, - c.remove_chr_prefix); - cooler::utils::rename_chromosomes(c.uri, mappings); - } else if (cooler::utils::is_multires_file(c.uri)) { - const auto resolutions = cooler::MultiResFile(c.uri).resolutions(); - const auto mappings = generate_name_mappings( - fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, resolutions.front()), - c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); - for (const auto& res : resolutions) { - cooler::utils::rename_chromosomes(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, res), - mappings); - } - } else { - assert(cooler::utils::is_scool_file(c.uri)); - const auto cell_id = *cooler::SingleCellFile(c.uri).cells().begin(); - const auto uri = fmt::format(FMT_STRING("{}::/cells/{}"), c.uri, cell_id); - const auto mappings = - generate_name_mappings(uri, c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); - cooler::utils::rename_chromosomes(uri, mappings); + return rename_chromosomes_cooler(c); } - return 0; + if (cooler::utils::is_multires_file(c.uri)) { + return rename_chromosomes_multires_cooler(c); + } + return rename_chromosomes_single_cell_cooler(c); } } // namespace hictk::tools diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 78d7f3d7..17f98870 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -170,8 +170,10 @@ class File { [[nodiscard]] std::uint64_t nnz() const; [[nodiscard]] auto attributes() const noexcept -> const Attributes &; + [[nodiscard]] HighFive::File file_handle(); [[nodiscard]] auto group(std::string_view group_name) -> Group &; [[nodiscard]] auto dataset(std::string_view dataset_name) -> Dataset &; + [[nodiscard]] const HighFive::File &file_handle() const; [[nodiscard]] auto group(std::string_view group_name) const -> const Group &; [[nodiscard]] auto dataset(std::string_view dataset_name) const -> const Dataset &; diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index 00f1a175..b15db0aa 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -47,6 +47,10 @@ inline std::uint64_t File::nnz() const { return dataset("pixels/count").size(); inline auto File::attributes() const noexcept -> const Attributes & { return _attrs; } +inline HighFive::File File::file_handle() { return _root_group().getFile(); } + +inline const HighFive::File& File::file_handle() const { return _root_group().getFile(); } + inline auto File::group(std::string_view group_name) -> Group & { try { return _groups.at(std::string{group_name}); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp index d4c6d458..e4bbca6f 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp @@ -179,6 +179,11 @@ inline auto MultiResFile::chromosomes() const noexcept -> const Reference& { ret [&](const auto res) { return res <= target_res && target_res % res == 0; }); } +inline HighFive::File MultiResFile::file_handle() { return _root_grp->group.getFile(); } +inline const HighFive::File& MultiResFile::file_handle() const { + return _root_grp->group.getFile(); +} + inline void MultiResFile::coarsen(const File& clr1, File& clr2, std::vector>& buffer) { SPDLOG_INFO(FMT_STRING("generating {} resolution from {} ({}x)"), clr2.bin_size(), diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 7d116960..cefe1aa9 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -168,6 +168,11 @@ inline auto SingleCellFile::chromosomes() const noexcept -> const Reference& { return bins().chromosomes(); } +inline HighFive::File SingleCellFile::file_handle() { return _root_grp->group.getFile(); } +inline const HighFive::File& SingleCellFile::file_handle() const { + return _root_grp->group.getFile(); +} + template inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_exists, std::size_t chunk_size, std::size_t update_frequency) const { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp index 8e59f667..6f60544c 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -69,15 +69,32 @@ inline void rename_chromosomes(std::string_view uri, const NameMap& mappings) { const auto chrom_dset = fmt::format(FMT_STRING("{}/chroms/name"), clr.hdf5_path()); clr.close(); - names = internal::rename_chromosomes(std::move(names), mappings); - // NOLINTNEXTLINE(misc-const-correctness) HighFive::File h5f(file_path, HighFive::File::ReadWrite); const cooler::RootGroup root_grp{h5f.getGroup("/")}; - const auto aprop = h5f.getDataSet(chrom_dset).getAccessPropertyList(); + cooler::Dataset dset{root_grp, chrom_dset}; + + return rename_chromosomes(dset, mappings); +} - h5f.unlink(chrom_dset); - cooler::Dataset dset{root_grp, chrom_dset, internal::find_chrom_with_longest_name(names), +template +inline void rename_chromosomes(cooler::Dataset& chrom_dset, const NameMap& mappings) { + if (mappings.empty()) { + return; + } + auto names = chrom_dset.read_all>(); + + names = internal::rename_chromosomes(std::move(names), mappings); + + // NOLINTNEXTLINE(misc-const-correctness) + auto h5f = chrom_dset().getFile(); + const auto chrom_path = chrom_dset().getPath(); + const auto aprop = chrom_dset().getAccessPropertyList(); + + h5f.unlink(chrom_dset().getPath()); + + const cooler::RootGroup root_grp{h5f.getGroup("/")}; + cooler::Dataset dset{root_grp, chrom_path, internal::find_chrom_with_longest_name(names), HighFive::DataSpace::UNLIMITED, aprop}; try { diff --git a/src/libhictk/cooler/include/hictk/cooler/multires_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/multires_cooler.hpp index dc679930..4f7f0a38 100644 --- a/src/libhictk/cooler/include/hictk/cooler/multires_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/multires_cooler.hpp @@ -59,6 +59,9 @@ class MultiResFile { [[nodiscard]] std::string path() const; [[nodiscard]] auto chromosomes() const noexcept -> const Reference&; + [[nodiscard]] HighFive::File file_handle(); + [[nodiscard]] const HighFive::File& file_handle() const; + [[nodiscard]] static std::uint32_t compute_base_resolution( const std::vector& resolutions, std::uint32_t target_res); diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 86228e5b..6eadf67d 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -76,6 +76,9 @@ class SingleCellFile { [[nodiscard]] auto bins() const noexcept -> const BinTable&; [[nodiscard]] std::uint32_t bin_size() const noexcept; + [[nodiscard]] HighFive::File file_handle(); + [[nodiscard]] const HighFive::File& file_handle() const; + template File aggregate(std::string_view uri, bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000) const; diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index 1965b6fc..0c18094a 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -11,6 +11,7 @@ #include #include "hictk/cooler/cooler.hpp" +#include "hictk/cooler/dataset.hpp" #include "hictk/cooler/group.hpp" namespace hictk::cooler::utils { @@ -43,6 +44,9 @@ void rename_chromosomes(std::string_view uri, It first_mapping, It last_mapping) template >> void rename_chromosomes(std::string_view uri, const NameMap& mappings); +template >> +inline void rename_chromosomes(cooler::Dataset& chrom_dset, const NameMap& mappings); + } // namespace hictk::cooler::utils #include "./impl/utils_copy_impl.hpp" diff --git a/test/scripts/hictk_rename_chromosomes.sh b/test/scripts/hictk_rename_chromosomes.sh index e64e0f60..a727f059 100755 --- a/test/scripts/hictk_rename_chromosomes.sh +++ b/test/scripts/hictk_rename_chromosomes.sh @@ -94,7 +94,7 @@ fi # Test scool -cp "$input_mcool" "$outdir/out5.scool" +cp "$input_scool" "$outdir/out5.scool" "$hictk_bin" rename-chroms "$outdir/out5.scool" --name-mappings "$outdir/mappings.txt" if ! "$hictk_bin" dump -t chroms "$outdir/out5.scool" | grep -q ABC ; then status=1 From 260423ed67d825eff9c7511889469dc0155693c1 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 26 Nov 2023 20:30:39 +0100 Subject: [PATCH 12/13] Replace calls to createSoftLink with createHardLink --- .../cooler/include/hictk/cooler/impl/file_write_impl.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp index b43765ff..5d8c174e 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp @@ -199,9 +199,8 @@ inline auto File::create_groups(RootGroup &root_grp) -> GroupMap { inline auto File::create_groups(RootGroup &root_grp, Group chroms_grp, Group bins_grp) -> GroupMap { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT GroupMap groups(MANDATORY_GROUP_NAMES.size() + 1); - // TODO replace with createHardLink when implemented in HighFive - root_grp().createSoftLink("chroms", chroms_grp()); - root_grp().createSoftLink("bins", bins_grp()); + root_grp().createHardLink("chroms", chroms_grp()); + root_grp().createHardLink("bins", bins_grp()); groups.emplace(root_grp.hdf5_path(), Group{root_grp, root_grp()}); groups.emplace("chroms", Group{root_grp, root_grp().getGroup("chroms")}); From 365402b17d072914155269e360086fdb0f1323cd Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 13 Dec 2023 21:02:49 +0100 Subject: [PATCH 13/13] Address clang-tidy warnings --- .../include/hictk/cooler/impl/utils_rename_chroms_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp index 6f60544c..a0c66c7a 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_rename_chroms_impl.hpp @@ -105,5 +105,5 @@ inline void rename_chromosomes(cooler::Dataset& chrom_dset, const NameMap& mappi dset.uri(), e.what())); } assert(dset.size() == names.size()); -} +} // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) } // namespace hictk::cooler::utils