From e4da38b2c184e11066f08043c162aa1daf0849d1 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:43:02 +0100 Subject: [PATCH] Initial implementation of hictk rename-chromosomes --- src/hictk/CMakeLists.txt | 2 + src/hictk/cli/cli.cpp | 10 ++ src/hictk/cli/cli_rename_chromosomes.cpp | 99 +++++++++++++++ src/hictk/include/hictk/tools/cli.hpp | 4 + src/hictk/include/hictk/tools/config.hpp | 9 ++ src/hictk/include/hictk/tools/tools.hpp | 1 + src/hictk/main.cpp | 2 + .../rename_chromosomes/rename_chromosomes.cpp | 114 ++++++++++++++++++ 8 files changed, 241 insertions(+) create mode 100644 src/hictk/cli/cli_rename_chromosomes.cpp create mode 100644 src/hictk/rename_chromosomes/rename_chromosomes.cpp diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index ea4e57f9..b6e3bec5 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -22,6 +22,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_rename_chromosomes.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp ${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp @@ -32,6 +33,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/rename_chromosomes/rename_chromosomes.cpp ${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/zoomify/zoomify.cpp) diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index 580da7a1..0c2fdcfd 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -37,6 +37,8 @@ auto Cli::parse_arguments() -> Config { _subcommand = subcommand::load; } else if (_cli.get_subcommand("merge")->parsed()) { _subcommand = subcommand::merge; + } else if (_cli.get_subcommand("rename-chromosomes")->parsed()) { + _subcommand = subcommand::rename_chromosomes; } else if (_cli.get_subcommand("validate")->parsed()) { _subcommand = subcommand::validate; } else if (_cli.get_subcommand("zoomify")->parsed()) { @@ -85,6 +87,8 @@ std::string_view Cli::subcommand_to_str(subcommand s) noexcept { return "load"; case merge: return "merge"; + case rename_chromosomes: + return "rename-chromosomes"; case validate: return "validate"; case zoomify: @@ -107,6 +111,7 @@ void Cli::make_cli() { make_fix_mcool_subcommand(); make_load_subcommand(); make_merge_subcommand(); + make_rename_chromosomes_subcommand(); make_validate_subcommand(); make_zoomify_subcommand(); } @@ -131,6 +136,9 @@ void Cli::validate_args() const { case merge: validate_merge_subcommand(); break; + case rename_chromosomes: + validate_rename_chromosomes_subcommand(); + break; case validate: break; case zoomify: @@ -161,6 +169,8 @@ void Cli::transform_args() { case merge: transform_args_merge_subcommand(); break; + case rename_chromosomes: + break; case validate: break; case zoomify: diff --git a/src/hictk/cli/cli_rename_chromosomes.cpp b/src/hictk/cli/cli_rename_chromosomes.cpp new file mode 100644 index 00000000..77a86d2c --- /dev/null +++ b/src/hictk/cli/cli_rename_chromosomes.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include +#include +#include + +#include "hictk/tools/cli.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { +void Cli::make_rename_chromosomes_subcommand() { + auto& sc = + *_cli.add_subcommand("rename-chromosomes", "Rename chromosomes found in a Cooler file.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = RenameChromosomesConfig{}; + }); + + _config = RenameChromosomesConfig{}; + auto& c = std::get(_config); + + // clang-format off + sc.add_option( + "uri", + c.uri, + "Path to a or .[ms]cool file (Cooler URI syntax supported).") + ->required(); + + sc.add_option( + "--name-mappings", + c.path_to_name_mappings, + "Path to a two column TSV with pairs of chromosomes to be renamed.\n" + "The first column should contain the original chromosome name,\n" + "while the second column should contain the destination name to use when renaming." + ); + + sc.add_flag( + "--add-chr-prefix", + c.add_chr_prefix, + "Prefix chromosome names with \"chr\".") + ->capture_default_str(); + + sc.add_flag( + "--remove-chr-prefix", + c.remove_chr_prefix, + "Remove prefix \"chr\" from chromosome names.") + ->capture_default_str(); + // clang-format on + + sc.get_option("--name-mappings")->excludes(sc.get_option("--add-chr-prefix")); + sc.get_option("--name-mappings")->excludes(sc.get_option("--remove-chr-prefix")); + sc.get_option("--add-chr-prefix")->excludes(sc.get_option("--remove-chr-prefix")); + + _config = std::monostate{}; +} + +void Cli::validate_rename_chromosomes_subcommand() const { + assert(_cli.get_subcommand("rename-chromosomes")->parsed()); + + const auto& c = std::get(_config); + + std::vector errors; + + if (!cooler::utils::is_cooler(c.uri) && !cooler::utils::is_multires_file(c.uri) && + !cooler::utils::is_scool_file(c.uri)) { + errors.emplace_back( + fmt::format(FMT_STRING("File \"{}\" does not appear to be a Cooler file."), c.uri)); + } + + const auto& sc = *_cli.get_subcommand("rename-chromosomes"); + if (sc.get_option("--name-mappings")->empty() && sc.get_option("--add-chr-prefix")->empty() && + sc.get_option("--remove-chr-prefix")->empty()) { + errors.emplace_back( + "please specify exactly one of --name-mappings, --add-chr-prefix, --remove-chr-prefix"); + } + + if (!errors.empty()) { + throw std::runtime_error( + fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " + "arguments and input file(s):\n - {}\n"), + fmt::join(errors, "\n - "))); + } +} + +void Cli::transform_args_rename_chromosomes_subcommand() { + assert(_cli.get_subcommand("rename-chromosomes")->parsed()); + auto& c = std::get(_config); + + // in spdlog, high numbers correspond to low log levels + assert(c.verbosity > 0 && c.verbosity < 5); + c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; +} + +} // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 4043c423..42e8615a 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -198,6 +198,7 @@ class Cli { fix_mcool, load, merge, + rename_chromosomes, validate, zoomify, }; @@ -224,6 +225,7 @@ class Cli { void make_fix_mcool_subcommand(); void make_load_subcommand(); void make_merge_subcommand(); + void make_rename_chromosomes_subcommand(); void make_validate_subcommand(); void make_zoomify_subcommand(); void make_cli(); @@ -234,6 +236,7 @@ class Cli { void validate_fix_mcool_subcommand() const; void validate_load_subcommand() const; void validate_merge_subcommand() const; + void validate_rename_chromosomes_subcommand() const; void validate_zoomify_subcommand() const; void validate_args() const; @@ -243,6 +246,7 @@ class Cli { void transform_args_fix_mcool_subcommand(); void transform_args_load_subcommand(); void transform_args_merge_subcommand(); + void transform_args_rename_chromosomes_subcommand(); void transform_args_zoomify_subcommand(); void transform_args(); }; diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..1f9fbdec 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -126,6 +126,14 @@ struct MergeConfig { std::uint8_t verbosity{4}; }; +struct RenameChromosomesConfig { + std::string uri{}; + std::filesystem::path path_to_name_mappings{}; + bool add_chr_prefix{false}; + bool remove_chr_prefix{false}; + std::uint8_t verbosity{4}; +}; + struct ValidateConfig { std::string uri{}; bool validate_index{false}; @@ -153,6 +161,7 @@ using Config = std::variant; // clang-format on diff --git a/src/hictk/include/hictk/tools/tools.hpp b/src/hictk/include/hictk/tools/tools.hpp index d29295ae..e9ecb672 100644 --- a/src/hictk/include/hictk/tools/tools.hpp +++ b/src/hictk/include/hictk/tools/tools.hpp @@ -14,6 +14,7 @@ namespace hictk::tools { [[nodiscard]] int fix_mcool_subcmd(const FixMcoolConfig& c); [[nodiscard]] int load_subcmd(const LoadConfig& c); [[nodiscard]] int merge_subcmd(const MergeConfig& c); +[[nodiscard]] int rename_chromosomes_subcmd(const RenameChromosomesConfig& c); [[nodiscard]] int validate_subcmd(const ValidateConfig& c); [[nodiscard]] int zoomify_subcmd(const ZoomifyConfig& c); diff --git a/src/hictk/main.cpp b/src/hictk/main.cpp index aefd6ed7..3ad68c77 100644 --- a/src/hictk/main.cpp +++ b/src/hictk/main.cpp @@ -111,6 +111,8 @@ int main(int argc, char** argv) noexcept { return load_subcmd(std::get(config)); case sc::merge: return merge_subcmd(std::get(config)); + case sc::rename_chromosomes: + return rename_chromosomes_subcmd(std::get(config)); case sc::validate: return validate_subcmd(std::get(config)); case sc::zoomify: diff --git a/src/hictk/rename_chromosomes/rename_chromosomes.cpp b/src/hictk/rename_chromosomes/rename_chromosomes.cpp new file mode 100644 index 00000000..1078ccb8 --- /dev/null +++ b/src/hictk/rename_chromosomes/rename_chromosomes.cpp @@ -0,0 +1,114 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include "hictk/cooler/multires_cooler.hpp" +#include "hictk/cooler/singlecell_cooler.hpp" +#include "hictk/cooler/utils.hpp" +#include "hictk/hic/utils.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +[[nodiscard]] static phmap::flat_hash_map +generate_mappings_add_chr_prefix_prefix(std::string_view uri) { + const auto chroms = cooler::File{uri}.chromosomes(); + phmap::flat_hash_map mappings(chroms.size()); + for (const auto& chrom : chroms) { + mappings.emplace(std::string{chrom.name()}, "chr" + std::string{chrom.name()}); + } + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map +generate_mappings_remove_chr_prefix_prefix(std::string_view uri) { + const auto chroms = cooler::File{uri}.chromosomes(); + phmap::flat_hash_map mappings(chroms.size()); + for (const auto& chrom : chroms) { + const auto match = chrom.name().find("chr") == 0; + if (match) { + mappings.emplace(std::string{chrom.name()}, std::string{chrom.name().substr(3)}); + } + } + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map read_mappings_from_file( + const std::filesystem::path& path) { + if (path.empty()) { + return {}; + } + + std::ifstream ifs; + ifs.exceptions(std::ios::badbit); + ifs.open(path); + + phmap::flat_hash_map mappings{}; + std::string buff{}; + + for (std::size_t i = 0; std::getline(ifs, buff); ++i) { + if (buff.empty()) { + continue; + } + const auto sep_pos = buff.find('\t'); + if (sep_pos == std::string::npos) { + throw std::runtime_error(fmt::format( + FMT_STRING("Found invalid record \"{}\" in file {} at line {}"), buff, path, i)); + } + auto old_name = buff.substr(0, sep_pos); + auto new_name = buff.substr(sep_pos + 1); + + if (old_name.empty() || new_name.empty()) { + throw std::runtime_error(fmt::format( + FMT_STRING("Found invalid record \"{}\" in file {} at line {}"), buff, path, i)); + } + + mappings.emplace(std::move(old_name), std::move(new_name)); + } + + return mappings; +} + +[[nodiscard]] static phmap::flat_hash_map generate_name_mappings( + std::string_view uri, const std::filesystem::path& name_mappings_path, bool add_chr_prefix, + bool remove_chr_prefix) { + if (!name_mappings_path.empty()) { + return read_mappings_from_file(name_mappings_path); + } + if (add_chr_prefix) { + return generate_mappings_add_chr_prefix_prefix(uri); + } + + assert(remove_chr_prefix); + return generate_mappings_remove_chr_prefix_prefix(uri); +} + +int rename_chromosomes_subcmd(const RenameChromosomesConfig& c) { + if (cooler::utils::is_cooler(c.uri)) { + const auto mappings = generate_name_mappings(c.uri, c.path_to_name_mappings, c.add_chr_prefix, + c.remove_chr_prefix); + cooler::utils::rename_chromosomes(c.uri, mappings); + } else if (cooler::utils::is_multires_file(c.uri)) { + const auto resolutions = cooler::MultiResFile(c.uri).resolutions(); + const auto mappings = generate_name_mappings( + fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, resolutions.front()), + c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + for (const auto& res : resolutions) { + cooler::utils::rename_chromosomes(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.uri, res), + mappings); + } + } else { + assert(cooler::utils::is_scool_file(c.uri)); + const auto cell_id = *cooler::SingleCellFile(c.uri).cells().begin(); + const auto uri = fmt::format(FMT_STRING("{}::/cells/{}"), c.uri, cell_id); + const auto mappings = + generate_name_mappings(uri, c.path_to_name_mappings, c.add_chr_prefix, c.remove_chr_prefix); + cooler::utils::rename_chromosomes(uri, mappings); + } + + return 1; +} + +} // namespace hictk::tools