diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index e2b8fb8d..ea4e57f9 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -19,6 +19,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp @@ -28,6 +29,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dump/dump.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp ${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp index 4b3fab75..ddcc5bc1 100644 --- a/src/hictk/balance/balance.cpp +++ b/src/hictk/balance/balance.cpp @@ -83,7 +83,7 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, const std::vector& scale) { const auto& [file, grp] = cooler::parse_cooler_uri(uri); const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name); - SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path); + SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), file, path); const HighFive::File clr(file, HighFive::File::ReadWrite); diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index fa044d93..580da7a1 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -31,6 +31,8 @@ auto Cli::parse_arguments() -> Config { _subcommand = subcommand::convert; } else if (_cli.get_subcommand("dump")->parsed()) { _subcommand = subcommand::dump; + } else if (_cli.get_subcommand("fix-mcool")->parsed()) { + _subcommand = subcommand::fix_mcool; } else if (_cli.get_subcommand("load")->parsed()) { _subcommand = subcommand::load; } else if (_cli.get_subcommand("merge")->parsed()) { @@ -77,6 +79,8 @@ std::string_view Cli::subcommand_to_str(subcommand s) noexcept { return "convert"; case dump: return "dump"; + case fix_mcool: + return "fix-mcool"; case load: return "load"; case merge: @@ -100,6 +104,7 @@ void Cli::make_cli() { make_balance_subcommand(); make_convert_subcommand(); make_dump_subcommand(); + make_fix_mcool_subcommand(); make_load_subcommand(); make_merge_subcommand(); make_validate_subcommand(); @@ -117,6 +122,9 @@ void Cli::validate_args() const { case dump: validate_dump_subcommand(); break; + case fix_mcool: + validate_fix_mcool_subcommand(); + break; case load: validate_load_subcommand(); break; @@ -144,6 +152,9 @@ void Cli::transform_args() { case dump: transform_args_dump_subcommand(); break; + case fix_mcool: + transform_args_fix_mcool_subcommand(); + break; case load: transform_args_load_subcommand(); break; diff --git a/src/hictk/cli/cli_convert.cpp b/src/hictk/cli/cli_convert.cpp index df7dd01c..17141f7d 100644 --- a/src/hictk/cli/cli_convert.cpp +++ b/src/hictk/cli/cli_convert.cpp @@ -91,10 +91,10 @@ void Cli::make_convert_subcommand() { ->check(CLI::Range(1, 4)) ->capture_default_str(); sc.add_option( - "-p,--processes", - c.processes, - "Maximum number of parallel processes to spawn.\n" - "When converting from hic to cool, only two processes will be used.") + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn.\n" + "When converting from hic to cool, only two threads will be used.") ->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( diff --git a/src/hictk/cli/cli_fix_mcool.cpp b/src/hictk/cli/cli_fix_mcool.cpp new file mode 100644 index 00000000..cd486a38 --- /dev/null +++ b/src/hictk/cli/cli_fix_mcool.cpp @@ -0,0 +1,143 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include + +#include +#include +#include +#include +#include + +#include "hictk/tools/cli.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +void Cli::make_fix_mcool_subcommand() { + auto& sc = *_cli.add_subcommand("fix-mcool", "Fix corrupted .mcool files.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = FixMcoolConfig{}; + }); + + _config = FixMcoolConfig{}; + auto& c = std::get(_config); + + // clang-format off + sc.add_option( + "input", + c.path_to_input, + "Path to a corrupted .mcool file.") + ->check(IsValidMultiresCoolerFile) + ->required(); + sc.add_option( + "output", + c.path_to_output, + "Path where to store the restored .mcool.") + ->required(); + sc.add_option( + "--tmpdir", + c.tmp_dir, + "Path to a folder where to store temporary data.") + ->capture_default_str(); + sc.add_flag( + "--skip-balancing", + c.skip_balancing, + "Do not recompute or copy balancing weights."); + sc.add_flag( + "--check-base-resolution", + c.check_base_resolution, + "Check whether the base resolution is corrupted."); + sc.add_flag( + "--in-memory", + c.in_memory, + "Store all interactions in memory while balancing (greatly improves performance).") + ->capture_default_str(); + sc.add_option( + "--chunk-size", + c.chunk_size, + "Number of interactions to process at once during balancing.\n" + "Ignored when using --in-memory.") + ->check(CLI::PositiveNumber) + ->capture_default_str(); + sc.add_option( + "-v,--verbosity", + c.verbosity, + "Set verbosity of output to the console.") + ->check(CLI::Range(1, 4)) + ->capture_default_str(); + sc.add_option( + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn (only applies to balancing stage).") + ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) + ->capture_default_str(); + sc.add_option( + "-l,--compression-level", + c.zstd_compression_lvl, + "Compression level used to compress temporary files using ZSTD (only applies to balancing stage).") + ->check(CLI::Range(0, 19)) + ->capture_default_str(); + sc.add_flag( + "-f,--force", + c.force, + "Overwrite existing files (if any).") + ->capture_default_str(); + // clang-format on +} + +void Cli::validate_fix_mcool_subcommand() const { + const auto& c = std::get(_config); + std::vector errors; + std::vector warnings{}; + + if (!c.force && std::filesystem::exists(c.path_to_output)) { + errors.emplace_back(fmt::format( + FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.path_to_output)); + } + + if (c.skip_balancing) { + const auto* sc = _cli.get_subcommand("fix-mcool"); + if (!sc->get_option("--tmpdir")->empty()) { + warnings.emplace_back("option --tmpdir is ignored when --skip-balancing is provided."); + } + if (!sc->get_option("--in-memory")->empty()) { + warnings.emplace_back("option --in-memory is ignored when --skip-balancing is provided."); + } + if (!sc->get_option("--compression-level")->empty()) { + warnings.emplace_back( + "option --compression-level is ignored when --skip-balancing is provided."); + } + if (!sc->get_option("--chunk-size")->empty()) { + warnings.emplace_back("option --chunk-size is ignored when --skip-balancing is provided."); + } + if (!sc->get_option("--threads")->empty()) { + warnings.emplace_back("option --threads is ignored when --skip-balancing is provided."); + } + } + + for (const auto& w : warnings) { + SPDLOG_WARN(FMT_STRING("{}"), w); + } + + if (!errors.empty()) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "The following error(s) where encountered while validating CLI arguments:\n - {}"), + fmt::join(errors, "\n - "))); + } +} + +void Cli::transform_args_fix_mcool_subcommand() { + auto& c = std::get(_config); + + // in spdlog, high numbers correspond to low log levels + assert(c.verbosity > 0 && c.verbosity < 5); + c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; +} + +} // namespace hictk::tools diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index 97b41763..6a99e1f7 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -89,14 +89,14 @@ static std::size_t dump_pixels_plain(const cooler::File& clr, const std::filesys template [[nodiscard]] static std::unique_ptr run_pigz( Pipe& pipe, const std::filesystem::path& dest, std::uint8_t compression_lvl, - std::size_t processes) { + std::size_t threads) { assert(compression_lvl != 0); - assert(processes != 0); + assert(threads != 0); // clang-format off return std::make_unique( find_pigz().string(), fmt::format(FMT_STRING("-{}"), compression_lvl), - "--processes", fmt::to_string(processes), + "--processes", fmt::to_string(threads), boost::process::std_in < pipe, boost::process::std_out > dest.string() ); @@ -104,14 +104,14 @@ template } static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesystem::path& dest, - std::uint8_t compression_lvl, std::size_t processes, + std::uint8_t compression_lvl, std::size_t threads, std::size_t update_frequency = 10'000'000) { assert(compression_lvl != 0); - assert(processes > 1); + assert(threads > 1); boost::asio::io_context ioc; boost::process::async_pipe pipe{ioc}; - const auto pigz = run_pigz(pipe, dest, compression_lvl, processes - 1); + const auto pigz = run_pigz(pipe, dest, compression_lvl, threads - 1); auto t0 = std::chrono::steady_clock::now(); std::string buffer; @@ -178,7 +178,7 @@ static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesyst } static void dump_pixels(const cooler::File& clr, const std::filesystem::path& dest, - std::uint8_t compression_lvl, std::size_t processes) { + std::uint8_t compression_lvl, std::size_t threads) { const auto t0 = std::chrono::steady_clock::now(); SPDLOG_INFO(FMT_STRING("writing pixels to file {}..."), dest); @@ -186,7 +186,7 @@ static void dump_pixels(const cooler::File& clr, const std::filesystem::path& de std::size_t pixels_processed{}; if (dest.extension() == ".gz") { assert(compression_lvl != 0); - pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, processes); + pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, threads); } else { pixels_processed = dump_pixels_plain(clr, dest); } @@ -277,12 +277,12 @@ void cool_to_hic(const ConvertConfig& c) { const cooler::File clr(uri); dump_chrom_sizes(clr, chrom_sizes); - dump_pixels(clr, pixels, c.gzip_compression_lvl, c.processes); + dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads); } auto t1 = std::chrono::steady_clock::now(); SPDLOG_INFO(FMT_STRING("running juicer_tools pre...")); - process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.processes); + process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.threads); process->wait(); if (process->exit_code() != 0) { throw std::runtime_error(fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), diff --git a/src/hictk/fix_mcool/fix_mcool.cpp b/src/hictk/fix_mcool/fix_mcool.cpp new file mode 100644 index 00000000..f1490d7c --- /dev/null +++ b/src/hictk/fix_mcool/fix_mcool.cpp @@ -0,0 +1,126 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include +#include +#include +#include +#include + +#include "hictk/balancing/ice.hpp" +#include "hictk/balancing/methods.hpp" +#include "hictk/cooler.hpp" +#include "hictk/tools/config.hpp" +#include "hictk/tools/tools.hpp" + +namespace hictk::tools { + +static void validate_base_resolution(const FixMcoolConfig& c, std::uint32_t base_resolution) { + ValidateConfig vc{}; + vc.uri = + fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), base_resolution); + vc.validate_index = true; + + [[maybe_unused]] const auto ec = validate_subcmd(vc); + assert(ec == 0); +} + +static void run_hictk_zoomify(const FixMcoolConfig& c, + const std::vector& resolutions, + std::string_view base_uri) { + ZoomifyConfig zc{}; + zc.input_uri = std::string{base_uri}; + zc.output_path = c.path_to_output.string(); + zc.resolutions = resolutions; + zc.copy_base_resolution = true; + zc.force = c.force; + zc.verbosity = c.verbosity; + + [[maybe_unused]] const auto ec = zoomify_subcmd(zc); + assert(ec == 0); +} + +static std::optional detect_balancing_params(std::string_view file, + std::uint32_t resolution) { + const HighFive::File clr(std::string{file}, HighFive::File::ReadOnly); + const auto path = fmt::format(FMT_STRING("resolutions/{}/bins/weight"), resolution); + + if (!clr.exist(path)) { + SPDLOG_WARN( + FMT_STRING("Cooler at {}::{} does not appear to have been balanced. SKIPPING balancing!"), + file, path); + return {}; + } + + const cooler::Dataset dset(cooler::RootGroup{clr.getGroup("/")}, path); + BalanceConfig c{}; + + try { + const auto cis_only = dset.read_attribute("cis_only"); + const auto trans_only = + dset.has_attribute("trans_only") ? dset.read_attribute("trans_only") : false; + + assert(cis_only + trans_only < 2); + + if (cis_only) { + c.mode = "cis"; + } else if (trans_only) { + c.mode = "trans"; + } else { + c.mode = "gw"; + } + + c.masked_diags = dset.read_attribute("ignore_diags"); + c.mad_max = dset.read_attribute("mad_max"); + c.min_count = dset.read_attribute("min_count"); + c.min_nnz = dset.read_attribute("min_nnz"); + c.tolerance = dset.read_attribute("tol"); + + } catch (const std::exception& e) { + } + return c; +} + +static void run_hictk_balance(const FixMcoolConfig& c, std::uint32_t resolution) { + auto bc = detect_balancing_params(c.path_to_input.string(), resolution); + + if (!bc) { + return; + } + + bc->path_to_input = fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_output, resolution); + bc->tmp_dir = c.tmp_dir; + bc->in_memory = c.in_memory; + bc->threads = c.threads; + bc->zstd_compression_lvl = c.zstd_compression_lvl; + bc->chunk_size = c.chunk_size; + + [[maybe_unused]] const auto ec = balance_subcmd(*bc); + assert(ec == 0); +} + +int fix_mcool_subcmd(const FixMcoolConfig& c) { + assert(cooler::utils::is_multires_file(c.path_to_input.string())); + + const auto resolutions = cooler::MultiResFile{c.path_to_input.string()}.resolutions(); + const auto base_resolution = resolutions.front(); + + const auto base_uri = + fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), base_resolution); + + if (c.check_base_resolution) { + SPDLOG_INFO(FMT_STRING("Validating {}..."), base_uri); + validate_base_resolution(c, base_resolution); + } + + run_hictk_zoomify(c, resolutions, base_uri); + + std::for_each(resolutions.begin() + 1, resolutions.end(), + [&](const auto& res) { run_hictk_balance(c, res); }); + + return 0; +} +} // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index f0823632..4043c423 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -195,6 +195,7 @@ class Cli { balance, convert, dump, + fix_mcool, load, merge, validate, @@ -220,6 +221,7 @@ class Cli { void make_balance_subcommand(); void make_convert_subcommand(); void make_dump_subcommand(); + void make_fix_mcool_subcommand(); void make_load_subcommand(); void make_merge_subcommand(); void make_validate_subcommand(); @@ -229,6 +231,7 @@ class Cli { void validate_balance_subcommand() const; void validate_convert_subcommand() const; void validate_dump_subcommand() const; + void validate_fix_mcool_subcommand() const; void validate_load_subcommand() const; void validate_merge_subcommand() const; void validate_zoomify_subcommand() const; @@ -237,6 +240,7 @@ class Cli { void transform_args_balance_subcommand(); void transform_args_convert_subcommand(); void transform_args_dump_subcommand(); + void transform_args_fix_mcool_subcommand(); void transform_args_load_subcommand(); void transform_args_merge_subcommand(); void transform_args_zoomify_subcommand(); diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index f6dd2e5f..66b07928 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -54,7 +54,7 @@ struct ConvertConfig { bool fail_if_normalization_method_is_not_avaliable{false}; std::uint8_t gzip_compression_lvl{6}; - std::size_t processes{2}; + std::size_t threads{2}; std::size_t juicer_tools_xmx{32'000'000'000}; std::uint8_t verbosity{4}; @@ -82,6 +82,23 @@ struct DumpConfig { bool force{false}; }; +struct FixMcoolConfig { + std::filesystem::path path_to_input{}; + std::filesystem::path path_to_output{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; + + bool skip_balancing{false}; + bool check_base_resolution{false}; + + bool in_memory{false}; + std::uint8_t zstd_compression_lvl{3}; + std::size_t chunk_size{10'000'000}; + + std::size_t threads{1}; + std::uint8_t verbosity{4}; + bool force{false}; +}; + struct LoadConfig { std::string uri{}; @@ -133,6 +150,7 @@ using Config = std::variant(config)); case sc::dump: return dump_subcmd(std::get(config)); + case sc::fix_mcool: + return fix_mcool_subcmd(std::get(config)); case sc::load: return load_subcmd(std::get(config)); case sc::merge: diff --git a/src/hictk/zoomify/zoomify.cpp b/src/hictk/zoomify/zoomify.cpp index efa09582..5ed19bce 100644 --- a/src/hictk/zoomify/zoomify.cpp +++ b/src/hictk/zoomify/zoomify.cpp @@ -11,31 +11,12 @@ namespace hictk::tools { -template , - typename CoarsenIt = typename transformers::CoarsenPixels::iterator> -internal::PixelMerger setup_pixel_merger(const cooler::File& clr, std::size_t factor) { - const auto& chroms = clr.chromosomes(); - std::vector heads{}; - std::vector tails{}; - - for (std::uint32_t chrom1_id = 0; chrom1_id < chroms.size(); ++chrom1_id) { - for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < chroms.size(); ++chrom2_id) { - auto sel = clr.fetch(chroms.at(chrom1_id).name(), chroms.at(chrom2_id).name()); - auto sel1 = transformers::CoarsenPixels(sel.begin(), sel.end(), clr.bins_ptr(), factor); - auto first = sel1.begin(); - auto last = sel1.end(); - if (first != last) { - heads.emplace_back(std::move(first)); - tails.emplace_back(std::move(last)); - } - } - } - return {heads, tails}; -} - void zoomify_once(const cooler::File& clr1, cooler::RootGroup entrypoint2, std::uint32_t resolution) { - auto clr2 = cooler::File::create(std::move(entrypoint2), clr1.chromosomes(), resolution); + auto attrs = cooler::Attributes::init(clr1.bin_size()); + attrs.assembly = clr1.attributes().assembly; + + auto clr2 = cooler::File::create(std::move(entrypoint2), clr1.chromosomes(), resolution, attrs); std::vector> buffer{500'000}; cooler::MultiResFile::coarsen(clr1, clr2, buffer); diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index a152f6f6..00870ab9 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -154,7 +154,7 @@ inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id _counts.push_back(count); } -void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl) const { +inline void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl) const { auto size_ = size(); fs.write(reinterpret_cast(&size_), sizeof(std::size_t)); @@ -196,7 +196,7 @@ void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_l fs.flush(); } -void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { +inline void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { std::size_t size_{}; fs.read(reinterpret_cast(&size_), sizeof(std::size_t)); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp index 1d20e8d5..cbe072c7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/multires_cooler_impl.hpp @@ -135,11 +135,12 @@ inline File MultiResFile::copy_resolution(const File& clr) { template inline File MultiResFile::create_resolution(std::uint32_t resolution, Attributes attributes) { - attributes.bin_size = resolution; const auto base_resolution = compute_base_resolution(resolutions(), resolution); std::vector> buffer{500'000}; auto base_clr = open(base_resolution); + attributes.assembly = base_clr.attributes().assembly; + attributes.bin_size = resolution; { auto clr = File::create(init_resolution(resolution), base_clr.chromosomes(), resolution, attributes, DEFAULT_HDF5_CACHE_SIZE);