Skip to content

Commit

Permalink
Initial implementation of hictk fix-mcool
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Sep 29, 2023
1 parent 9ca5c9d commit e03bab3
Show file tree
Hide file tree
Showing 14 changed files with 331 additions and 42 deletions.
2 changes: 2 additions & 0 deletions src/hictk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp
Expand All @@ -28,6 +29,7 @@ target_sources(
${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp
${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/dump/dump.cpp
${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp
${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp
${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validate/validate.cpp
Expand Down
2 changes: 1 addition & 1 deletion src/hictk/balance/balance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c,
const std::vector<double>& scale) {
const auto& [file, grp] = cooler::parse_cooler_uri(uri);
const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path);
SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), file, path);

const HighFive::File clr(file, HighFive::File::ReadWrite);

Expand Down
11 changes: 11 additions & 0 deletions src/hictk/cli/cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ auto Cli::parse_arguments() -> Config {
_subcommand = subcommand::convert;
} else if (_cli.get_subcommand("dump")->parsed()) {
_subcommand = subcommand::dump;
} else if (_cli.get_subcommand("fix-mcool")->parsed()) {
_subcommand = subcommand::fix_mcool;

Check warning on line 35 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L35

Added line #L35 was not covered by tests
} else if (_cli.get_subcommand("load")->parsed()) {
_subcommand = subcommand::load;
} else if (_cli.get_subcommand("merge")->parsed()) {
Expand Down Expand Up @@ -77,6 +79,8 @@ std::string_view Cli::subcommand_to_str(subcommand s) noexcept {
return "convert";
case dump:
return "dump";
case fix_mcool:
return "fix-mcool";

Check warning on line 83 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L82-L83

Added lines #L82 - L83 were not covered by tests
case load:
return "load";
case merge:
Expand All @@ -100,6 +104,7 @@ void Cli::make_cli() {
make_balance_subcommand();
make_convert_subcommand();
make_dump_subcommand();
make_fix_mcool_subcommand();
make_load_subcommand();
make_merge_subcommand();
make_validate_subcommand();
Expand All @@ -117,6 +122,9 @@ void Cli::validate_args() const {
case dump:
validate_dump_subcommand();
break;
case fix_mcool:
validate_fix_mcool_subcommand();
break;

Check warning on line 127 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L125-L127

Added lines #L125 - L127 were not covered by tests
case load:
validate_load_subcommand();
break;
Expand Down Expand Up @@ -144,6 +152,9 @@ void Cli::transform_args() {
case dump:
transform_args_dump_subcommand();
break;
case fix_mcool:
transform_args_fix_mcool_subcommand();
break;

Check warning on line 157 in src/hictk/cli/cli.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli.cpp#L155-L157

Added lines #L155 - L157 were not covered by tests
case load:
transform_args_load_subcommand();
break;
Expand Down
8 changes: 4 additions & 4 deletions src/hictk/cli/cli_convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ void Cli::make_convert_subcommand() {
->check(CLI::Range(1, 4))
->capture_default_str();
sc.add_option(
"-p,--processes",
c.processes,
"Maximum number of parallel processes to spawn.\n"
"When converting from hic to cool, only two processes will be used.")
"-t,--threads",
c.threads,
"Maximum number of parallel threads to spawn.\n"
"When converting from hic to cool, only two threads will be used.")
->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency()))
->capture_default_str();
sc.add_option(
Expand Down
143 changes: 143 additions & 0 deletions src/hictk/cli/cli_fix_mcool.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>
#include <fmt/std.h>

#include <CLI/CLI.hpp>
#include <cassert>
#include <cstdint>
#include <string>
#include <thread>

#include "hictk/tools/cli.hpp"
#include "hictk/tools/config.hpp"

namespace hictk::tools {

/// Register the `fix-mcool` subcommand and bind each of its options/flags to the
/// corresponding field of the FixMcoolConfig stored in `_config`.
///
/// `_config` is set to a default-constructed FixMcoolConfig before the options are
/// registered so that std::get<FixMcoolConfig>() succeeds and the option bindings can
/// refer to its fields. The same assignment is repeated inside the preparse callback
/// that is installed on the subcommand.
void Cli::make_fix_mcool_subcommand() {
  auto& sc = *_cli.add_subcommand("fix-mcool", "Fix corrupted .mcool files.")
                  ->fallthrough()
                  ->preparse_callback([this]([[maybe_unused]] std::size_t i) {
                    // No other subcommand is expected to have claimed the config yet
                    assert(_config.index() == 0);
                    _config = FixMcoolConfig{};
                  });

  _config = FixMcoolConfig{};
  auto& c = std::get<FixMcoolConfig>(_config);

  // std::thread::hardware_concurrency() may return 0 when the value is not computable.
  // Clamp the upper bound to 1 so that the range used to validate --threads never
  // degenerates into the empty interval [1, 0], which would reject every value.
  const auto hw_threads = static_cast<std::uint32_t>(std::thread::hardware_concurrency());
  const auto max_threads = hw_threads == 0 ? std::uint32_t(1) : hw_threads;

  // clang-format off
  sc.add_option(
      "input",
      c.path_to_input,
      "Path to a corrupted .mcool file.")
      ->check(IsValidMultiresCoolerFile)
      ->required();
  sc.add_option(
      "output",
      c.path_to_output,
      "Path where to store the restored .mcool.")
      ->required();
  sc.add_option(
      "--tmpdir",
      c.tmp_dir,
      "Path to a folder where to store temporary data.")
      ->capture_default_str();
  sc.add_flag(
      "--skip-balancing",
      c.skip_balancing,
      "Do not recompute or copy balancing weights.");
  sc.add_flag(
      "--check-base-resolution",
      c.check_base_resolution,
      "Check whether the base resolution is corrupted.");
  sc.add_flag(
      "--in-memory",
      c.in_memory,
      "Store all interactions in memory while balancing (greatly improves performance).")
      ->capture_default_str();
  sc.add_option(
      "--chunk-size",
      c.chunk_size,
      "Number of interactions to process at once during balancing.\n"
      "Ignored when using --in-memory.")
      ->check(CLI::PositiveNumber)
      ->capture_default_str();
  sc.add_option(
      "-v,--verbosity",
      c.verbosity,
      "Set verbosity of output to the console.")
      ->check(CLI::Range(1, 4))
      ->capture_default_str();
  sc.add_option(
      "-t,--threads",
      c.threads,
      "Maximum number of parallel threads to spawn (only applies to balancing stage).")
      ->check(CLI::Range(std::uint32_t(1), max_threads))
      ->capture_default_str();
  sc.add_option(
      "-l,--compression-level",
      c.zstd_compression_lvl,
      "Compression level used to compress temporary files using ZSTD (only applies to balancing stage).")
      ->check(CLI::Range(0, 19))
      ->capture_default_str();
  sc.add_flag(
      "-f,--force",
      c.force,
      "Overwrite existing files (if any).")
      ->capture_default_str();
  // clang-format on
}

void Cli::validate_fix_mcool_subcommand() const {
const auto& c = std::get<FixMcoolConfig>(_config);
std::vector<std::string> errors;
std::vector<std::string> warnings{};

Check warning on line 96 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L93-L96

Added lines #L93 - L96 were not covered by tests

if (!c.force && std::filesystem::exists(c.path_to_output)) {
errors.emplace_back(fmt::format(
FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.path_to_output));

Check warning on line 100 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L98-L100

Added lines #L98 - L100 were not covered by tests
}

if (c.skip_balancing) {
const auto* sc = _cli.get_subcommand("fix-mcool");
if (!sc->get_option("--tmpdir")->empty()) {
warnings.emplace_back("option --tmpdir is ignored when --skip-balancing is provided.");

Check warning on line 106 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L103-L106

Added lines #L103 - L106 were not covered by tests
}
if (!sc->get_option("--in-memory")->empty()) {
warnings.emplace_back("option --in-memory is ignored when --skip-balancing is provided.");

Check warning on line 109 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L108-L109

Added lines #L108 - L109 were not covered by tests
}
if (!sc->get_option("--compression-level")->empty()) {
warnings.emplace_back(

Check warning on line 112 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L111-L112

Added lines #L111 - L112 were not covered by tests
"option --compression-level is ignored when --skip-balancing is provided.");
}
if (!sc->get_option("--chunk-size")->empty()) {
warnings.emplace_back("option --chunk-size is ignored when --skip-balancing is provided.");

Check warning on line 116 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L115-L116

Added lines #L115 - L116 were not covered by tests
}
if (!sc->get_option("--threads")->empty()) {
warnings.emplace_back("option --threads is ignored when --skip-balancing is provided.");

Check warning on line 119 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L118-L119

Added lines #L118 - L119 were not covered by tests
}
}

for (const auto& w : warnings) {
SPDLOG_WARN(FMT_STRING("{}"), w);

Check warning on line 124 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L123-L124

Added lines #L123 - L124 were not covered by tests
}

if (!errors.empty()) {
throw std::runtime_error(fmt::format(
FMT_STRING(

Check warning on line 129 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L127-L129

Added lines #L127 - L129 were not covered by tests
"The following error(s) where encountered while validating CLI arguments:\n - {}"),
fmt::join(errors, "\n - ")));

Check warning on line 131 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L131

Added line #L131 was not covered by tests
}
}

void Cli::transform_args_fix_mcool_subcommand() {
auto& c = std::get<FixMcoolConfig>(_config);

Check warning on line 136 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L135-L136

Added lines #L135 - L136 were not covered by tests

// in spdlog, high numbers correspond to low log levels
assert(c.verbosity > 0 && c.verbosity < 5);
c.verbosity = static_cast<std::uint8_t>(spdlog::level::critical) - c.verbosity;

Check warning on line 140 in src/hictk/cli/cli_fix_mcool.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/cli/cli_fix_mcool.cpp#L139-L140

Added lines #L139 - L140 were not covered by tests
}

} // namespace hictk::tools
20 changes: 10 additions & 10 deletions src/hictk/convert/cool_to_hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,29 +89,29 @@ static std::size_t dump_pixels_plain(const cooler::File& clr, const std::filesys
// Launch the executable returned by find_pigz() as a child process.
//
// The child reads its standard input from `pipe` and has its standard output redirected
// to the file at `dest`. It is invoked with two arguments: "-<compression_lvl>" and
// "--processes <N>", where N is the worker count received as the last parameter.
// Both the compression level and the worker count are asserted to be non-zero.
//
// Returns an owning pointer to the spawned boost::process::child; the result is marked
// [[nodiscard]] so callers cannot silently drop the handle.
template <typename Pipe>
[[nodiscard]] static std::unique_ptr<boost::process::child> run_pigz(
Pipe& pipe, const std::filesystem::path& dest, std::uint8_t compression_lvl,
std::size_t processes) {
std::size_t threads) {
assert(compression_lvl != 0);
assert(processes != 0);
assert(threads != 0);

Check warning on line 94 in src/hictk/convert/cool_to_hic.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/convert/cool_to_hic.cpp#L94

Added line #L94 was not covered by tests
// clang-format off
return std::make_unique<boost::process::child>(
find_pigz().string(),
fmt::format(FMT_STRING("-{}"), compression_lvl),
"--processes", fmt::to_string(processes),
"--processes", fmt::to_string(threads),
boost::process::std_in < pipe,
boost::process::std_out > dest.string()
);
// clang-format on
}

static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesystem::path& dest,
std::uint8_t compression_lvl, std::size_t processes,
std::uint8_t compression_lvl, std::size_t threads,
std::size_t update_frequency = 10'000'000) {
assert(compression_lvl != 0);
assert(processes > 1);
assert(threads > 1);

Check warning on line 110 in src/hictk/convert/cool_to_hic.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/convert/cool_to_hic.cpp#L110

Added line #L110 was not covered by tests

boost::asio::io_context ioc;
boost::process::async_pipe pipe{ioc};
const auto pigz = run_pigz(pipe, dest, compression_lvl, processes - 1);
const auto pigz = run_pigz(pipe, dest, compression_lvl, threads - 1);

Check warning on line 114 in src/hictk/convert/cool_to_hic.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/convert/cool_to_hic.cpp#L114

Added line #L114 was not covered by tests

auto t0 = std::chrono::steady_clock::now();
std::string buffer;
Expand Down Expand Up @@ -178,15 +178,15 @@ static std::size_t dump_pixels_pigz(const cooler::File& clr, const std::filesyst
}

static void dump_pixels(const cooler::File& clr, const std::filesystem::path& dest,
std::uint8_t compression_lvl, std::size_t processes) {
std::uint8_t compression_lvl, std::size_t threads) {
const auto t0 = std::chrono::steady_clock::now();

SPDLOG_INFO(FMT_STRING("writing pixels to file {}..."), dest);

std::size_t pixels_processed{};
if (dest.extension() == ".gz") {
assert(compression_lvl != 0);
pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, processes);
pixels_processed = dump_pixels_pigz(clr, dest, compression_lvl, threads);

Check warning on line 189 in src/hictk/convert/cool_to_hic.cpp

View check run for this annotation

Codecov / codecov/patch

src/hictk/convert/cool_to_hic.cpp#L189

Added line #L189 was not covered by tests
} else {
pixels_processed = dump_pixels_plain(clr, dest);
}
Expand Down Expand Up @@ -277,12 +277,12 @@ void cool_to_hic(const ConvertConfig& c) {

const cooler::File clr(uri);
dump_chrom_sizes(clr, chrom_sizes);
dump_pixels(clr, pixels, c.gzip_compression_lvl, c.processes);
dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads);
}

auto t1 = std::chrono::steady_clock::now();
SPDLOG_INFO(FMT_STRING("running juicer_tools pre..."));
process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.processes);
process = run_juicer_tools_pre(c, chrom_sizes, pixels, c.threads);
process->wait();
if (process->exit_code() != 0) {
throw std::runtime_error(fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"),
Expand Down
Loading

0 comments on commit e03bab3

Please sign in to comment.