diff --git a/.clang-tidy b/.clang-tidy index 9ec34b36..016e969d 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,9 +17,11 @@ Checks: > -cppcoreguidelines-pro-bounds-constant-array-index, -hicpp-no-array-decay, -misc-no-recursion, + -misc-use-anonymous-namespace, -modernize-use-trailing-return-type, -readability-identifier-length, - -readability-magic-numbers + -readability-magic-numbers, + -readability-static-definition-in-anonymous-namespace WarningsAsErrors: '' HeaderFilterRegex: '' FormatStyle: none diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index e4ef6830..fb62099e 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -190,6 +190,8 @@ jobs: - name: Run integration tests run: | + test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar + test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index baf850eb..5c8a0857 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -362,6 +362,10 @@ jobs: zstd -dcf binaries.tar.zst | tar -xf - tar -xf test/data/hictk_test_data.tar.xz + - name: Test hictk balance + run: | + test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + - name: Test hictk dump chroms run: | test/scripts/hictk_dump_chroms.sh bin/hictk diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 19bf42eb..17953490 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -415,6 +415,10 @@ jobs: zstd -dcf binaries.tar.zst | tar -xf - tar -xf test/data/hictk_test_data.tar.xz + - name: Test hictk balance + run: | + test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + - name: Test hictk dump chroms run: | test/scripts/hictk_dump_chroms.sh bin/hictk diff --git a/CMakeLists.txt b/CMakeLists.txt index 50d31243..0e43db82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,8 @@ target_compile_features(hictk_project_options INTERFACE "cxx_std_${CMAKE_CXX_STA target_compile_definitions(hictk_project_options INTERFACE FMT_HEADER_ONLY FMT_ENFORCE_COMPILE_STRING) # Tweak spdlog target_compile_definitions(hictk_project_options INTERFACE SPDLOG_FMT_EXTERNAL) +#Tweak xxHash +target_compile_definitions(hictk_project_options INTERFACE XXH_INLINE_ALL) if(WIN32) target_compile_definitions(hictk_project_options INTERFACE NOMINMAX _CRT_SECURE_NO_WARNINGS) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 80be7212..856a3b9d 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/record/8204309/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=f9d0ccc8f8c7bade7f9049ea0abb6b26c061e6fed9e1e7bb8f2da0e1a402cfa4 + DOWNLOAD https://zenodo.org/record/8386066/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=77e2b9186d9edb90a8436fc41d6c0632727df3af95acb74e9635d1c0f29f2b8d "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/conanfile.txt b/conanfile.txt index 70a9cfda..87e5db24 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -4,6 +4,7 @@ [requires] boost/1.83.0#f0c3932db7f65b606ed78357ecbdcbef +bshoshany-thread-pool/3.5.0#3c9fd1e21a688432b7f31b40d2d168ee cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7 eigen/3.4.0#2e192482a8acff96fe34766adca2b24c 
fast_float/5.2.0#9bf1a3fac625789f2b571d43efb8013b @@ -13,7 +14,10 @@ highfive/2.7.1#a73bc6937c9add30c9d47a7a70a466eb libdeflate/1.18#3697b637656a9af04cabcbed50db9a7e parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83 +span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22 spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847 +xxhash/0.8.1#7bf1cd1fe3d31f1fbcc758d93c907e8d +zstd/1.5.5#93372fe14bb7883bd4de82914e0a1841 [generators] CMakeDeps @@ -69,3 +73,4 @@ highfive*:with_eigen=False highfive*:with_opencv=False highfive*:with_xtensor=False spdlog*:header_only=True +xxhash*:utility=False diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index 3033984d..e2b8fb8d 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -16,12 +16,14 @@ target_sources( hictk PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/convert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp new file mode 100644 index 00000000..4b3fab75 --- /dev/null +++ b/src/hictk/balance/balance.cpp @@ -0,0 +1,209 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include + +#include "hictk/balancing/ice.hpp" +#include "hictk/balancing/methods.hpp" +#include "hictk/cooler.hpp" +#include "hictk/file.hpp" +#include "hictk/hic.hpp" +#include "hictk/tools/common.hpp" +#include "hictk/tools/config.hpp" +#include "hictk/tools/juicer_tools.hpp" + +namespace hictk::tools { + +static void write_weights_hic(const hic::File& hf, const BalanceConfig& c, + const std::vector& weights) { + auto tmpfile = c.tmp_dir / std::filesystem::path{hf.name()}.filename(); + for (std::size_t i = 0; i < 1024; ++i) { + if (!std::filesystem::exists(tmpfile)) { + break; + } + + tmpfile.replace_extension(".tmp" + std::to_string(i)); + } + + if (std::filesystem::exists(tmpfile)) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile)); + } + + try { + { + const std::unique_ptr f(std::fopen(tmpfile.string().c_str(), "ae")); + if (!bool(f)) { + throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), tmpfile); + } + + std::ptrdiff_t i0 = 0; + + for (const auto& chrom : hf.chromosomes()) { + if (chrom.is_all()) { + continue; + } + fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), c.name, chrom.name(), + hf.bin_size()); + + const auto num_bins = (chrom.size() + hf.bin_size() - 1) / hf.bin_size(); + const auto i1 = i0 + static_cast(num_bins); + std::for_each(weights.begin() + i0, weights.begin() + i1, [&](const double w) { + std::isnan(w) ? 
fmt::print(f.get(), FMT_COMPILE(".\n")) + : fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w); + if (!bool(f)) { // NOLINT + throw fmt::system_error( + errno, FMT_STRING("an error occurred while writing weights to file {}"), tmpfile); + } + }); + + i0 = i1; + } + } + + auto jt = run_juicer_tools_add_norm(c.juicer_tools_jar, tmpfile, hf.url(), c.juicer_tools_xmx); + jt->wait(); + if (jt->exit_code() != 0) { + throw std::runtime_error( + fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), jt->exit_code())); + } + } catch (...) { + std::error_code ec{}; + std::filesystem::remove(tmpfile, ec); + } + std::filesystem::remove(tmpfile); +} + +static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, + const std::vector& weights, + const std::vector& variance, + const std::vector& scale) { + const auto& [file, grp] = cooler::parse_cooler_uri(uri); + const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name); + SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path); + + const HighFive::File clr(file, HighFive::File::ReadWrite); + + if (clr.exist(path)) { + assert(c.force); + clr.unlink(path); + } + + cooler::Dataset dset(cooler::RootGroup{clr.getGroup(grp)}, path, 0.0); + dset.append(weights); + + dset.write_attribute("cis_only", c.mode == "cis"); + dset.write_attribute("divisive_weights", false); + dset.write_attribute("ignore_diags", std::int64_t(c.masked_diags)); + dset.write_attribute("mad_max", std::int64_t(c.mad_max)); + dset.write_attribute("min_count", std::int64_t(c.min_count)); + dset.write_attribute("min_nnz", std::int64_t(c.min_nnz)); + dset.write_attribute("tol", c.tolerance); + + if (c.mode != "cis") { + dset.write_attribute("converged", variance.front() < c.tolerance); + dset.write_attribute("scale", scale.front()); + dset.write_attribute("var", variance.front()); + } else { + std::vector converged{}; + for (const auto& var : variance) { + converged.push_back(var < c.tolerance); // NOLINT + } + dset.write_attribute("converged", converged); + dset.write_attribute("scale", scale); + dset.write_attribute("var", variance); + } +} + +static int balance_singleres_file(File&& f, const BalanceConfig& c) { + std::filesystem::path tmpfile{}; + + if (!c.force && !c.stdout_ && f.has_normalization(c.name)) { + throw std::runtime_error( + fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. 
Pass " + "--force to overwrite existing weights."), + c.name, f.path())); + } + + if (!c.in_memory) { + tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename(); + for (std::size_t i = 0; i < 1024; ++i) { + if (!std::filesystem::exists(tmpfile)) { + break; + } + + tmpfile.replace_extension(".tmp" + std::to_string(i)); + } + + if (std::filesystem::exists(tmpfile)) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile)); + } + } + + const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags, + c.min_nnz, c.min_count, c.mad_max, + tmpfile, c.chunk_size, c.threads}; + balancing::ICE::Type mode{}; + if (c.mode == "gw") { + mode = balancing::ICE::Type::gw; + } else if (c.mode == "cis") { + mode = balancing::ICE::Type::cis; + } else { + mode = balancing::ICE::Type::trans; + } + + const auto balancer = + std::visit([&](const auto& ff) { return balancing::ICE(ff, mode, params); }, f.get()); + const auto weights = balancer.get_weights(c.rescale_marginals); + + if (c.stdout_) { + std::for_each(weights.begin(), weights.end(), + [&](const auto w) { fmt::print(FMT_COMPILE("{}\n"), w); }); + return 0; + } + + if (f.is_cooler()) { + const auto uri = f.uri(); + f.get().close(); + write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale()); + return 0; + } + + write_weights_hic(f.get(), c, weights); + + return 0; +} + +static int balance_multires(const BalanceConfig& c) { + const auto resolutions = cooler::MultiResFile(c.path_to_input.string()).resolutions(); + + for (const auto& res : resolutions) { + balance_singleres_file( + File(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res)), c); + } + return 0; +} + +int balance_subcmd(const BalanceConfig& c) { + if (cooler::utils::is_multires_file(c.path_to_input.string())) { + return balance_multires(c); + } + + std::vector resolutions{}; + if (hic::utils::is_hic_file(c.path_to_input)) { + resolutions = hic::utils::list_resolutions(c.path_to_input); + } else { + resolutions.push_back(File(c.path_to_input.string()).bin_size()); + } + + for (const auto& res : resolutions) { + balance_singleres_file(File(c.path_to_input.string(), res), c); + } + + return 0; +} +} // namespace hictk::tools diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index 1a5b057b..fa044d93 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -25,7 +25,9 @@ auto Cli::parse_arguments() -> Config { _cli.name(_exec_name); _cli.parse(_argc, _argv); - if (_cli.get_subcommand("convert")->parsed()) { + if (_cli.get_subcommand("balance")->parsed()) { + _subcommand = subcommand::balance; + } else if (_cli.get_subcommand("convert")->parsed()) { _subcommand = subcommand::convert; } else if (_cli.get_subcommand("dump")->parsed()) { _subcommand = subcommand::dump; @@ -69,6 +71,8 @@ int Cli::exit(const CLI::ParseError& e) const { return _cli.exit(e); } std::string_view Cli::subcommand_to_str(subcommand s) noexcept { switch (s) { + case balance: + return "balance"; case convert: return "convert"; case dump: @@ -93,6 +97,7 @@ void Cli::make_cli() { _cli.set_version_flag("-V,--version", std::string{hictk::config::version::str_long()}); _cli.require_subcommand(1); + make_balance_subcommand(); make_convert_subcommand(); make_dump_subcommand(); make_load_subcommand(); @@ -103,6 +108,9 @@ void Cli::make_cli() { void Cli::validate_args() const { switch (_subcommand) { + case balance: + validate_balance_subcommand(); + break; case convert: validate_convert_subcommand(); break; 
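// For reference, a minimal sketch (not part of the patch) showing how the balancing API wired up
// above can be driven directly, mirroring what balance_subcmd() does when --stdout is passed.
// The input path "matrix.cool" is a placeholder and error handling is omitted.
#include <variant>

#include <fmt/format.h>

#include "hictk/balancing/ice.hpp"
#include "hictk/file.hpp"

void print_gw_weights_example() {
  hictk::File f("matrix.cool");  // single-resolution file; pass a resolution for .hic/.mcool
  // f.get() returns a variant holding either a cooler::File or a hic::File; ICE accepts both.
  const auto balancer = std::visit(
      [](const auto& ff) { return hictk::balancing::ICE(ff, hictk::balancing::ICE::Type::gw); },
      f.get());
  for (const auto w : balancer.get_weights()) {
    fmt::print(FMT_STRING("{}\n"), w);  // one weight per bin, NaN for masked bins
  }
}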
@@ -127,6 +135,9 @@ void Cli::validate_args() const {
 
 void Cli::transform_args() {
   switch (_subcommand) {
+    case balance:
+      transform_args_balance_subcommand();
+      break;
     case convert:
       transform_args_convert_subcommand();
       break;
diff --git a/src/hictk/cli/cli_balance.cpp b/src/hictk/cli/cli_balance.cpp
new file mode 100644
index 00000000..7015b4f2
--- /dev/null
+++ b/src/hictk/cli/cli_balance.cpp
@@ -0,0 +1,179 @@
+// Copyright (C) 2023 Roberto Rossini
+//
+// SPDX-License-Identifier: MIT
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "hictk/tools/cli.hpp"
+#include "hictk/tools/config.hpp"
+
+namespace hictk::tools {
+
+void Cli::make_balance_subcommand() {
+  auto& sc = *_cli.add_subcommand("balance", "Balance Hi-C matrices using ICE.")
+                  ->fallthrough()
+                  ->preparse_callback([this]([[maybe_unused]] std::size_t i) {
+                    assert(_config.index() == 0);
+                    _config = BalanceConfig{};
+                  });
+
+  _config = BalanceConfig{};
+  auto& c = std::get<BalanceConfig>(_config);
+
+  // clang-format off
+  sc.add_option(
+      "input",
+      c.path_to_input,
+      "Path to the .hic, .cool or .mcool file to be balanced.")
+      ->check(IsValidHiCFile | IsValidCoolerFile | IsValidMultiresCoolerFile)
+      ->required();
+  sc.add_option(
+      "--mode",
+      c.mode,
+      "Balance the matrix using:\n"
+      " - genome-wide interactions (gw)\n"
+      " - trans-only interactions (trans)\n"
+      " - cis-only interactions (cis)")
+      ->check(CLI::IsMember({"gw", "trans", "cis"}))
+      ->capture_default_str();
+  sc.add_option(
+      "--tmpdir",
+      c.tmp_dir,
+      "Path to a folder used to store temporary data.")
+      ->capture_default_str();
+  sc.add_option(
+      "--ignore-diags",
+      c.masked_diags,
+      "Number of diagonals (including the main diagonal) to mask before balancing.")
+      ->capture_default_str();
+  sc.add_option(
+      "--mad-max",
+      c.mad_max,
+      "Mask bins using the MAD-max filter, i.e. bins whose\n"
+      "log marginal sum is more than --mad-max median absolute\n"
+      "deviations below the median log marginal sum of all\n"
+      "the bins in the same chromosome.")
+      ->check(CLI::NonNegativeNumber)
+      ->capture_default_str();
+  sc.add_option(
+      "--min-nnz",
+      c.min_nnz,
+      "Mask rows with fewer than --min-nnz non-zero entries.")
+      ->capture_default_str();
+  sc.add_option(
+      "--min-count",
+      c.min_count,
+      "Mask rows with fewer than --min-count interactions.")
+      ->capture_default_str();
+  sc.add_option(
+      "--tolerance",
+      c.tolerance,
+      "Threshold of the variance of marginals used to determine whether\n"
+      "the algorithm has converged.")
+      ->check(CLI::NonNegativeNumber)
+      ->capture_default_str();
+  sc.add_option(
+      "--max-iters",
+      c.max_iters,
+      "Maximum number of iterations.")
+      ->check(CLI::PositiveNumber)
+      ->capture_default_str();
+  sc.add_flag(
+      "--rescale-weights,!--no-rescale-weights",
+      c.rescale_marginals,
+      "Rescale weights such that rows sum approximately to 2.")
+      ->capture_default_str();
+  sc.add_option(
+      "--name",
+      c.name,
+      "Name to use when writing weights to file.")
+      ->capture_default_str();
+  sc.add_flag(
+      "--in-memory",
+      c.in_memory,
+      "Store all interactions in memory (greatly improves performance).")
+      ->capture_default_str();
+  sc.add_flag(
+      "--stdout",
+      c.stdout_,
+      "Write balancing weights to stdout instead of writing them to the input file.")
+      ->capture_default_str();
+  sc.add_option(
+      "--chunk-size",
+      c.chunk_size,
+      "Number of interactions to process at once. "
+      "Ignored when using --in-memory.")
+      ->check(CLI::PositiveNumber)
+      ->capture_default_str();
+  sc.add_option(
+      "-v,--verbosity",
+      c.verbosity,
+      "Set verbosity of output to the console.")
+      ->check(CLI::Range(1, 4))
+      ->capture_default_str();
+  sc.add_option(
+      "-t,--threads",
+      c.threads,
+      "Maximum number of parallel threads to spawn.")
+      ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency()))
+      ->capture_default_str();
+  sc.add_option(
+      "-l,--compression-level",
+      c.zstd_compression_lvl,
+      "Compression level used to compress temporary files using ZSTD.")
+      ->check(CLI::Range(0, 19))
+      ->capture_default_str();
+  sc.add_option(
+      "--juicer-tools-jar",
+      c.juicer_tools_jar,
+      "Path to juicer_tools or hic_tools JAR.")
+      ->check(CLI::ExistingFile);
+  sc.add_option(
+      "--juicer-tools-memory",
+      c.juicer_tools_xmx,
+      "Max heap size used by juicer_tools.")
+      ->default_str(fmt::format(FMT_STRING("{:.0f}MB"), double(c.juicer_tools_xmx) / 1.0e6))
+      ->check(CLI::PositiveNumber)
+      ->transform(CLI::AsSizeValue(true));
+  sc.add_flag(
+      "-f,--force",
+      c.force,
+      "Overwrite existing files and datasets (if any).")
+      ->capture_default_str();
+  // clang-format on
+}
+
+void Cli::validate_balance_subcommand() const {
+  const auto& c = std::get<BalanceConfig>(_config);
+  std::vector<std::string> errors;
+
+  const auto juicer_tools_jar_parsed =
+      !_cli.get_subcommand("balance")->get_option("--juicer-tools-jar")->empty();
+  if (hic::utils::is_hic_file(c.path_to_input) && !c.stdout_ && !juicer_tools_jar_parsed) {
+    errors.emplace_back(
+        "option --juicer-tools-jar is required when balancing files in .hic format.");
+  }
+
+  if (!errors.empty()) {
+    throw std::runtime_error(fmt::format(
+        FMT_STRING(
+            "The following error(s) were encountered while validating CLI arguments:\n - {}"),
+        fmt::join(errors, "\n - ")));
+  }
+}
+
+void Cli::transform_args_balance_subcommand() {
+  auto& c = std::get<BalanceConfig>(_config);
+
+  // in spdlog, high numbers correspond to low log levels
+  assert(c.verbosity > 0 && c.verbosity < 5);
+  c.verbosity = static_cast<std::uint8_t>(spdlog::level::critical) - c.verbosity;
+}
+
+}  // namespace hictk::tools
diff --git a/src/hictk/cli/cli_convert.cpp b/src/hictk/cli/cli_convert.cpp
index a5374d3f..df7dd01c 100644
--- a/src/hictk/cli/cli_convert.cpp
+++ b/src/hictk/cli/cli_convert.cpp
@@ -95,7 +95,7 @@ void Cli::make_convert_subcommand() {
       c.processes,
       "Maximum number of parallel processes to spawn.\n"
       "When converting from hic to cool, only two processes will be used.")
-      ->check(CLI::Range(2, 1024))
+      ->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency()))
       ->capture_default_str();
   sc.add_option(
       "-l,--compression-level",
diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp
index 99534a0f..97b41763 100644
--- a/src/hictk/convert/cool_to_hic.cpp
+++ b/src/hictk/convert/cool_to_hic.cpp
@@ -16,62 +16,16 @@
 #include "hictk/fmt.hpp"
 #include "hictk/tmpdir.hpp"
+#include "hictk/tools/common.hpp"
 #include "hictk/tools/config.hpp"
-
-namespace std {
-template <>
-struct default_delete<FILE> {
-  void operator()(FILE* file) const { std::fclose(file); }  // NOLINT
-};
-}  // namespace std
+#include "hictk/tools/juicer_tools.hpp"
 
 namespace hictk::tools {
 
-[[nodiscard]] static std::filesystem::path find_java() {
-  auto java = boost::process::search_path("java");
-  if (java.empty()) {
-    throw std::runtime_error("unable to find java in your PATH");
-  }
-  return java.string();
-}
-
 [[maybe_unused]] [[nodiscard]] static std::filesystem::path find_pigz() { return
boost::process::search_path("pigz").string(); } -[[nodiscard]] static std::vector generate_juicer_tools_pre_args( - const ConvertConfig& c, const std::filesystem::path& path_to_pixels, - const std::filesystem::path& path_to_chrom_sizes, std::size_t processes) { - assert(processes != 0); - return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), - "-jar", - c.juicer_tools_jar.string(), - "pre", - "-j", - fmt::to_string(processes), - "-t", - c.tmp_dir.string(), - "-n", - "-r", - fmt::format(FMT_STRING("{}"), fmt::join(c.resolutions, ",")), - path_to_pixels.string(), - c.path_to_output.string(), - path_to_chrom_sizes.string()}; -} - -[[nodiscard]] static std::vector generate_juicer_tools_add_norm_args( - const ConvertConfig& c, const std::filesystem::path& path_to_weights, std::size_t processes) { - assert(processes != 0); - return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), - "-jar", - c.juicer_tools_jar.string(), - "addNorm", - "-j", - fmt::to_string(processes), - c.path_to_output.string(), - path_to_weights.string()}; -} - static void dump_chrom_sizes(const cooler::File& clr, const std::filesystem::path& dest) { SPDLOG_INFO(FMT_STRING("writing chromosomes to file {}..."), dest); const std::unique_ptr f(std::fopen(dest.string().c_str(), "we")); @@ -250,7 +204,7 @@ static bool dump_weights(std::uint32_t resolution, std::string_view cooler_uri, const cooler::File clr(cooler_uri); assert(clr.bin_size() == resolution); - if (!clr.has_weights("weight")) { + if (!clr.has_normalization("weight")) { SPDLOG_WARN(FMT_STRING("[{}] unable to read weights from \"{}\"..."), resolution, cooler_uri); return false; } @@ -296,19 +250,6 @@ static bool dump_weights(const ConvertConfig& c, const std::filesystem::path& we return cooler_has_weights; } -[[nodiscard]] static std::unique_ptr run_juicer_tools_pre( - const ConvertConfig& c, const std::filesystem::path& chrom_sizes, - const std::filesystem::path& pixels, std::size_t processes) { - const auto cmd = generate_juicer_tools_pre_args(c, pixels, chrom_sizes, processes); - return std::make_unique(find_java().string(), cmd); -} - -[[nodiscard]] static std::unique_ptr run_juicer_tools_add_norm( - const ConvertConfig& c, const std::filesystem::path& path_to_weights, std::size_t processes) { - const auto cmd = generate_juicer_tools_add_norm_args(c, path_to_weights, processes); - return std::make_unique(find_java().string(), cmd); -} - void cool_to_hic(const ConvertConfig& c) { static const internal::TmpDir tmpdir{}; @@ -365,7 +306,8 @@ void cool_to_hic(const ConvertConfig& c) { if (weight_file_has_data) { t1 = std::chrono::steady_clock::now(); SPDLOG_INFO(FMT_STRING("running juicer_tools addNorm...")); - process = run_juicer_tools_add_norm(c, weights, c.processes); + process = run_juicer_tools_add_norm(c.juicer_tools_jar, weights, c.path_to_output, + c.juicer_tools_xmx); process->wait(); if (process->exit_code() != 0) { throw std::runtime_error(fmt::format( diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 6170ead7..f0823632 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -192,6 +192,7 @@ class Cli { public: enum subcommand { help, + balance, convert, dump, load, @@ -216,6 +217,7 @@ class Cli { CLI::App _cli{}; subcommand _subcommand{subcommand::help}; + void make_balance_subcommand(); void make_convert_subcommand(); void make_dump_subcommand(); void make_load_subcommand(); @@ -224,6 +226,7 @@ class Cli { void 
make_zoomify_subcommand(); void make_cli(); + void validate_balance_subcommand() const; void validate_convert_subcommand() const; void validate_dump_subcommand() const; void validate_load_subcommand() const; @@ -231,6 +234,7 @@ class Cli { void validate_zoomify_subcommand() const; void validate_args() const; + void transform_args_balance_subcommand(); void transform_args_convert_subcommand(); void transform_args_dump_subcommand(); void transform_args_load_subcommand(); diff --git a/src/hictk/include/hictk/tools/common.hpp b/src/hictk/include/hictk/tools/common.hpp new file mode 100644 index 00000000..b161df1f --- /dev/null +++ b/src/hictk/include/hictk/tools/common.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +namespace std { +template <> +struct default_delete { + void operator()(FILE* file) const { std::fclose(file); } // NOLINT +}; +} // namespace std diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 2b1955a5..f6dd2e5f 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -14,6 +14,31 @@ namespace hictk::tools { +struct BalanceConfig { + std::filesystem::path path_to_input{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; + std::filesystem::path juicer_tools_jar{}; + + std::string mode{"gw"}; + std::size_t masked_diags{2}; + double mad_max{5.0}; + std::size_t min_nnz{10}; + std::size_t min_count{0}; + double tolerance{1.0e-5}; + std::size_t max_iters{500}; + bool rescale_marginals{true}; + std::string name{"weight"}; + bool in_memory{false}; + bool stdout_{false}; + std::uint8_t zstd_compression_lvl{3}; + std::size_t threads{1}; + std::size_t chunk_size{10'000'000}; + std::size_t juicer_tools_xmx{256'000'000}; + + std::uint8_t verbosity{4}; + bool force{false}; +}; + struct ConvertConfig { std::filesystem::path path_to_input{}; std::filesystem::path path_to_output{}; @@ -105,6 +130,7 @@ struct ZoomifyConfig { // clang-format off using Config = std::variant +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +[[nodiscard]] inline std::filesystem::path find_java() { + auto java = boost::process::search_path("java"); + if (java.empty()) { + throw std::runtime_error("unable to find java in your PATH"); + } + return java.string(); +} + +[[nodiscard]] inline std::vector generate_juicer_tools_pre_args( + const ConvertConfig& c, const std::filesystem::path& path_to_pixels, + const std::filesystem::path& path_to_chrom_sizes, std::size_t processes) { + assert(processes != 0); + return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), + "-jar", + c.juicer_tools_jar.string(), + "pre", + "-j", + fmt::to_string(processes), + "-t", + c.tmp_dir.string(), + "-n", + "-r", + fmt::format(FMT_STRING("{}"), fmt::join(c.resolutions, ",")), + path_to_pixels.string(), + c.path_to_output.string(), + path_to_chrom_sizes.string()}; +} + +[[nodiscard]] inline std::vector generate_juicer_tools_add_norm_args( + const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, + const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { + return {fmt::format(FMT_STRING("-Xmx{}M"), juicer_tools_xmx / 1'000'000), + "-jar", + juicer_tools_jar.string(), + "addNorm", + "-j", + "1", + 
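          // The assembled command line ends up looking like (with the 256 MB default heap):
          //   java -Xmx256M -jar hic_tools.jar addNorm -j 1 output.hic weights.txt
          // where the jar, .hic and weights paths are whatever the caller passed in
          // (the file names above are purely illustrative).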
path_to_output.string(), + path_to_weights.string()}; +} + +[[nodiscard]] inline std::unique_ptr run_juicer_tools_pre( + const ConvertConfig& c, const std::filesystem::path& chrom_sizes, + const std::filesystem::path& pixels, std::size_t processes) { + const auto cmd = generate_juicer_tools_pre_args(c, pixels, chrom_sizes, processes); + return std::make_unique(find_java().string(), cmd); +} + +[[nodiscard]] inline std::unique_ptr run_juicer_tools_add_norm( + const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, + const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { + const auto cmd = generate_juicer_tools_add_norm_args(juicer_tools_jar, path_to_weights, + path_to_output, juicer_tools_xmx); + return std::make_unique(find_java().string(), cmd); +} + +} // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/tools.hpp b/src/hictk/include/hictk/tools/tools.hpp index 062203a3..3dae7308 100644 --- a/src/hictk/include/hictk/tools/tools.hpp +++ b/src/hictk/include/hictk/tools/tools.hpp @@ -8,6 +8,7 @@ namespace hictk::tools { +[[nodiscard]] int balance_subcmd(const BalanceConfig& c); [[nodiscard]] int convert_subcmd(const ConvertConfig& c); [[nodiscard]] int dump_subcmd(const DumpConfig& c); [[nodiscard]] int load_subcmd(const LoadConfig& c); diff --git a/src/hictk/main.cpp b/src/hictk/main.cpp index 19673e20..e27103a9 100644 --- a/src/hictk/main.cpp +++ b/src/hictk/main.cpp @@ -99,6 +99,8 @@ int main(int argc, char** argv) noexcept { } using sc = Cli::subcommand; switch (subcmd) { + case sc::balance: + return balance_subcmd(std::get(config)); case sc::convert: return convert_subcmd(std::get(config)); case sc::dump: diff --git a/src/libhictk/balancing/CMakeLists.txt b/src/libhictk/balancing/CMakeLists.txt index b7fbd58a..e6325e44 100644 --- a/src/libhictk/balancing/CMakeLists.txt +++ b/src/libhictk/balancing/CMakeLists.txt @@ -2,7 +2,11 @@ # # SPDX-License-Identifier: MIT +find_package(bshoshany-thread-pool REQUIRED) find_package(phmap REQUIRED) +find_package(span-lite REQUIRED) +find_package(xxHash REQUIRED) +find_package(zstd REQUIRED) add_library(balancing INTERFACE) add_library(hictk::balancing ALIAS balancing) @@ -18,4 +22,13 @@ target_include_directories(balancing INTERFACE "$") target_link_libraries(balancing INTERFACE hictk::common hictk::pixel) -target_link_system_libraries(balancing INTERFACE phmap) +target_link_system_libraries( + balancing + INTERFACE + bshoshany-thread-pool::bshoshany-thread-pool + nonstd::span-lite + phmap + xxHash::xxhash + "zstd::libzstd_$,shared,static>") + +target_compile_definitions(balancing INTERFACE span_FEATURE_MAKE_SPAN=1) diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp new file mode 100644 index 00000000..021745e0 --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -0,0 +1,158 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include "hictk/balancing/sparse_matrix.hpp" +#include "hictk/bin_table.hpp" + +namespace hictk::balancing { + +class ICE { + std::vector _chrom_offsets{}; + std::vector _biases{}; + std::vector _variance{}; + std::vector _scale{}; + + struct Result { + double scale{}; + double variance{}; + }; + + public: + enum Type { cis, trans, gw }; + + struct Params { + double tol{1.0e-5}; + std::size_t max_iters{200}; + std::size_t num_masked_diags{2}; + std::size_t min_nnz{10}; + 
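    // Note: an empty `tmpfile` (declared below) selects the fully in-memory code path
    // (hictk balance --in-memory); otherwise interactions are staged to `tmpfile` in
    // ZSTD-compressed chunks of `chunk_size` pixels and streamed back during balancing.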
std::size_t min_count{0}; + double mad_max{5.0}; + std::filesystem::path tmpfile{}; + std::size_t chunk_size{10'000'000}; + std::size_t threads{1}; + }; + + // NOLINTNEXTLINE + inline static const Params DefaultParams{1.0e-5, 200, 2, 10, 0, 5.0, "", 10'000'000, 1}; + + template + explicit ICE(const File& f, Type type = Type::gw, const Params& params = DefaultParams); + + [[nodiscard]] std::vector get_weights(bool rescale = true) const; + [[nodiscard]] std::vector scale() const noexcept; + [[nodiscard]] std::vector variance() const noexcept; + + private: + template + void balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, + double mad_max, BS::thread_pool* tpool); + + template + void balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, + double mad_max, const std::filesystem::path& tmpfile, std::size_t chunk_size, + BS::thread_pool* tpool); + + template + void balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol, BS::thread_pool* tpool); + + template + void balance_cis(const MatrixT& matrix, const Chromosome& chrom, std::size_t max_iters, + double tol, BS::thread_pool* tpool); + + template + void balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, double tol, + BS::thread_pool* tpool); + + template + [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, + std::size_t num_masked_diags) -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) + -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + std::size_t num_masked_diags) + -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags) + -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix; + + template + [[nodiscard]] static auto construct_sparse_matrix_chunked(const File& f, Type type, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked; + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_gw(const File& f, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked; + + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_cis( + const File& f, const Chromosome& chrom, std::size_t bin_offset, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked; + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_cis( + const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked; + + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_trans( + const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked; + + template + [[nodiscard]] static auto inner_loop(const MatrixT& matrix, nonstd::span biases, + MargsVector& marg, nonstd::span weights = {}, + BS::thread_pool* tpool = nullptr) -> Result; + [[nodiscard]] static std::pair aggregate_marg( + nonstd::span 
marg, BS::thread_pool* tpool); + + static void update_biases(nonstd::span marg, nonstd::span biases, + double avg_nzmarg, BS::thread_pool* tpool); + + [[nodiscard]] static double compute_ssq_nzmarg(nonstd::span marg, double avg_nzmarg, + BS::thread_pool* tpool); + + template + static void min_nnz_filtering(MargsVector& marg, const MatrixT& matrix, + nonstd::span biases, std::size_t min_nnz, + BS::thread_pool* tpool); + + static void min_count_filtering(nonstd::span biases, std::size_t min_count, + nonstd::span marg); + + static void mad_max_filtering(nonstd::span chrom_offsets, + nonstd::span biases, nonstd::span marg, + double mad_max); + + template + static void initialize_biases(const MatrixT& matrix, nonstd::span biases, + nonstd::span chrom_bin_offsets, + std::size_t min_nnz, std::size_t min_count, double mad_max, + BS::thread_pool* tpool); + + [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( + const BinTable& bins, nonstd::span chrom_bin_offsets); +}; + +} // namespace hictk::balancing + +#include "./impl/ice_impl.hpp" diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp new file mode 100644 index 00000000..408cbead --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -0,0 +1,683 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/cooler/cooler.hpp" +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +template +inline ICE::ICE(const File& f, Type type, const Params& params) + : _chrom_offsets(f.bins().num_bin_prefix_sum()), + _biases(f.bins().size(), 1.0), + _variance(f.chromosomes().size(), 0), + _scale(f.chromosomes().size(), std::numeric_limits::quiet_NaN()) { + std::unique_ptr tpool{}; + if (params.threads != 1) { + tpool = std::make_unique(params.threads); + } + if (params.tmpfile.empty()) { + balance_in_memory(f, type, params.tol, params.max_iters, params.num_masked_diags, + params.min_nnz, params.min_count, params.mad_max, tpool.get()); + } else { + balance_chunked(f, type, params.tol, params.max_iters, params.num_masked_diags, params.min_nnz, + params.min_count, params.mad_max, params.tmpfile, params.chunk_size, + tpool.get()); + } +} + +template +inline void ICE::balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, + std::size_t min_count, double mad_max, BS::thread_pool* tpool) { + auto matrix = construct_sparse_matrix(f, type, num_masked_diags); + + initialize_biases(matrix, _biases, _chrom_offsets, min_nnz, min_count, mad_max, tpool); + + if (type == Type::gw) { + return balance_gw(matrix, max_iters, tol, tpool); + } + + if (type == Type::trans) { + matrix.clear(true); + matrix = construct_sparse_matrix_trans(f, num_masked_diags); + return balance_trans(matrix, f.bins(), max_iters, tol, tpool); + } + + assert(type == Type::cis); + matrix.clear(true); + for (std::uint32_t i = 0; i < f.chromosomes().size(); ++i) { + const Chromosome& chrom = f.chromosomes().at(i); + if (chrom.is_all()) { + continue; + } + matrix = construct_sparse_matrix_cis(f, chrom, _chrom_offsets[i], num_masked_diags); + balance_cis(matrix, chrom, max_iters, tol, tpool); + } +} + +template +inline void ICE::balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, + 
std::size_t num_masked_diags, std::size_t min_nnz, + std::size_t min_count, double mad_max, + const std::filesystem::path& tmpfile, std::size_t chunk_size, + BS::thread_pool* tpool) { + auto matrix = construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); + + initialize_biases(matrix, _biases, _chrom_offsets, min_nnz, min_count, mad_max, tpool); + + if (type == Type::gw) { + return balance_gw(matrix, max_iters, tol, tpool); + } + + if (type == Type::trans) { + matrix.clear(true); + matrix = construct_sparse_matrix_chunked_trans(f, num_masked_diags, tmpfile, chunk_size); + return balance_trans(matrix, f.bins(), max_iters, tol, tpool); + } + + assert(type == Type::cis); + matrix.clear(true); + for (std::uint32_t i = 0; i < f.chromosomes().size(); ++i) { + const Chromosome& chrom = f.chromosomes().at(i); + if (chrom.is_all()) { + continue; + } + matrix = construct_sparse_matrix_chunked_cis(f, chrom, _chrom_offsets[i], num_masked_diags, + tmpfile, chunk_size); + balance_cis(matrix, chrom, max_iters, tol, tpool); + } +} + +template +inline void ICE::balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol, + BS::thread_pool* tpool) { + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + + MargsVector marg(_biases.size()); + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix, _biases, marg, {}, tpool); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; + } + } +} + +template +inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, + double tol, BS::thread_pool* tpool) { + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + const auto weights = compute_weights_from_chromosome_sizes(bins, _chrom_offsets); + + MargsVector marg(_biases.size()); + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix, _biases, marg, weights, tpool); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; + } + } +} + +template +inline void ICE::balance_cis(const MatrixT& matrix, const Chromosome& chrom, std::size_t max_iters, + double tol, BS::thread_pool* tpool) { + const auto i0 = _chrom_offsets[chrom.id()]; + const auto i1 = _chrom_offsets[chrom.id() + 1]; + auto biases_ = nonstd::span(_biases).subspan(i0, i1 - i0); + + MargsVector marg(biases_.size()); + for (std::size_t k = 0; k < max_iters; ++k) { + const auto res = inner_loop(matrix, biases_, marg, {}, tpool); + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), chrom.name(), k + 1, res.variance); + _variance[chrom.id()] = res.variance; + _scale[chrom.id()] = res.scale; + + if (res.variance < tol) { + break; + } + } +} + +template +auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags) + -> SparseMatrix { + SPDLOG_INFO(FMT_STRING("Reading interactions into memory...")); + if (type == Type::cis) { + return construct_sparse_matrix_cis(f, num_masked_diags); + } + return construct_sparse_matrix_gw(f, num_masked_diags); +} + +template +inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m{}; + + const auto sel = f.fetch(); + std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + 
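        // p.bin2_id - p.bin1_id is the diagonal the pixel lies on (0 = main diagonal); with the
        // default of 2 masked diagonals, only pixels above the first off-diagonal are kept.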
m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m{}; + + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count, bin_offset); + } + }); + m.finalize(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m{}; + + for (const auto& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } + auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + } + m.finalize(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix { + using SelectorT = decltype(f.fetch("chr1", "chr2")); + using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); + + std::vector selectors{}; + for (const Chromosome& chrom1 : f.chromosomes()) { + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); + ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + + selectors.emplace_back(f.fetch(chrom1.name(), chrom2.name())); + } + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + } + + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + + internal::PixelMerger merger{heads, tails}; + + SparseMatrix m{}; + std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { + // TODO: this filtering step is wrong when done on trans matrices, as it will + // remove the first and last few pixels from trans matrices of adjacent chromosomes. 
+ // Remove the filtering once this bug has been fixed in cooler + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.shrink_to_fit(); + + return m; +} + +template +auto ICE::construct_sparse_matrix_chunked(const File& f, Type type, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked { + SPDLOG_INFO(FMT_STRING("Writing interactions to temporary file {}..."), tmpfile); + if (type == Type::cis) { + return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); + } + return construct_sparse_matrix_chunked_gw(f, num_masked_diags, tmpfile, chunk_size); +} + +template +inline auto ICE::construct_sparse_matrix_chunked_gw(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked { + SparseMatrixChunked m(tmpfile, chunk_size); + + const auto sel = f.fetch(); + std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + return m; +} + +template +inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + SparseMatrixChunked m(tmpfile, chunk_size); + + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count, bin_offset); + } + }); + m.finalize(); + return m; +} + +template +inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + SparseMatrixChunked m(tmpfile, chunk_size); + + for (const Chromosome& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + } + m.finalize(); + return m; +} + +template +inline auto ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + using SelectorT = decltype(f.fetch("chr1", "chr2")); + using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); + + std::vector selectors{}; + for (const Chromosome& chrom1 : f.chromosomes()) { + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); + ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + + selectors.emplace_back(f.fetch(chrom1.name(), chrom2.name())); + } + } + + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + + internal::PixelMerger merger{heads, tails}; + + SparseMatrixChunked m(tmpfile, chunk_size); + std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { + // TODO: this filtering step is wrong when done on trans matrices, as it will + 
// remove the first and last few pixels from trans matrices of adjacent chromosomes. + // Remove the filtering once this bug has been fixed in cooler + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + return m; +} + +template +inline void ICE::min_nnz_filtering(MargsVector& marg, const MatrixT& matrix, + nonstd::span biases, std::size_t min_nnz, + BS::thread_pool* tpool) { + matrix.marginalize_nnz(marg, tpool); + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg()[i] < static_cast(min_nnz)) { + biases[i] = 0; + } + } +} + +inline void ICE::min_count_filtering(nonstd::span biases, std::size_t min_count, + nonstd::span marg) { + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg[i] < static_cast(min_count)) { + biases[i] = 0; + } + } +} + +inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, + nonstd::span biases, nonstd::span marg, + double mad_max) { + auto median = [](auto v) { + assert(!v.empty()); + + const auto size = static_cast(v.size()); + auto first = v.begin(); + auto mid = first + (size / 2); + auto last = v.end(); + + std::nth_element(first, mid, last); + + if (size % 2 != 0) { + return *mid; + } + + const auto n1 = *mid; + std::nth_element(first, --mid, last); + const auto n2 = *mid; + + return (n1 + n2) / 2; + }; + + auto mad = [&](const auto vin) { + const auto median_ = median(vin); + auto vout = vin; + + std::transform(vout.begin(), vout.end(), vout.begin(), + [&](const auto n) { return std::abs(n - median_); }); + + return median(vout); + }; + + assert(chrom_offsets.size() > 1); + std::vector cmarg{}; + for (std::size_t i = 1; i < chrom_offsets.size(); ++i) { + const auto i0 = static_cast(chrom_offsets[i - 1] - chrom_offsets.front()); + const auto i1 = static_cast(chrom_offsets[i] - chrom_offsets.front()); + + cmarg.clear(); + std::copy_if(marg.begin() + i0, marg.begin() + i1, std::back_inserter(cmarg), + [](const auto n) { return n > 0; }); + + if (!cmarg.empty()) { + const auto median_ = median(cmarg); + std::transform(marg.begin() + i0, marg.begin() + i1, marg.begin() + i0, + [&](const auto n) { return n / median_; }); + } + } + + std::vector log_nz_marg{}; + for (const auto n : marg) { + if (n > 0) { + log_nz_marg.push_back(std::log(n)); + } + } + + if (log_nz_marg.empty()) { + return; + } + + const auto median_log_nz_marg = median(log_nz_marg); + const auto dev_log_nz_marg = mad(log_nz_marg); + + const auto cutoff = std::exp(median_log_nz_marg - mad_max * dev_log_nz_marg); + + for (std::size_t i = 0; i < marg.size(); ++i) { + if (marg[i] < cutoff) { + biases[i] = 0.0; + } + } +} + +template +inline auto ICE::inner_loop(const MatrixT& matrix, nonstd::span biases, MargsVector& marg, + nonstd::span weights, BS::thread_pool* tpool) -> Result { + if (matrix.empty()) { + std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); + return {std::numeric_limits::quiet_NaN(), 0.0}; + } + + marg.resize(biases.size()); + matrix.times_outer_product_marg(marg, biases, weights, tpool); + const auto [marg_sum, nnz_marg] = aggregate_marg(marg(), tpool); + + if (nnz_marg == 0) { + std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); + return {std::numeric_limits::quiet_NaN(), 0.0}; + } + + const auto avg_nzmarg = (marg_sum / static_cast(nnz_marg)); + update_biases(marg(), biases, avg_nzmarg, tpool); + + const auto ssq_nzmarg = compute_ssq_nzmarg(marg(), avg_nzmarg, tpool); + const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); + + 
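  // At this point one ICE sweep is complete: marg[i] is roughly biases[i] * sum_j M_ij * biases[j]
  // (optionally scaled by `weights`), each bias has been divided by marg[i] / mean(nonzero marg),
  // and var_nzmarg is the sample variance of the nonzero marginals that callers compare against
  // the convergence tolerance.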
return {avg_nzmarg, var_nzmarg}; +} + +inline std::pair ICE::aggregate_marg(nonstd::span marg, + BS::thread_pool* tpool) { + double marg_sum = 0.0; + std::size_t nnz_marg{}; + + std::mutex mtx{}; + + auto aggregate_marg_impl = [&](std::size_t istart, std::size_t iend) { + double marg_sum_ = 0.0; + std::size_t nnz_marg_{}; + + for (auto i = istart; i < iend; ++i) { + marg_sum_ += marg[i]; + nnz_marg_ += marg[i] != 0; + } + + [[maybe_unused]] const std::scoped_lock lck(mtx); + marg_sum += marg_sum_; + nnz_marg += nnz_marg_; + }; + + if (marg.size() < 10'000 || !tpool) { + aggregate_marg_impl(0, marg.size()); + return std::make_pair(marg_sum, nnz_marg); + } + + tpool->push_loop(0, marg.size(), aggregate_marg_impl); + tpool->wait_for_tasks(); + + return std::make_pair(marg_sum, nnz_marg); +} + +inline void ICE::update_biases(nonstd::span marg, nonstd::span biases, + double avg_nzmarg, BS::thread_pool* tpool) { + auto update_biases_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto n = marg[i] / avg_nzmarg; + if (n != 0) { + biases[i] /= n; + } + } + }; + + if (marg.size() < 10'000 || !tpool) { + return update_biases_impl(0, marg.size()); + } + + tpool->push_loop(0, marg.size(), update_biases_impl); + tpool->wait_for_tasks(); +} + +inline double ICE::compute_ssq_nzmarg(nonstd::span marg, double avg_nzmarg, + BS::thread_pool* tpool) { + std::mutex mtx{}; + double ssq_nzmarg = 0; + + auto compute_ssq_nzmarg_impl = [&](std::size_t istart, std::size_t iend) { + double ssq_nzmarg_ = 0.0; + for (auto i = istart; i < iend; ++i) { + const auto& n = marg[i]; + if (n != 0) { + ssq_nzmarg_ += std::pow(n - avg_nzmarg, 2); + } + } + [[maybe_unused]] const std::scoped_lock lck(mtx); + ssq_nzmarg += ssq_nzmarg_; + }; + + if (marg.size() < 10'000 || !tpool) { + compute_ssq_nzmarg_impl(0, marg.size()); + return ssq_nzmarg; + } + + tpool->push_loop(0, marg.size(), compute_ssq_nzmarg_impl); + tpool->wait_for_tasks(); + return ssq_nzmarg; +} + +template +inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span biases, + nonstd::span chrom_bin_offsets, + std::size_t min_nnz, std::size_t min_count, double mad_max, + BS::thread_pool* tpool) { + if (min_nnz == 0 && min_count == 0 && mad_max == 0) { + return; + } + + SPDLOG_INFO(FMT_STRING("Initializing bias vector...")); + MargsVector marg(biases.size()); + if (min_nnz != 0) { + SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} nnz entries..."), min_nnz); + min_nnz_filtering(marg, matrix, biases, min_nnz, tpool); + } + + if (min_count != 0 || mad_max != 0) { + matrix.marginalize(marg, tpool); + } + + if (min_count != 0) { + SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} interactions..."), min_count); + min_count_filtering(biases, min_count, marg()); + } + + if (mad_max != 0) { + SPDLOG_INFO(FMT_STRING("Masking rows using mad_max={}..."), mad_max); + mad_max_filtering(chrom_bin_offsets, biases, marg(), mad_max); + } +} + +inline std::vector ICE::compute_weights_from_chromosome_sizes( + const BinTable& bins, nonstd::span chrom_bin_offsets) { + std::vector weights(bins.size()); + for (std::uint32_t i = 1; i < chrom_bin_offsets.size(); ++i) { + const auto& chrom = bins.chromosomes().at(i - 1); + if (chrom.is_all()) { + continue; + } + const auto i0 = chrom_bin_offsets[i - 1]; + const auto i1 = chrom_bin_offsets[i]; + + const auto nbins = static_cast(bins.size()); + const auto cnbins = + std::ceil(static_cast(chrom.size()) / static_cast(bins.bin_size())); + + for (std::size_t j = i0; j < i1; 
++j) { + weights[j] = 1.0 / (1.0 - cnbins / nbins); + } + } + return weights; +} + +inline std::vector ICE::get_weights(bool rescale) const { + if (!rescale) { + return _biases; + } + + std::vector biases(_biases.size()); + if (_scale.size() == 1) { + const auto scale = std::sqrt(_scale[0]); + std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } else { + for (std::size_t i = 1; i < _chrom_offsets.size(); ++i) { + const auto i0 = static_cast(_chrom_offsets[i - 1]); + const auto i1 = static_cast(_chrom_offsets[i]); + const auto scale = std::sqrt(_scale[i - 1]); + std::transform(_biases.begin() + i0, _biases.begin() + i1, biases.begin() + i0, + [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } + } + return biases; +} + +inline std::vector ICE::scale() const noexcept { return _scale; } +inline std::vector ICE::variance() const noexcept { return _variance; } + +} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp new file mode 100644 index 00000000..a152f6f6 --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -0,0 +1,561 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/cooler/cooler.hpp" +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +inline MargsVector::MargsVector(std::size_t size_) + : _margs(size_, 0), _mtxes(compute_number_of_mutexes(size_)) {} + +inline MargsVector::MargsVector(const MargsVector& other) + : _margs(other._margs.begin(), other._margs.end()), _mtxes(other.size()) {} + +inline MargsVector& MargsVector::operator=(const MargsVector& other) { + if (this == &other) { + return *this; + } + + _margs = other._margs; + _mtxes = std::vector{other.size()}; + + return *this; +} + +inline double MargsVector::operator[](std::size_t i) const noexcept { + assert(i < size()); + return _margs[i]; +} + +inline double& MargsVector::operator[](std::size_t i) noexcept { + assert(i < size()); + return _margs[i]; +} + +inline void MargsVector::add(std::size_t i, double n) noexcept { + assert(i < size()); + [[maybe_unused]] std::scoped_lock lck(_mtxes[get_mutex_idx(i)]); + _margs[i] += n; +} + +inline const std::vector& MargsVector::operator()() const noexcept { return _margs; } +inline std::vector& MargsVector::operator()() noexcept { return _margs; } + +inline void MargsVector::fill(double n) noexcept { std::fill(_margs.begin(), _margs.end(), n); } +inline void MargsVector::resize(std::size_t size_) { + if (size_ != size()) { + _margs.resize(size_); + std::vector v(size_); + std::swap(v, _mtxes); + } +} + +inline std::size_t MargsVector::size() const noexcept { return _margs.size(); } +inline bool MargsVector::empty() const noexcept { return size() == 0; } + +inline std::size_t MargsVector::compute_number_of_mutexes(std::size_t size) noexcept { + if (size == 0) { + return 0; + } + const auto nthreads = static_cast(std::thread::hardware_concurrency()); + // Clamping to 2-n is needed because get_pixel_mutex_idx expects the number of + // mutexes to be a multiple of 2 + return next_pow2(std::clamp(size, std::size_t(2), 5000 * nthreads)); +} + +template +inline I 
MargsVector::next_pow2(I n) noexcept { + using ull = unsigned long long; + if constexpr (std::is_signed_v) { + assert(n >= 0); + return conditional_static_cast(next_pow2(static_cast(n))); + } else { + auto m = conditional_static_cast(n); +#ifndef __GNUC__ + // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --m; + m |= m >> 1; + m |= m >> 2; + m |= m >> 4; + m |= m >> 8; + m |= m >> 16; + m |= m >> 32; + return conditional_static_cast(m + 1); +#else + // https://jameshfisher.com/2018/03/30/round-up-power-2/ + // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + + return conditional_static_cast( + m <= 1 ? m + : std::uint64_t(1) << (std::uint64_t(64) - std::uint64_t(__builtin_clzll(m - 1)))); +#endif + } +} + +inline std::size_t MargsVector::get_mutex_idx(std::size_t i) const noexcept { + assert(!_mtxes.empty()); + assert(_mtxes.size() % 2 == 0); + i = XXH3_64bits(&i, sizeof(std::size_t)); + // equivalent to i % _mtxes.size() when _mtxes.size() % 2 == 0 + return i & (_mtxes.size() - 1); +} + +inline bool SparseMatrix::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } + +inline void SparseMatrix::clear(bool shrink_to_fit_) noexcept { + _bin1_ids.clear(); + _bin2_ids.clear(); + _counts.clear(); + if (shrink_to_fit_) { + shrink_to_fit(); + } +} + +inline void SparseMatrix::shrink_to_fit() noexcept { + _bin1_ids.shrink_to_fit(); + _bin2_ids.shrink_to_fit(); + _counts.shrink_to_fit(); +} + +inline void SparseMatrix::finalize() { shrink_to_fit(); } + +inline const std::vector& SparseMatrix::bin1_ids() const noexcept { + return _bin1_ids; +} +inline const std::vector& SparseMatrix::bin2_ids() const noexcept { + return _bin2_ids; +} +inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } + +inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset) { + assert(bin1_id >= bin_offset); + assert(bin2_id >= bin1_id); + + _bin1_ids.push_back(bin1_id - bin_offset); + _bin2_ids.push_back(bin2_id - bin_offset); + _counts.push_back(count); +} + +void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl) const { + auto size_ = size(); + fs.write(reinterpret_cast(&size_), sizeof(std::size_t)); + + const auto tmpbuff_size = ZSTD_compressBound(size() * sizeof(std::uint64_t)); + std::string tmpbuff(tmpbuff_size, '\0'); + + std::size_t compressed_size = ZSTD_compressCCtx(&ctx, reinterpret_cast(tmpbuff.data()), + tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin1_ids.data()), + size() * sizeof(std::uint64_t), compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); + + compressed_size = ZSTD_compressCCtx(&ctx, reinterpret_cast(tmpbuff.data()), + tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin2_ids.data()), + size() * sizeof(std::uint64_t), compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); + + compressed_size = ZSTD_compressCCtx( + &ctx, reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), + reinterpret_cast(_counts.data()), size() * sizeof(double), compression_lvl); + if 
(ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); + + fs.flush(); +} + +void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { + std::size_t size_{}; + fs.read(reinterpret_cast(&size_), sizeof(std::size_t)); + + _bin1_ids.resize(size_); + _bin2_ids.resize(size_); + _counts.resize(size_); + + std::string tmpbuff{}; + std::size_t compressed_size{}; + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + std::size_t decompressed_size = ZSTD_decompressDCtx( + &ctx, reinterpret_cast(_bin1_ids.data()), _bin1_ids.size() * sizeof(std::uint64_t), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } + + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + decompressed_size = ZSTD_decompressDCtx(&ctx, reinterpret_cast(_bin2_ids.data()), + _bin2_ids.size() * sizeof(std::uint64_t), tmpbuff.data(), + tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } + + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + decompressed_size = ZSTD_decompressDCtx(&ctx, reinterpret_cast(_counts.data()), + _counts.size() * sizeof(double), tmpbuff.data(), + tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } +} + +inline void SparseMatrix::marginalize(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); + } + + auto marginalize_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + + if (tpool) { + if (_counts[i] != 0) { + marg.add(i1, _counts[i]); + marg.add(i2, _counts[i]); + } + } else { + marg[i1] += _counts[i]; + marg[i2] += _counts[i]; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + marginalize_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), marginalize_impl); + tpool->wait_for_tasks(); +} + +inline void SparseMatrix::marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + if (init_buffer) { + marg.fill(0); + } + + auto marginalize_nnz_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + + if (tpool) { + if (_counts[i] != 0) { + marg.add(i1, _counts[i] != 0); + marg.add(i2, _counts[i] != 0); + } + } else { + marg[i1] += _counts[i] != 0; + marg[i2] += _counts[i] != 0; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + marginalize_nnz_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), marginalize_nnz_impl); + tpool->wait_for_tasks(); +} + +inline void SparseMatrix::times_outer_product_marg(MargsVector& marg, + nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool, bool init_buffer) const { + assert(biases.size() == 
weights.size() || weights.empty()); + marg.resize(biases.size()); + + if (init_buffer) { + marg.fill(0); + } + + auto times_outer_product_marg_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 1 : weights[i2]; + const auto count = _counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); + + if (tpool) { + if (count != 0) { + marg.add(i1, count); + marg.add(i2, count); + } + } else { + marg[i1] += count; + marg[i2] += count; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + times_outer_product_marg_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), times_outer_product_marg_impl); + tpool->wait_for_tasks(); +} + +inline SparseMatrixChunked::SparseMatrixChunked(std::filesystem::path tmp_file, + std::size_t chunk_size, int compression_lvl) + : _path(std::move(tmp_file)), + _chunk_size(chunk_size), + _compression_lvl(compression_lvl), + _zstd_cctx(ZSTD_createCCtx()), + _zstd_dctx(ZSTD_createDCtx()) { + _fs.exceptions(std::ios::badbit); + _fs.open(_path, std::ios::out | std::ios::binary); +} + +inline SparseMatrixChunked::~SparseMatrixChunked() noexcept { + try { + if (!_path.empty() && std::filesystem::exists(_path)) { + std::filesystem::remove(_path); + } + } catch (...) { + } +} + +inline bool SparseMatrixChunked::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrixChunked::size() const noexcept { return _size; } +inline void SparseMatrixChunked::clear(bool shrink_to_fit_) { + _index.clear(); + _fs.close(); + std::filesystem::remove(_path); + _path = ""; + _size = 0; + _matrix.clear(shrink_to_fit_); +} + +inline void SparseMatrixChunked::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, + double count, std::size_t bin_offset) { + if (_matrix.size() == _chunk_size) { + write_chunk(); + } + + _matrix.push_back(bin1_id, bin2_id, count, bin_offset); + ++_size; +} + +inline void SparseMatrixChunked::finalize() { + if (!_matrix.empty()) { + write_chunk(); + } + _fs.open(_path, std::ios::in | std::ios::binary); +} + +inline void SparseMatrixChunked::marginalize(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + auto marginalize_impl = [&](std::size_t istart, std::size_t iend) { + std::unique_ptr zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in | std::ios::binary); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.marginalize(marg_local, nullptr, false); + } + + for (std::size_t i = 0; i < marg_local.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } + } + }; + + assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); + } + + if (_index.size() == 1 || !tpool) { + marginalize_impl(0, _index.size()); + return; + } + + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); + + for (std::size_t i = 1; i < offsets.size(); ++i) { + const auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; + + tpool->push_task(marginalize_impl, i0, i1); + } + tpool->wait_for_tasks(); +} + +inline void SparseMatrixChunked::marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + auto marginalize_nnz_impl = [&](std::size_t istart, std::size_t iend) 
{ + std::unique_ptr<ZSTD_DCtx> zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in | std::ios::binary); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.marginalize_nnz(marg_local, nullptr, false); + } + for (std::size_t i = 0; i < marg_local.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } + } + }; + + assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); + } + + if (_index.size() == 1 || !tpool) { + marginalize_nnz_impl(0, _index.size()); + return; + } + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); + + for (std::size_t i = 1; i < offsets.size(); ++i) { + const auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; + + tpool->push_task(marginalize_nnz_impl, i0, i1); + } + tpool->wait_for_tasks(); +} + +inline void SparseMatrixChunked::times_outer_product_marg(MargsVector& marg, + nonstd::span<const double> biases, + nonstd::span<const double> weights, + BS::thread_pool* tpool, + bool init_buffer) const { + auto times_outer_product_marg_impl = [&](std::size_t istart, std::size_t iend) { + std::unique_ptr<ZSTD_DCtx> zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in | std::ios::binary); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.times_outer_product_marg(marg_local, biases, weights, nullptr, false); + } + for (std::size_t i = 0; i < marg.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } + } + }; + + assert(biases.size() == weights.size() || weights.empty()); + marg.resize(biases.size()); + if (init_buffer) { + marg.fill(0); + } + + if (_index.size() == 1 || !tpool) { + times_outer_product_marg_impl(0, _index.size()); + return; + } + + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); + + for (std::size_t i = 1; i < offsets.size(); ++i) { + const auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; + + tpool->push_task(times_outer_product_marg_impl, i0, i1); + } + tpool->wait_for_tasks(); + + return; +} + +inline void SparseMatrixChunked::write_chunk() { + assert(!_matrix.empty()); + _index.push_back(_fs.tellg()); + _matrix.finalize(); + _matrix.serialize(_fs, *_zstd_cctx, _compression_lvl); + _matrix.clear(); +} + +inline std::vector<std::size_t> SparseMatrixChunked::compute_chunk_offsets(std::size_t size, + std::size_t num_chunks) { + std::vector<std::size_t> offsets{}; + if (size < num_chunks) { + offsets.resize(size + 1, 1); + offsets.front() = 0; + } else { + const auto n = size / num_chunks; + offsets.resize(num_chunks + 1, n); + offsets.front() = 0; + auto tot = n * num_chunks; + + for (std::size_t i = 1; i < offsets.size(); ++i) { + if (tot == size) { + break; + } + offsets[i]++; + tot++; + } + } + for (std::size_t i = 1; i < offsets.size(); ++i) { + offsets[i] += offsets[i - 1]; + } + + return offsets; +} + +} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp new file mode 100644 index 00000000..fde72faa --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -0,0 +1,159 @@ +// Copyright (C) 2023 Roberto Rossini +//
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/bin_table.hpp" +#include "hictk/common.hpp" + +namespace std { +template <> +struct default_delete { + void operator()(ZSTD_CCtx_s* ctx) const { ZSTD_freeCCtx(ctx); } // NOLINT +}; + +template <> +struct default_delete { + void operator()(ZSTD_DCtx_s* ctx) const { ZSTD_freeDCtx(ctx); } // NOLINT +}; +} // namespace std + +namespace hictk::balancing { + +class MargsVector { + std::vector _margs{}; + mutable std::vector _mtxes; + + public: + MargsVector() = default; + explicit MargsVector(std::size_t size_); + + MargsVector(const MargsVector& other); + MargsVector(MargsVector&& other) noexcept = default; + + ~MargsVector() = default; + + MargsVector& operator=(const MargsVector& other); + MargsVector& operator=(MargsVector&& other) noexcept = default; + + [[nodiscard]] double operator[](std::size_t i) const noexcept; + [[nodiscard]] double& operator[](std::size_t i) noexcept; + void add(std::size_t i, double n) noexcept; + + [[nodiscard]] const std::vector& operator()() const noexcept; + [[nodiscard]] std::vector& operator()() noexcept; + + void fill(double n = 0) noexcept; + void resize(std::size_t size_); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + + private: + static std::size_t compute_number_of_mutexes(std::size_t size) noexcept; + template >> + [[nodiscard]] static I next_pow2(I n) noexcept; + [[nodiscard]] std::size_t get_mutex_idx(std::size_t i) const noexcept; +}; + +class SparseMatrix { + std::vector _bin1_ids{}; + std::vector _bin2_ids{}; + std::vector _counts{}; + + public: + SparseMatrix() = default; + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + void clear(bool shrink_to_fit_ = false) noexcept; + void shrink_to_fit() noexcept; + void finalize(); + + [[nodiscard]] const std::vector& bin1_ids() const noexcept; + [[nodiscard]] const std::vector& bin2_ids() const noexcept; + [[nodiscard]] const std::vector& counts() const noexcept; + + void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset = 0); + + void serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl = 3) const; + void deserialize(std::fstream& fs, ZSTD_DCtx& ctx); + + void marginalize(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void times_outer_product_marg(MargsVector& marg, nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool = nullptr, bool init_buffer = true) const; +}; + +class SparseMatrixChunked { + mutable SparseMatrix _matrix{}; + mutable std::string _buff{}; + std::filesystem::path _path{}; + mutable std::fstream _fs{}; + + std::vector _index{}; + std::size_t _size{}; + std::size_t _chunk_size{}; + int _compression_lvl{}; + + std::unique_ptr _zstd_cctx{}; + std::unique_ptr _zstd_dctx{}; + + public: + SparseMatrixChunked() = default; + SparseMatrixChunked(std::filesystem::path tmp_file, std::size_t chunk_size, + int compression_lvl = 3); + + SparseMatrixChunked(const SparseMatrixChunked& other) = delete; +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ > 9 + SparseMatrixChunked(SparseMatrixChunked&& other) noexcept = default; +#else + SparseMatrixChunked(SparseMatrixChunked&& other) = default; +#endif + + ~SparseMatrixChunked() 
noexcept; + + SparseMatrixChunked& operator=(const SparseMatrixChunked& other) = delete; + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept( + noexcept_move_assignment_op()) = default; + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + void clear(bool shrink_to_fit_ = false); + + void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset = 0); + void finalize(); + + void marginalize(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void times_outer_product_marg(MargsVector& marg, nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool = nullptr, bool init_buffer = true) const; + + private: + void write_chunk(); + [[nodiscard]] static std::vector compute_chunk_offsets(std::size_t size, + std::size_t num_chunks); +}; + +} // namespace hictk::balancing + +#include "./impl/sparse_matrix_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 739910f5..c8b04a1c 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -105,7 +105,8 @@ inline std::size_t BinTable::size() const noexcept { if (_num_bins_prefix_sum.empty()) { return 0; } - return static_cast(_num_bins_prefix_sum.back()); + return conditional_static_cast(_num_bins_prefix_sum.back() - + _num_bins_prefix_sum.front()); } inline bool BinTable::empty() const noexcept { return size() == 0; } diff --git a/src/libhictk/common/include/hictk/common.hpp b/src/libhictk/common/include/hictk/common.hpp index c33e77b1..b0fff824 100644 --- a/src/libhictk/common/include/hictk/common.hpp +++ b/src/libhictk/common/include/hictk/common.hpp @@ -86,7 +86,7 @@ inline constexpr std::uint8_t SENTINEL_ATTR_VALUE{255}; #endif } -[[nodiscard]] constexpr bool noexcept_move_assigment_op() noexcept { +[[nodiscard]] constexpr bool noexcept_move_assignment_op() noexcept { #if defined(__GNUC__) && defined(__clang__) return __clang_major__ > 8; #elif defined(__GNUC__) diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 104b19f5..9ff256d9 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -118,12 +118,10 @@ class File { File(File &&other) noexcept(noexcept_move_ctor()) = default; // NOLINT // Simple constructor. Open file in read-only mode. 
Automatically detects pixel count type - [[nodiscard]] explicit File(std::string_view uri, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, - bool validate = true); - [[nodiscard]] explicit File(RootGroup entrypoint, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, - bool validate = true); + explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + bool validate = true); + explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + bool validate = true); [[nodiscard]] static File open_random_access( std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, @@ -152,7 +150,7 @@ class File { ~File() noexcept; File &operator=(const File &other) = delete; - File &operator=(File &&other) noexcept(noexcept_move_assigment_op()) = default; // NOLINT + File &operator=(File &&other) noexcept(noexcept_move_assignment_op()) = default; // NOLINT [[nodiscard]] explicit operator bool() const noexcept; @@ -243,13 +241,13 @@ class File { std::uint64_t first_bin1, std::uint64_t last_bin1, std::uint64_t first_bin2, std::uint64_t last_bin2, std::shared_ptr weights = nullptr) const; - bool has_weights(std::string_view normalization) const; std::shared_ptr read_weights(std::string_view normalization, bool rescale = false) const; std::shared_ptr read_weights(std::string_view normalization, balancing::Weights::Type type, bool rescale = false) const; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; std::shared_ptr read_weights(const balancing::Method &normalization, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index d0ca7987..00f1a175 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -87,6 +87,16 @@ inline auto File::dataset(std::string_view dataset_name) const -> const Dataset } } +inline bool File::has_normalization(const balancing::Method &normalization) const { + const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), + normalization.to_string()); + if (_weights.contains(dset_path)) { + return true; + } + + return _root_group().exist(dset_path); +} + inline std::vector File::avail_normalizations() const { const phmap::flat_hash_set bin_table_dsets{"chrom", "start", "end"}; diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index 819bb657..e27e2d2a 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -226,7 +226,7 @@ inline PixelSelector File::fetch(PixelCoordinates coord1, PixelCoordinates coord // clang-format on } -inline bool File::has_weights(std::string_view normalization) const { +inline bool File::has_normalization(std::string_view normalization) const { return has_normalization(balancing::Method{normalization}); } inline std::shared_ptr File::read_weights(std::string_view normalization, @@ -239,16 +239,6 @@ inline std::shared_ptr File::read_weights(std::string_ return read_weights(balancing::Method{normalization}, type, rescale); } -inline bool File::has_normalization(const 
balancing::Method &normalization) const { - const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), - normalization.to_string()); - if (_weights.contains(dset_path)) { - return true; - } - - return _root_group().exist(dset_path); -} - inline std::shared_ptr File::read_weights( const balancing::Method &normalization, bool rescale) const { if (normalization == "NONE") { diff --git a/src/libhictk/file/include/hictk/file.hpp b/src/libhictk/file/include/hictk/file.hpp index c8e05983..c162cff7 100644 --- a/src/libhictk/file/include/hictk/file.hpp +++ b/src/libhictk/file/include/hictk/file.hpp @@ -132,6 +132,7 @@ class File { std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; template diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index e8adf527..cb617891 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -277,6 +277,9 @@ inline PixelSelector File::fetch(std::string_view chrom1_name, std::uint32_t sta _fp); } +inline bool File::has_normalization(std::string_view normalization) const { + return std::visit([&](const auto& fp) { return fp.has_normalization(normalization); }, _fp); +} inline std::vector File::avail_normalizations() const { return std::visit([](const auto& fp) { return fp.avail_normalizations(); }, _fp); } diff --git a/src/libhictk/hic/include/hictk/hic.hpp b/src/libhictk/hic/include/hictk/hic.hpp index 42f2689f..62023ab1 100644 --- a/src/libhictk/hic/include/hictk/hic.hpp +++ b/src/libhictk/hic/include/hictk/hic.hpp @@ -56,6 +56,7 @@ class File { [[nodiscard]] std::uint64_t nchroms() const; [[nodiscard]] const std::string &assembly() const noexcept; [[nodiscard]] const std::vector &avail_resolutions() const noexcept; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; [[nodiscard]] std::uint32_t resolution() const noexcept; diff --git a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp index ef8128b4..94f072b2 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp @@ -87,6 +87,14 @@ inline const std::vector& File::avail_resolutions() const noexcep return _fs->header().resolutions; } +inline bool File::has_normalization(std::string_view normalization) const { + const auto normalizations = avail_normalizations(); + const auto it = std::find_if(normalizations.begin(), normalizations.end(), + [&](const auto& norm) { return norm.to_string() == normalization; }); + + return it != normalizations.end(); +} + inline std::vector File::avail_normalizations() const { return _fs->list_avail_normalizations(_type, _unit, _bins->bin_size()); } diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index 32bad1e7..3325fe2d 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -701,6 +701,19 @@ inline std::uint32_t PixelSelectorAll::resolution() const noexcept { inline const BinTable 
&PixelSelectorAll::bins() const noexcept { return _selectors.front().bins(); } +inline std::vector PixelSelectorAll::weights() const { + std::vector weights_{}; + weights_.reserve(bins().size()); + + std::for_each(_selectors.begin(), _selectors.end(), [&](const PixelSelector &sel) { + if (sel.is_intra()) { + weights_.insert(weights_.end(), sel.weights1()().begin(), sel.weights1()().end()); + } + }); + + return weights_; +} + template inline bool PixelSelectorAll::iterator::Pair::operator<(const Pair &other) const noexcept { return first < other.first; diff --git a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp index b35a8e6d..e5e5aecb 100644 --- a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp @@ -205,6 +205,7 @@ class PixelSelectorAll { [[nodiscard]] MatrixUnit unit() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; [[nodiscard]] const BinTable &bins() const noexcept; + [[nodiscard]] std::vector weights() const; template class iterator { diff --git a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp index 607a2f1e..f40579ee 100644 --- a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp +++ b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp @@ -401,6 +401,15 @@ inline PixelMerger::PixelMerger(std::vector heads, std::vector } } } +template +inline auto PixelMerger::begin() -> iterator { + return iterator{*this}; +} + +template +inline auto PixelMerger::end() const noexcept -> iterator { + return {}; +} template inline void PixelMerger::replace_top_node(std::size_t i) { @@ -429,8 +438,49 @@ inline auto PixelMerger::next() -> ThinPixel { current_node.pixel.count += next_node.pixel.count; replace_top_node(next_node.i); } + ++_i; return current_node.pixel; } + +template +inline PixelMerger::iterator::iterator(PixelMerger &merger) + : _merger(&merger), _value(merger.next()) {} + +template +inline bool PixelMerger::iterator::operator==(const iterator &other) const noexcept { + if (!_merger || !other._merger) { + return _merger == other._merger; + } + + return _merger == other._merger && _merger->_i == other._merger->_i; +} + +template +inline bool PixelMerger::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +inline auto PixelMerger::iterator::operator*() const noexcept -> const ThinPixel & { + return _value; +} + +template +inline auto PixelMerger::iterator::operator->() const noexcept -> const ThinPixel * { + return &(**this); +} + +template +inline auto PixelMerger::iterator::operator++() -> iterator & { + assert(!!_merger); + _value = _merger->next(); + if (!_value) { + _merger = nullptr; + } + + return *this; +} + } // namespace internal } // namespace hictk diff --git a/src/libhictk/pixel/include/hictk/pixel.hpp b/src/libhictk/pixel/include/hictk/pixel.hpp index 167ebf5b..c4c596b7 100644 --- a/src/libhictk/pixel/include/hictk/pixel.hpp +++ b/src/libhictk/pixel/include/hictk/pixel.hpp @@ -114,12 +114,35 @@ class PixelMerger { std::vector _heads{}; std::vector _tails{}; + std::size_t _i{}; public: + class iterator; + PixelMerger() = delete; PixelMerger(std::vector head, std::vector tail); + + auto begin() -> iterator; + auto end() const noexcept -> iterator; [[nodiscard]] auto next() -> ThinPixel; + class iterator { + PixelMerger *_merger{}; + ThinPixel _value{}; + + public: + iterator() = default; + explicit 
iterator(PixelMerger &merger); + + [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] bool operator!=(const iterator &other) const noexcept; + + auto operator*() const noexcept -> const ThinPixel &; + auto operator->() const noexcept -> const ThinPixel *; + + [[nodiscard]] auto operator++() -> iterator &; + }; + private: void replace_top_node(std::size_t i); }; diff --git a/test/scripts/hictk_balance.sh b/test/scripts/hictk_balance.sh new file mode 100755 index 00000000..cf253b90 --- /dev/null +++ b/test/scripts/hictk_balance.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "##################################" +echo "#### hictk balance ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function nproc_py { + set -eu + python3 -c 'import multiprocessing as mp; print(mp.cpu_count())' +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +function dump_interactions { + set -o pipefail + set -eu + + hictk="$1" + resolution="$3" + f="$2" + + if [[ "$f" == *.hic ]]; then + weight=WEIGHT + else + weight=weight + fi + + "$hictk" dump "$f" \ + --balance="$weight" \ + --resolution \ + "$resolution" | + cut -f 3 +} + +function absolute_error { + set -o pipefail + set -eu + + f1="$1" + f2="$2" + + # shellcheck disable=SC2016 + cmd='function abs(v) { + return v < 0 ? -v : v + } + ($2!=$1 && abs($1 - $2) > 1.0e-5) { print $0 } + ' + + # Fail if the absolute error is > 1.0e-5 + if paste "$f1" "$f2" | awk -F '\t' "$cmd" | grep . ; then + return 1 + else + return 0 + fi +} + +function compare_matrices { + set -o pipefail + set -eu + + hictk="$1" + resolution="$4" + f1="$2" + f2="$3" + + 2>&1 echo "Comparing $f1 with $f2..." + if absolute_error \ + <(dump_interactions "$hictk" "$f1" "$resolution") \ + <(dump_interactions "$hictk" "$f2" "$resolution"); then + 2>&1 echo "Files are identical" + return 0 + else + 2>&1 echo "Files differ" + return 1 + fi +} + +export function readlink_py + +status=0 + +if [ $# -ne 2 ]; then + 2>&1 echo "Usage: $0 path_to_hictk juicer_tools.jar" + status=1 +fi + +hictk_bin="$1" +juicer_tools_jar="$2" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +ref_cool="$data_dir/cooler/ENCFF993FGR.2500000.cool" +ref_hic="$data_dir/hic/ENCFF993FGR.hic" + +export PATH="$PATH:$script_dir" + +if ! check_files_exist "$ref_cool" "$ref_hic" "$juicer_tools_jar"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +cp "$ref_cool" "$ref_hic" "$outdir" + +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --chunk-size=100 --mode=cis --force +if ! compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then + status=1 +fi + +"$hictk_bin" balance "$outdir/"*.hic -t $(nproc_py) --chunk-size=100 --mode=cis --force --juicer-tools-jar "$juicer_tools_jar" +if ! compare_matrices "$hictk_bin" "$outdir/"*.hic "$ref_cool" 2500000; then + status=1 +fi + +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --in-memory --mode=cis --force +if ! 
compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then + status=1 +fi + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" diff --git a/test/units/CMakeLists.txt b/test/units/CMakeLists.txt index 08212577..e99d132e 100644 --- a/test/units/CMakeLists.txt +++ b/test/units/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(include) +add_subdirectory(balancing) add_subdirectory(bin_table) add_subdirectory(chromosome) add_subdirectory(cooler) diff --git a/test/units/balancing/CMakeLists.txt b/test/units/balancing/CMakeLists.txt new file mode 100644 index 00000000..4d9f96c7 --- /dev/null +++ b/test/units/balancing/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +find_package(Filesystem REQUIRED) + +include(CTest) +include(Catch) + +add_executable(hictk_balancing_tests) + +target_sources(hictk_balancing_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/balancing_test.cpp) + +target_link_libraries( + hictk_balancing_tests + PRIVATE hictk_project_warnings hictk_project_options + PUBLIC hictk::balancing hictk::file) + +target_link_system_libraries( + hictk_balancing_tests + PUBLIC + Catch2::Catch2WithMain + std::filesystem) + +file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Testing/") + +# automatically discover tests that are defined in catch based test files you can modify the unittests. TEST_PREFIX to +# whatever you want, or use different for different binaries +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[short]" + TEST_SUFFIX + " - SHORT" + WORKING_DIRECTORY + "${PROJECT_SOURCE_DIR}" + OUTPUT_DIR + "${CMAKE_CURRENT_BINARY_DIR}/Testing/" + EXTRA_ARGS + --success + --skip-benchmarks) + +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[medium]" + TEST_SUFFIX + " - MEDIUM" + WORKING_DIRECTORY + "${PROJECT_SOURCE_DIR}" + EXTRA_ARGS + --success + --skip-benchmarks) + +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[long]" + TEST_SUFFIX + " - LONG" + WORKING_DIRECTORY + "${PROJECT_SOURCE_DIR}" + EXTRA_ARGS + --success + --skip-benchmarks) diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp new file mode 100644 index 00000000..7c994305 --- /dev/null +++ b/test/units/balancing/balancing_test.cpp @@ -0,0 +1,262 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +#include "hictk/balancing/ice.hpp" +#include "hictk/balancing/methods.hpp" +#include "hictk/file.hpp" +#include "tmpdir.hpp" + +namespace hictk::test { +inline const std::filesystem::path datadir{"test/data/"}; // NOLINT(cert-err58-cpp) +} // namespace hictk::test + +namespace hictk::test::balancing { + +[[nodiscard]] static std::vector read_weights(const std::filesystem::path& path, + char sep = '\n') { + assert(std::filesystem::exists(path)); + std::ifstream ifs(path); + std::string strbuf; + std::vector buffer{}; + + while (std::getline(ifs, strbuf, sep)) { + buffer.push_back(std::stod(strbuf)); + } + + return buffer; +} + +static void compare_weights(const std::vector& weights, const std::vector& expected, + double tol = 1.0e-6) { + REQUIRE(weights.size() == expected.size()); + + for (std::size_t i = 0; i < weights.size(); ++i) { + if (std::isnan(weights[i])) { + CHECK(std::isnan(expected[i])); + } else { + CHECK_THAT(weights[i], Catch::Matchers::WithinAbs(expected[i], tol)); + } + } +} + +template +static void compare_vectors(const std::vector& v1, const 
std::vector& v2) { + REQUIRE(v1.size() == v2.size()); + + for (std::size_t i = 0; i < v1.size(); ++i) { + CHECK(v1[i] == v2[i]); + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: SparseMatrix", "[balancing][short]") { + using SparseMatrix = hictk::balancing::SparseMatrix; + const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, + Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, + 50}; + // clang-format off + const std::vector> pixels{ + {1, 1, 1}, {1, 2, 2}, {2, 2, 3}, // chr1 + {3, 3, 4}, {3, 4, 5}}; // chr2 + // clang-format on + + SECTION("accessors") { CHECK(SparseMatrix{}.empty()); } + + SECTION("push_back") { + SparseMatrix m{}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + CHECK(m.size() == pixels.size()); + + m.clear(); + CHECK(m.empty()); + } + + SECTION("serde") { + const auto tmpfile = testdir() / "sparse_matrix_serde.bin"; + std::unique_ptr zstd_cctx{ZSTD_createCCtx()}; + std::unique_ptr zstd_dctx{ZSTD_createDCtx()}; + + SECTION("empty matrix") { + std::fstream f{}; + f.open(tmpfile, std::ios::in | std::ios::out | std::ios::trunc); + f.exceptions(std::ios::badbit | std::ios::failbit); + + SparseMatrix m1{}; + SparseMatrix m2{}; + m1.finalize(); + m1.serialize(f, *zstd_cctx); + f.seekg(std::ios::beg); + m2.deserialize(f, *zstd_dctx); + + compare_vectors(m1.bin1_ids(), m2.bin1_ids()); + compare_vectors(m1.bin2_ids(), m2.bin2_ids()); + compare_vectors(m1.counts(), m2.counts()); + } + + SECTION("full matrix") { + SparseMatrix m1{}; + for (const auto& p : pixels) { + m1.push_back(p.bin1_id, p.bin2_id, p.count); + } + m1.finalize(); + + std::fstream f{}; + f.open(tmpfile, std::ios::in | std::ios::out | std::ios::trunc); + f.exceptions(std::ios::badbit | std::ios::failbit); + + SparseMatrix m2{}; + m1.serialize(f, *zstd_cctx); + f.seekg(std::ios::beg); + m2.deserialize(f, *zstd_dctx); + + compare_vectors(m1.bin1_ids(), m2.bin1_ids()); + compare_vectors(m1.bin2_ids(), m2.bin2_ids()); + compare_vectors(m1.counts(), m2.counts()); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: SparseMatrixChunked", "[balancing][short]") { + using SparseMatrixChunked = hictk::balancing::SparseMatrixChunked; + const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, + Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, + 50}; + // clang-format off + const std::vector> pixels{ + {1, 1, 1}, {1, 2, 2}, {2, 2, 3}, // chr1 + {3, 3, 4}, {3, 4, 5}}; // chr2 + // clang-format on + const auto tmpfile = testdir() / "sparse_matrix_chunked.tmp"; + + SECTION("accessors") { CHECK(SparseMatrixChunked{tmpfile, 2, 0}.empty()); } + + SECTION("push_back") { + SparseMatrixChunked m{tmpfile, 2, 0}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + + CHECK(m.size() == pixels.size()); + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE (intra)", "[balancing][short]") { + const std::array, 2> files{ + std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + + const auto tmpfile = testdir() / "balancing_ice_intra.tmp"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); + + SECTION("in-memory") { + constexpr 
auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; + + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE (inter)", "[balancing][medium]") { + const std::array, 2> files{ + std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + + const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); + + SECTION("in-memory") { + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; + + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE (gw)", "[balancing][medium]") { + const std::array, 2> files{ + std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + + const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); + + SECTION("in-memory") { + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; + + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + } + } +} + +} // namespace hictk::test::balancing diff --git a/test/units/cooler/file_weights_test.cpp b/test/units/cooler/file_weights_test.cpp index 5218c0a8..c5e5e4c6 100644 --- a/test/units/cooler/file_weights_test.cpp +++ b/test/units/cooler/file_weights_test.cpp @@ -22,8 +22,8 @@ TEST_CASE("Cooler: read weights", "[cooler][short]") { SECTION("wo/ weights") { CHECK(clr1.avail_normalizations().empty()); } SECTION("w/ weights") { CHECK(clr2.avail_normalizations().size() == 6); - CHECK(clr2.has_weights("SCALE")); 
- CHECK(!clr2.has_weights("FOOBAR")); + CHECK(clr2.has_normalization("SCALE")); + CHECK(!clr2.has_normalization("FOOBAR")); CHECK(clr2.read_weights("SCALE")->type() == hictk::balancing::Weights::Type::DIVISIVE); } @@ -38,7 +38,7 @@ TEST_CASE("Cooler: write weights", "[cooler][short]") { std::filesystem::remove(path2); std::filesystem::remove(path3); std::filesystem::copy(path1, path2); - REQUIRE_FALSE(File(path2.string()).has_weights("weight")); + REQUIRE_FALSE(File(path2.string()).has_normalization("weight")); const auto num_bins = File(path1.string()).bins().size(); diff --git a/test/units/include/tmpdir.hpp b/test/units/include/tmpdir.hpp index 7beddef5..939951f2 100644 --- a/test/units/include/tmpdir.hpp +++ b/test/units/include/tmpdir.hpp @@ -22,6 +22,7 @@ inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert- } // namespace cooler::test::attribute namespace cooler::test::balancing { +inline const auto& testdir = hictk::test::testdir; inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace cooler::test::balancing
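For context on the locking scheme introduced in this patch: MargsVector::add() serializes concurrent updates through a pool of mutexes selected by hashing the bin index, and the pool size is rounded up to a power of two so the modulo in get_mutex_idx() reduces to a bitwise AND. The following is a minimal, self-contained sketch of that sharded-mutex idea, not hictk code: the name ShardedAccumulator is hypothetical, std::hash stands in for XXH3_64bits, and the shard count is hard-coded to 64 instead of being derived from std::thread::hardware_concurrency().

#include <cassert>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class ShardedAccumulator {
  std::vector<double> _values;
  std::vector<std::mutex> _mtxes;

 public:
  explicit ShardedAccumulator(std::size_t size, std::size_t num_shards = 64)
      : _values(size, 0.0), _mtxes(num_shards) {
    // A power-of-two shard count is required by the masking trick in add()
    assert((num_shards & (num_shards - 1)) == 0);
  }

  void add(std::size_t i, double n) {
    // Hash the index so that neighboring bins land on different shards,
    // then mask instead of computing i % _mtxes.size()
    const auto h = std::hash<std::size_t>{}(i);
    const std::scoped_lock lck(_mtxes[h & (_mtxes.size() - 1)]);
    _values[i] += n;
  }

  const std::vector<double>& operator()() const noexcept { return _values; }
};

int main() {
  ShardedAccumulator acc(1000);
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&acc] {
      for (std::size_t i = 0; i < 1000; ++i) {
        acc.add(i, 1.0);
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  assert(acc()[0] == 4.0);  // each of the 4 threads contributed once to every bin
  return 0;
}

Sharding keeps lock contention low without allocating one mutex per bin, at the cost of occasional contention when two hot bins hash to the same shard.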