From 81f51237b95335a76fd92a22bb4b01a61d586253 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 09:09:20 +0200 Subject: [PATCH 01/33] Initial implementation of VC balancing --- .../balancing/include/hictk/balancing/vc.hpp | 27 +++++ src/libhictk/balancing/vc_impl.hpp | 102 ++++++++++++++++++ .../include/hictk/impl/bin_table_impl.hpp | 2 +- .../hictk/hic/impl/pixel_selector_impl.hpp | 13 +++ .../hic/include/hictk/hic/pixel_selector.hpp | 1 + test/units/CMakeLists.txt | 1 + test/units/balancing/CMakeLists.txt | 65 +++++++++++ test/units/balancing/balancing_test.cpp | 69 ++++++++++++ 8 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 src/libhictk/balancing/include/hictk/balancing/vc.hpp create mode 100644 src/libhictk/balancing/vc_impl.hpp create mode 100644 test/units/balancing/CMakeLists.txt create mode 100644 test/units/balancing/balancing_test.cpp diff --git a/src/libhictk/balancing/include/hictk/balancing/vc.hpp b/src/libhictk/balancing/include/hictk/balancing/vc.hpp new file mode 100644 index 00000000..2f37def5 --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/vc.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace hictk::balancing { + +template +class VC { + std::variant, std::vector> _rowsum{}; + std::variant _sum{}; + double _norm_sum{}; + + public: + template + VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, std::size_t binid_offset = 0); + + [[nodiscard]] std::vector get_weights() const; +}; +} // namespace hictk::balancing + +#include "../../../vc_impl.hpp" diff --git a/src/libhictk/balancing/vc_impl.hpp b/src/libhictk/balancing/vc_impl.hpp new file mode 100644 index 00000000..afec1047 --- /dev/null +++ b/src/libhictk/balancing/vc_impl.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + 
+#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +template +template +inline VC::VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, + std::size_t bin_id_offset) { + if constexpr (std::is_floating_point_v) { + _rowsum = std::vector(num_rows, 0); + _sum = 0.0; + } else { + _rowsum = std::vector(num_rows, 0); + _sum = std::int64_t(0); + } + + // Compute rowsum and matrix sum + std::visit( + [&](auto& sum) { + using T = remove_cvref_t; + auto& rowsum = std::get>(_rowsum); + std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(p.count)) { + return; + } + } + const auto bin1_id = p.bin1_id - bin_id_offset; + const auto bin2_id = p.bin2_id - bin_id_offset; + const auto count = conditional_static_cast(p.count); + + rowsum[bin1_id] += count; + + if (bin1_id != bin2_id) { + rowsum[bin2_id] += count; + } + }); + }, + _sum); + + std::visit( + [&](auto& sum) { + using T = remove_cvref_t; + const auto& rowsum = std::get>(_rowsum); + std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { + if constexpr (std::is_floating_point_v) { + if (std::isnan(p.count)) { + return; + } + } + const auto bin1_id = p.bin1_id - bin_id_offset; + const auto bin2_id = p.bin2_id - bin_id_offset; + + const auto rs1 = conditional_static_cast(rowsum[bin1_id]); + const auto rs2 = conditional_static_cast(rowsum[bin2_id]); + if (rs1 == 0 || rs2 == 0) { + return; + } + + const auto count = conditional_static_cast(bin1_id == bin2_id ? 
p.count : 2 * p.count); + sum += count; + _norm_sum += conditional_static_cast(count) / (rs1 * rs2); + }); + }, + _sum); +} + +template +inline std::vector VC::get_weights() const { + std::vector weights; + + const auto scaling_factor = std::visit( + [&](const auto& sum) { return std::sqrt(_norm_sum / conditional_static_cast(sum)); }, + _sum); + + std::visit( + [&](const auto& rowsum) { + weights.reserve(rowsum.size()); + + for (const auto rs : rowsum) { + weights.push_back(conditional_static_cast(rs) * scaling_factor); + } + }, + _rowsum); + + return weights; +} + +} // namespace hictk::balancing diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 739910f5..931453e9 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -105,7 +105,7 @@ inline std::size_t BinTable::size() const noexcept { if (_num_bins_prefix_sum.empty()) { return 0; } - return static_cast(_num_bins_prefix_sum.back()); + return static_cast(_num_bins_prefix_sum.back() - _num_bins_prefix_sum.front()); } inline bool BinTable::empty() const noexcept { return size() == 0; } diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index 32bad1e7..3325fe2d 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -701,6 +701,19 @@ inline std::uint32_t PixelSelectorAll::resolution() const noexcept { inline const BinTable &PixelSelectorAll::bins() const noexcept { return _selectors.front().bins(); } +inline std::vector PixelSelectorAll::weights() const { + std::vector weights_{}; + weights_.reserve(bins().size()); + + std::for_each(_selectors.begin(), _selectors.end(), [&](const PixelSelector &sel) { + if (sel.is_intra()) { + 
weights_.insert(weights_.end(), sel.weights1()().begin(), sel.weights1()().end()); + } + }); + + return weights_; +} + template inline bool PixelSelectorAll::iterator::Pair::operator<(const Pair &other) const noexcept { return first < other.first; diff --git a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp index b35a8e6d..e5e5aecb 100644 --- a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp @@ -205,6 +205,7 @@ class PixelSelectorAll { [[nodiscard]] MatrixUnit unit() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; [[nodiscard]] const BinTable &bins() const noexcept; + [[nodiscard]] std::vector weights() const; template class iterator { diff --git a/test/units/CMakeLists.txt b/test/units/CMakeLists.txt index 08212577..e99d132e 100644 --- a/test/units/CMakeLists.txt +++ b/test/units/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(include) +add_subdirectory(balancing) add_subdirectory(bin_table) add_subdirectory(chromosome) add_subdirectory(cooler) diff --git a/test/units/balancing/CMakeLists.txt b/test/units/balancing/CMakeLists.txt new file mode 100644 index 00000000..600f1d55 --- /dev/null +++ b/test/units/balancing/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +find_package(Filesystem REQUIRED) + +include(CTest) +include(Catch) + +add_executable(hictk_balancing_tests) + +target_sources(hictk_balancing_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/balancing_test.cpp) + +target_link_libraries( + hictk_balancing_tests + PRIVATE hictk_project_warnings hictk_project_options + PUBLIC hictk::balancing hictk::hic hictk::cooler) + +target_link_system_libraries( + hictk_balancing_tests + PUBLIC + Catch2::Catch2WithMain + std::filesystem) + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/Testing/) + +# automatically discover tests that are defined in catch based 
test files you can modify the unittests. TEST_PREFIX to +# whatever you want, or use different for different binaries +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[short]" + TEST_SUFFIX + " - SHORT" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + OUTPUT_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/Testing/ + EXTRA_ARGS + --success + --skip-benchmarks) + +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[medium]" + TEST_SUFFIX + " - MEDIUM" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + EXTRA_ARGS + --success + --skip-benchmarks) + +catch_discover_tests( + hictk_balancing_tests + TEST_SPEC + "[long]" + TEST_SUFFIX + " - LONG" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + EXTRA_ARGS + --success + --skip-benchmarks) diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp new file mode 100644 index 00000000..020ad674 --- /dev/null +++ b/test/units/balancing/balancing_test.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include + +#include "hictk/balancing/methods.hpp" +#include "hictk/balancing/vc.hpp" +#include "hictk/hic.hpp" + +namespace hictk::test { +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hictk::test + +namespace hictk::test::balancing { + +static void compare_weights(const std::vector& weights, const std::vector& expected, + double tol = 1.0e-6) { + REQUIRE(weights.size() == expected.size()); + + for (std::size_t i = 0; i < weights.size(); ++i) { + if (std::isnan(weights[i])) { + CHECK(std::isnan(expected[i])); + } else { + CHECK_THAT(weights[i], Catch::Matchers::WithinAbs(expected[i], tol)); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: VC", "[balancing][short]") { + const auto path = datadir / "ENCFF993FGR.hic"; + + auto hf = hictk::hic::File(path.string(), 2500000); + + SECTION("INTRA") { + for (const auto& chrom : 
hf.chromosomes()) { + if (chrom.is_all()) { + continue; + } + auto sel1 = hf.fetch(chrom.name()); + + const auto num_bins = hf.bins().subset(chrom).size(); + const auto bin_id_offset = hf.bins().at(chrom.name(), 0).id(); + const auto weights = + hictk::balancing::VC(sel1.begin(), sel1.end(), + num_bins, bin_id_offset) + .get_weights(); + + auto sel2 = hf.fetch(chrom.name(), hictk::balancing::Method::VC()); + compare_weights(weights, sel2.weights1()()); + } + } + + SECTION("GW") { + const auto num_bins = hf.bins().size(); + auto sel = hf.fetch(); + const auto weights = hictk::balancing::VC(sel.begin(), + sel.end(), num_bins) + .get_weights(); + + const auto expected = hf.fetch(hictk::balancing::Method::GW_VC()).weights(); + compare_weights(weights, expected); + } +} + +} // namespace hictk::test::balancing From 7e3fc1de4fa31b0bebbbda6d911a3ab39ff9cd75 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:07:12 +0200 Subject: [PATCH 02/33] Initial implementation of ICE balancing --- src/libhictk/balancing/ice_impl.hpp | 155 ++++++++++++++++++ .../balancing/include/hictk/balancing/ice.hpp | 61 +++++++ src/libhictk/balancing/vc_impl.hpp | 1 - test/units/balancing/balancing_test.cpp | 25 ++- 4 files changed, 239 insertions(+), 3 deletions(-) create mode 100644 src/libhictk/balancing/ice_impl.hpp create mode 100644 src/libhictk/balancing/include/hictk/balancing/ice.hpp diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp new file mode 100644 index 00000000..a052e164 --- /dev/null +++ b/src/libhictk/balancing/ice_impl.hpp @@ -0,0 +1,155 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +template +inline ICE::ICE(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, double 
tol, + std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, + [[maybe_unused]] double min_count) + : _biases(num_rows, 1.0) { + auto [bin1_ids, bin2_ids, counts] = + construct_sparse_matrix(first_pixel, last_pixel, num_masked_diags); + std::vector margs(_biases.size()); + + if (min_nnz != 0) { + filter_rows_by_nnz(bin1_ids, bin2_ids, counts, _biases, min_nnz, margs); + } + + // TODO mad-max filter + + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(bin1_ids, bin2_ids, counts, _biases, margs); + _variance = res.variance; + _scale = res.scale; + if (res.variance < tol) { + break; + } + } +} + +template +inline std::tuple, std::vector, std::vector> +ICE::construct_sparse_matrix(PixelIt first_pixel, PixelIt last_pixel, + std::size_t num_masked_diags) { + std::vector bin1_ids{}; + std::vector bin2_ids{}; + std::vector counts{}; + std::for_each(first_pixel, last_pixel, [&](const auto& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + bin1_ids.push_back(p.bin1_id); + bin2_ids.push_back(p.bin2_id); + counts.push_back(p.count); + } + }); + + bin1_ids.shrink_to_fit(); + bin2_ids.shrink_to_fit(); + counts.shrink_to_fit(); + return std::make_tuple(bin1_ids, bin2_ids, counts); +} + +inline void ICE::times_outer_product(const std::vector& bin1_ids, + const std::vector& bin2_ids, + std::vector& counts, + const std::vector& biases) { + assert(bin1_ids.size() == counts.size()); + assert(bin2_ids.size() == counts.size()); + for (std::size_t i = 0; i < counts.size(); ++i) { + const auto i1 = bin1_ids[i]; + const auto i2 = bin2_ids[i]; + counts[i] *= biases[i1] * biases[i2]; + } +} + +inline void ICE::marginalize(const std::vector& bin1_ids, + const std::vector& bin2_ids, std::vector& counts, + std::vector& marg) { + std::fill(marg.begin(), marg.end(), 0); + + for (std::size_t i = 0; i < counts.size(); ++i) { + const auto i1 = bin1_ids[i]; + const auto i2 = bin2_ids[i]; + + marg[i1] += counts[i]; + marg[i2] += counts[i]; + } +} 
+ +inline void ICE::filter_rows_by_nnz(const std::vector& bin1_ids, + const std::vector& bin2_ids, + std::vector counts, std::vector& biases, + std::size_t min_nnz, std::vector& marg_buff) { + std::transform(counts.begin(), counts.end(), counts.begin(), [](const auto n) { return n != 0; }); + marginalize(bin1_ids, bin2_ids, counts, marg_buff); + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg_buff[i] < static_cast(min_nnz)) { + biases[i] = 0; + } + } +} + +inline auto ICE::inner_loop(const std::vector& bin1_ids, + const std::vector& bin2_ids, std::vector counts, + std::vector& biases, std::vector& marg_buffer) + -> Result { + times_outer_product(bin1_ids, bin2_ids, counts, biases); + + marginalize(bin1_ids, bin2_ids, counts, marg_buffer); + + double marg_sum = 0.0; + std::size_t nnz_marg{}; + for (const auto& n : marg_buffer) { + marg_sum += n; + nnz_marg += n != 0; + } + + if (nnz_marg == 0) { + std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); + return {std::numeric_limits::quiet_NaN(), 0.0}; + } + + const auto avg_nzmarg = (marg_sum / static_cast(nnz_marg)); + for (std::size_t i = 0; i < biases.size(); ++i) { + const auto n = marg_buffer[i] / avg_nzmarg; + if (n != 0) { + biases[i] /= n; + } + } + + double ssq_nzmarg = 0.0; + for (const auto n : marg_buffer) { + if (n != 0) { + ssq_nzmarg += std::pow(n - avg_nzmarg, 2); + } + } + const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); + + return {avg_nzmarg, var_nzmarg}; +} + +inline std::vector ICE::get_weights(bool rescale) const { + std::vector biases(_biases.size()); + const auto scale = rescale ? std::sqrt(_scale) : 1.0; + std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { + return n == 0 ? 
std::numeric_limits::quiet_NaN() : n / scale; + }); + return biases; +} + +inline double ICE::scale() const noexcept { return _scale; } +inline double ICE::variance() const noexcept { return _variance; } + +} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp new file mode 100644 index 00000000..d0f2702d --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -0,0 +1,61 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace hictk::balancing { + +class ICE { + std::vector _biases{}; + double _variance{0.0}; + double _scale{std::numeric_limits::quiet_NaN()}; + std::variant _sum{}; + + struct Result { + double scale; + double variance; + }; + + public: + template + ICE(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, double tol = 1.0e-5, + std::size_t max_iters = 200, std::size_t num_masked_diags = 2, std::size_t min_nnz = 10, + double min_count = 0); + + [[nodiscard]] std::vector get_weights(bool rescale = true) const; + [[nodiscard]] double scale() const noexcept; + [[nodiscard]] double variance() const noexcept; + + private: + template + static std::tuple, std::vector, std::vector> + construct_sparse_matrix(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_masked_diags); + + [[nodiscard]] static auto inner_loop(const std::vector& bin1_ids, + const std::vector& bin2_ids, + std::vector counts, std::vector& biases, + std::vector& marg_buffer) -> Result; + + static void times_outer_product(const std::vector& bin1_ids, + const std::vector& bin2_ids, + std::vector& counts, const std::vector& biases); + + static void marginalize(const std::vector& bin1_ids, + const std::vector& bin2_ids, std::vector& counts, + std::vector& marg); + + static void filter_rows_by_nnz(const std::vector& bin1_ids, + const std::vector& bin2_ids, + std::vector counts, 
std::vector& biases, + std::size_t min_nnz, std::vector& marg_buff); + +}; + +} // namespace hictk::balancing + +#include "../../../ice_impl.hpp" diff --git a/src/libhictk/balancing/vc_impl.hpp b/src/libhictk/balancing/vc_impl.hpp index afec1047..88b08e8a 100644 --- a/src/libhictk/balancing/vc_impl.hpp +++ b/src/libhictk/balancing/vc_impl.hpp @@ -4,7 +4,6 @@ #pragma once -#include #include #include #include diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 020ad674..c9e3a480 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -6,12 +6,14 @@ #include #include +#include "hictk/balancing/ice.hpp" #include "hictk/balancing/methods.hpp" #include "hictk/balancing/vc.hpp" +#include "hictk/cooler.hpp" #include "hictk/hic.hpp" namespace hictk::test { -inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +inline const std::filesystem::path datadir{"test/data/"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test namespace hictk::test::balancing { @@ -31,7 +33,7 @@ static void compare_weights(const std::vector& weights, const std::vecto // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Balancing: VC", "[balancing][short]") { - const auto path = datadir / "ENCFF993FGR.hic"; + const auto path = datadir / "hic/ENCFF993FGR.hic"; auto hf = hictk::hic::File(path.string(), 2500000); @@ -66,4 +68,23 @@ TEST_CASE("Balancing: VC", "[balancing][short]") { } } +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE", "[balancing][short]") { + const auto path = datadir / "cooler/ENCFF993FGR.2500000.cool"; + + auto clr = hictk::cooler::File(path.string()); + + SECTION("GW") { + auto sel = clr.fetch(); + + const auto num_bins = clr.bins().size(); + const auto weights = + hictk::balancing::ICE(sel.begin(), sel.end(), num_bins) + .get_weights(); + + const auto expected = (*clr.read_weights("weight"))(); 
+ compare_weights(weights, expected); + } +} + } // namespace hictk::test::balancing From fc703a100f89f8d475532f8a8af6859444e6ced4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 19 Sep 2023 16:59:31 +0200 Subject: [PATCH 03/33] Properly implement ICE cis balancing --- conanfile.txt | 1 + src/libhictk/balancing/CMakeLists.txt | 5 +- src/libhictk/balancing/ice_impl.hpp | 348 +++++++++++++++--- .../balancing/include/hictk/balancing/ice.hpp | 100 +++-- test/units/balancing/balancing_test.cpp | 14 +- 5 files changed, 371 insertions(+), 97 deletions(-) diff --git a/conanfile.txt b/conanfile.txt index 70a9cfda..48aa5ce4 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -13,6 +13,7 @@ highfive/2.7.1#a73bc6937c9add30c9d47a7a70a466eb libdeflate/1.18#3697b637656a9af04cabcbed50db9a7e parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83 +span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22 spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847 [generators] diff --git a/src/libhictk/balancing/CMakeLists.txt b/src/libhictk/balancing/CMakeLists.txt index b7fbd58a..d4fce9b9 100644 --- a/src/libhictk/balancing/CMakeLists.txt +++ b/src/libhictk/balancing/CMakeLists.txt @@ -3,6 +3,7 @@ # SPDX-License-Identifier: MIT find_package(phmap REQUIRED) +find_package(span-lite REQUIRED) add_library(balancing INTERFACE) add_library(hictk::balancing ALIAS balancing) @@ -18,4 +19,6 @@ target_include_directories(balancing INTERFACE "$") target_link_libraries(balancing INTERFACE hictk::common hictk::pixel) -target_link_system_libraries(balancing INTERFACE phmap) +target_link_system_libraries(balancing INTERFACE nonstd::span-lite phmap) + +target_compile_definitions(balancing INTERFACE span_FEATURE_MAKE_SPAN=1) diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp index a052e164..79c6a30b 100644 --- a/src/libhictk/balancing/ice_impl.hpp +++ 
b/src/libhictk/balancing/ice_impl.hpp @@ -7,93 +7,190 @@ #include #include #include +#include #include #include +#include "hictk/cooler/cooler.hpp" #include "hictk/pixel.hpp" #include "hictk/type_traits.hpp" namespace hictk::balancing { -template -inline ICE::ICE(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, double tol, - std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, - [[maybe_unused]] double min_count) - : _biases(num_rows, 1.0) { - auto [bin1_ids, bin2_ids, counts] = - construct_sparse_matrix(first_pixel, last_pixel, num_masked_diags); +template +inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, + double mad_max) + : _chrom_bin_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { + const auto matrix = construct_sparse_matrix(f, type, num_masked_diags); + + initialize_biases(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, _chrom_bin_offsets, + min_nnz, min_count, mad_max); + std::vector margs(_biases.size()); + if (type != Type::cis) { + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); - if (min_nnz != 0) { - filter_rows_by_nnz(bin1_ids, bin2_ids, counts, _biases, min_nnz, margs); + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; + } + } } - // TODO mad-max filter + _variance.resize(_chrom_bin_offsets.size() - 1, 0); + _scale.resize(_chrom_bin_offsets.size() - 1, std::numeric_limits::quiet_NaN()); + for (std::size_t i = 1; i < _chrom_bin_offsets.size(); ++i) { + const auto i0 = matrix.chrom_offsets[i - 1]; + const auto i1 = matrix.chrom_offsets[i]; + + auto bin1_ids_ = nonstd::span(matrix.bin1_ids).subspan(i0, i1 - i0); + auto bin2_ids_ = 
nonstd::span(matrix.bin2_ids).subspan(i0, i1 - i0); + std::vector counts_{}; + std::copy(matrix.counts.begin() + static_cast(i0), + matrix.counts.begin() + static_cast(i1), std::back_inserter(counts_)); - for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = inner_loop(bin1_ids, bin2_ids, counts, _biases, margs); - _variance = res.variance; - _scale = res.scale; - if (res.variance < tol) { - break; + const auto j0 = _chrom_bin_offsets[i - 1]; + const auto j1 = _chrom_bin_offsets[i]; + + auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); + auto margs_ = nonstd::span(margs).subspan(j0, j1 - j0); + for (std::size_t k = 0; k < max_iters; ++k) { + const auto res = inner_loop(bin1_ids_, bin2_ids_, counts_, biases_, margs_, j0); + _variance[i - 1] = res.variance; + _scale[i - 1] = res.scale; + + if (res.variance < tol) { + break; + } } } } -template -inline std::tuple, std::vector, std::vector> -ICE::construct_sparse_matrix(PixelIt first_pixel, PixelIt last_pixel, - std::size_t num_masked_diags) { - std::vector bin1_ids{}; - std::vector bin2_ids{}; - std::vector counts{}; - std::for_each(first_pixel, last_pixel, [&](const auto& p) { +template +auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix { + switch (type) { + case Type::cis: + return construct_sparse_matrix_cis(f, num_masked_diags, bin_offset); + case Type::trans: + return construct_sparse_matrix_trans(f, num_masked_diags, bin_offset); + case Type::gw: + return construct_sparse_matrix_gw(f, num_masked_diags, bin_offset); + } +} + +template +inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix { + SparseMatrix m{}; + + const auto sel = f.fetch(); + std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { if (p.bin2_id - p.bin1_id >= num_masked_diags) { - bin1_ids.push_back(p.bin1_id); - bin2_ids.push_back(p.bin2_id); - 
counts.push_back(p.count); + m.bin1_ids.push_back(p.bin1_id - bin_offset); + m.bin2_ids.push_back(p.bin2_id - bin_offset); + m.counts.push_back(p.count); } }); - bin1_ids.shrink_to_fit(); - bin2_ids.shrink_to_fit(); - counts.shrink_to_fit(); - return std::make_tuple(bin1_ids, bin2_ids, counts); + m.bin1_ids.shrink_to_fit(); + m.bin2_ids.shrink_to_fit(); + m.counts.shrink_to_fit(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, + std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix { + SparseMatrix m{}; + m.chrom_offsets.push_back(0); + + for (const Chromosome& chrom : f.chromosomes()) { + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.bin1_ids.push_back(p.bin1_id - bin_offset); + m.bin2_ids.push_back(p.bin2_id - bin_offset); + + m.counts.push_back(p.count); + } + }); + + m.chrom_offsets.push_back(m.bin1_ids.size()); + } + + m.bin1_ids.shrink_to_fit(); + m.bin2_ids.shrink_to_fit(); + m.counts.shrink_to_fit(); + + return m; } -inline void ICE::times_outer_product(const std::vector& bin1_ids, - const std::vector& bin2_ids, - std::vector& counts, - const std::vector& biases) { +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans( + [[maybe_unused]] const File& f, [[maybe_unused]] std::size_t num_masked_diags, + [[maybe_unused]] std::size_t bin_offset) -> SparseMatrix { + SparseMatrix m{}; + return m; +} + +inline void ICE::times_outer_product(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + std::size_t bin_offset) { assert(bin1_ids.size() == counts.size()); assert(bin2_ids.size() == counts.size()); for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i]; - const auto i2 = bin2_ids[i]; + const auto i1 = bin1_ids[i] - bin_offset; + const auto i2 = bin2_ids[i] - bin_offset; 
counts[i] *= biases[i1] * biases[i2]; } } -inline void ICE::marginalize(const std::vector& bin1_ids, - const std::vector& bin2_ids, std::vector& counts, - std::vector& marg) { +inline void ICE::marginalize(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span marg, + std::size_t bin_offset) { std::fill(marg.begin(), marg.end(), 0); for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i]; - const auto i2 = bin2_ids[i]; + const auto i1 = bin1_ids[i] - bin_offset; + const auto i2 = bin2_ids[i] - bin_offset; marg[i1] += counts[i]; marg[i2] += counts[i]; } } -inline void ICE::filter_rows_by_nnz(const std::vector& bin1_ids, - const std::vector& bin2_ids, - std::vector counts, std::vector& biases, - std::size_t min_nnz, std::vector& marg_buff) { - std::transform(counts.begin(), counts.end(), counts.begin(), [](const auto n) { return n != 0; }); - marginalize(bin1_ids, bin2_ids, counts, marg_buff); +inline void ICE::marginalize_nnz(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span marg, + std::size_t bin_offset) { + std::fill(marg.begin(), marg.end(), 0); + + for (std::size_t i = 0; i < counts.size(); ++i) { + const auto i1 = bin1_ids[i] - bin_offset; + const auto i2 = bin2_ids[i] - bin_offset; + + marg[i1] += counts[i] != 0; + marg[i2] += counts[i] != 0; + } +} + +inline void ICE::min_nnz_filtering(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + std::size_t min_nnz, nonstd::span marg_buff, + std::size_t bin_offset) { + marginalize_nnz(bin1_ids, bin2_ids, counts, marg_buff, bin_offset); for (std::size_t i = 0; i < biases.size(); ++i) { if (marg_buff[i] < static_cast(min_nnz)) { biases[i] = 0; @@ -101,13 +198,96 @@ inline void ICE::filter_rows_by_nnz(const std::vector& bin1_ids, } } -inline auto ICE::inner_loop(const std::vector& bin1_ids, - const std::vector& bin2_ids, std::vector counts, - std::vector& biases, std::vector& marg_buffer) - -> 
Result { - times_outer_product(bin1_ids, bin2_ids, counts, biases); +inline void ICE::min_count_filtering(nonstd::span biases, std::size_t min_count, + const nonstd::span marg) { + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg[i] < static_cast(min_count)) { + biases[i] = 0; + } + } +} + +inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, + nonstd::span biases, std::vector marg, + double mad_max) { + auto median = [](auto v) { + assert(!v.empty()); + + const auto size = static_cast(v.size()); + auto first = v.begin(); + auto mid = first + (size / 2); + auto last = v.end(); + + std::nth_element(first, mid, last); + + if (size % 2 != 0) { + return *mid; + } + + const auto n1 = *mid; + std::nth_element(first, --mid, last); + const auto n2 = *mid; + + return (n1 + n2) / 2; + }; + + auto mad = [&](const auto vin) { + const auto median_ = median(vin); + auto vout = vin; + + std::transform(vout.begin(), vout.end(), vout.begin(), + [&](const auto n) { return std::abs(n - median_); }); + + return median(vout); + }; + + assert(chrom_offsets.size() > 1); + std::vector cmarg{}; + for (std::size_t i = 1; i < chrom_offsets.size(); ++i) { + const auto i0 = static_cast(chrom_offsets[i - 1] - chrom_offsets.front()); + const auto i1 = static_cast(chrom_offsets[i] - chrom_offsets.front()); + + cmarg.clear(); + std::copy_if(marg.begin() + i0, marg.begin() + i1, std::back_inserter(cmarg), + [](const auto n) { return n > 0; }); + + if (!cmarg.empty()) { + const auto median_ = median(cmarg); + std::transform(marg.begin() + i0, marg.begin() + i1, marg.begin() + i0, + [&](const auto n) { return n / median_; }); + } + } + + std::vector log_nz_marg{}; + for (const auto n : marg) { + if (n > 0) { + log_nz_marg.push_back(std::log(n)); + } + } + + if (log_nz_marg.empty()) { + return; + } + + const auto median_log_nz_marg = median(log_nz_marg); + const auto dev_log_nz_marg = mad(log_nz_marg); + + const auto cutoff = std::exp(median_log_nz_marg - mad_max * 
dev_log_nz_marg); + + for (std::size_t i = 0; i < marg.size(); ++i) { + if (marg[i] < cutoff) { + biases[i] = 0.0; + } + } +} + +inline auto ICE::inner_loop(nonstd::span bin1_ids, + nonstd::span bin2_ids, std::vector counts, + nonstd::span biases, nonstd::span marg_buffer, + std::size_t bin_offset) -> Result { + times_outer_product(bin1_ids, bin2_ids, counts, biases, bin_offset); - marginalize(bin1_ids, bin2_ids, counts, marg_buffer); + marginalize(bin1_ids, bin2_ids, counts, marg_buffer, bin_offset); double marg_sum = 0.0; std::size_t nnz_marg{}; @@ -140,16 +320,62 @@ inline auto ICE::inner_loop(const std::vector& bin1_ids, return {avg_nzmarg, var_nzmarg}; } -inline std::vector ICE::get_weights(bool rescale) const { +inline void ICE::initialize_biases(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + nonstd::span chrom_bin_offsets, + std::size_t min_nnz, std::size_t min_count, double mad_max) { + std::vector margs(biases.size()); + if (min_nnz != 0) { + min_nnz_filtering(bin1_ids, bin2_ids, counts, biases, min_nnz, margs); + } + + marginalize(bin1_ids, bin2_ids, counts, margs); + if (min_count != 0) { + min_count_filtering(biases, min_count, margs); + } + + if (mad_max != 0) { + mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); + } +} + +inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { + std::vector buff{0}; + for (const Chromosome& chrom : bins.chromosomes()) { + const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); + buff.push_back(buff.back() + nbins); + } + + return buff; +} + +inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { std::vector biases(_biases.size()); - const auto scale = rescale ? std::sqrt(_scale) : 1.0; - std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { - return n == 0 ? 
std::numeric_limits::quiet_NaN() : n / scale; - }); + if (!rescale) { + return biases; + } + + if (_scale.size() == 1) { + const auto scale = std::sqrt(_scale[0]); + std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } else { + for (std::size_t i = 1; i < _chrom_bin_offsets.size(); ++i) { + const auto i0 = static_cast(_chrom_bin_offsets[i - 1]); + const auto i1 = static_cast(_chrom_bin_offsets[i]); + const auto scale = std::sqrt(_scale[i - 1]); + std::transform(_biases.begin() + i0, _biases.begin() + i1, biases.begin() + i0, + [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } + } return biases; } -inline double ICE::scale() const noexcept { return _scale; } -inline double ICE::variance() const noexcept { return _variance; } +inline std::vector ICE::scale() const noexcept { return _scale; } +inline std::vector ICE::variance() const noexcept { return _variance; } } // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index d0f2702d..fd857d50 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -5,16 +5,18 @@ #pragma once #include -#include +#include #include +#include "hictk/bin_table.hpp" + namespace hictk::balancing { class ICE { + std::vector _chrom_bin_offsets{}; std::vector _biases{}; - double _variance{0.0}; - double _scale{std::numeric_limits::quiet_NaN()}; - std::variant _sum{}; + std::vector _variance{}; + std::vector _scale{}; struct Result { double scale; @@ -22,38 +24,86 @@ class ICE { }; public: + enum Type { cis, trans, gw }; + + template + ICE(const File& f, Type type = Type::gw, double tol = 1.0e-5, std::size_t max_iters = 200, + std::size_t num_masked_diags = 2, std::size_t min_nnz = 10, std::size_t min_count = 0, + double 
mad_max = 5.0); + template - ICE(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, double tol = 1.0e-5, + ICE(PixelIt first_pixel, PixelIt last_pixel, const BinTable& bins, double tol = 1.0e-5, std::size_t max_iters = 200, std::size_t num_masked_diags = 2, std::size_t min_nnz = 10, - double min_count = 0); + std::size_t min_count = 0, double mad_max = 5.0); [[nodiscard]] std::vector get_weights(bool rescale = true) const; - [[nodiscard]] double scale() const noexcept; - [[nodiscard]] double variance() const noexcept; + [[nodiscard]] std::vector scale() const noexcept; + [[nodiscard]] std::vector variance() const noexcept; private: - template - static std::tuple, std::vector, std::vector> - construct_sparse_matrix(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_masked_diags); + struct SparseMatrix { + std::vector bin1_ids{}; + std::vector bin2_ids{}; + std::vector counts{}; + + std::vector chrom_offsets{}; + }; + template + [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, + std::size_t num_masked_diags, + std::size_t bin_offset = 0) -> SparseMatrix; + + template + [[nodiscard]] static auto construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix; + + template + [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, + std::size_t num_masked_diags, + std::size_t bin_offset) -> SparseMatrix; + + [[nodiscard]] static auto inner_loop(nonstd::span bin1_ids, + nonstd::span bin2_ids, + std::vector counts, nonstd::span biases, + nonstd::span marg_buffer, std::size_t bin_offset = 0) + -> Result; + + static void times_outer_product(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + std::size_t bin_offset = 0); - [[nodiscard]] static auto inner_loop(const 
std::vector& bin1_ids, - const std::vector& bin2_ids, - std::vector counts, std::vector& biases, - std::vector& marg_buffer) -> Result; + static void marginalize(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span marg, + std::size_t bin_offset = 0); - static void times_outer_product(const std::vector& bin1_ids, - const std::vector& bin2_ids, - std::vector& counts, const std::vector& biases); + static void marginalize_nnz(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span marg, + std::size_t bin_offset = 0); - static void marginalize(const std::vector& bin1_ids, - const std::vector& bin2_ids, std::vector& counts, - std::vector& marg); + static void min_nnz_filtering(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + std::size_t min_nnz, nonstd::span marg_buff, + std::size_t bin_offset = 0); + static void min_count_filtering(nonstd::span biases, std::size_t min_count, + nonstd::span marg); - static void filter_rows_by_nnz(const std::vector& bin1_ids, - const std::vector& bin2_ids, - std::vector counts, std::vector& biases, - std::size_t min_nnz, std::vector& marg_buff); + static void mad_max_filtering(nonstd::span chrom_offsets, + nonstd::span biases, std::vector marg, + double mad_max); + static void initialize_biases(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + nonstd::span chrom_bin_offsets, + std::size_t min_nnz, std::size_t min_count, double mad_max); + [[nodiscard]] static std::vector read_chrom_bin_offsets(const BinTable& bins); }; } // namespace hictk::balancing diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index c9e3a480..eafd22a2 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -74,16 +74,10 @@ TEST_CASE("Balancing: ICE", "[balancing][short]") { auto clr = hictk::cooler::File(path.string()); - 
SECTION("GW") { - auto sel = clr.fetch(); - - const auto num_bins = clr.bins().size(); - const auto weights = - hictk::balancing::ICE(sel.begin(), sel.end(), num_bins) - .get_weights(); - - const auto expected = (*clr.read_weights("weight"))(); - compare_weights(weights, expected); + SECTION("INTRA") { + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + compare_weights(weights, (*clr.read_weights("weight"))()); } } From abbffe32766b5780aaabe97c121d645375f08943 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 19 Sep 2023 19:43:53 +0200 Subject: [PATCH 04/33] Properly implement ICE trans balancing. Add tests --- cmake/FetchTestDataset.cmake | 4 +- src/libhictk/balancing/ice_impl.hpp | 103 ++++++++++++++++-- .../balancing/include/hictk/balancing/ice.hpp | 20 +++- test/units/balancing/balancing_test.cpp | 39 ++++++- 4 files changed, 147 insertions(+), 19 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 80be7212..4d30a60d 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/record/8204309/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=f9d0ccc8f8c7bade7f9049ea0abb6b26c061e6fed9e1e7bb8f2da0e1a402cfa4 + DOWNLOAD https://www.dropbox.com/s/l6rymg9mezixin6/hictk_test_data.tar.xz?dl=1 + EXPECTED_HASH SHA256=a97c3a66d25c7441154ef15c9b747e69ac1b6a5810a478a67139565ee3ea999c "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp index 79c6a30b..ea28904b 100644 --- a/src/libhictk/balancing/ice_impl.hpp +++ b/src/libhictk/balancing/ice_impl.hpp @@ -22,7 +22,7 @@ inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, 
std::size_t min_count, double mad_max) : _chrom_bin_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { - const auto matrix = construct_sparse_matrix(f, type, num_masked_diags); + auto matrix = construct_sparse_matrix(f, type, num_masked_diags); initialize_biases(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, _chrom_bin_offsets, min_nnz, min_count, mad_max); @@ -32,8 +32,16 @@ inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, _variance.resize(1, 0); _scale.resize(1, std::numeric_limits::quiet_NaN()); + auto weights = type == Type::trans + ? compute_weights_from_chromosome_sizes(f.bins(), _chrom_bin_offsets) + : std::vector{}; + if (type == Type::trans) { + mask_cis_interactions(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _chrom_bin_offsets); + } + for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs); + const auto res = + inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0, weights); _variance[0] = res.variance; _scale[0] = res.scale; if (res.variance < tol) { @@ -78,7 +86,7 @@ auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_mask case Type::cis: return construct_sparse_matrix_cis(f, num_masked_diags, bin_offset); case Type::trans: - return construct_sparse_matrix_trans(f, num_masked_diags, bin_offset); + [[fallthrough]]; case Type::gw: return construct_sparse_matrix_gw(f, num_masked_diags, bin_offset); } @@ -135,23 +143,58 @@ template } template -[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans( - [[maybe_unused]] const File& f, [[maybe_unused]] std::size_t num_masked_diags, - [[maybe_unused]] std::size_t bin_offset) -> SparseMatrix { +[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans(const File& f, std::size_t bin_offset) + -> SparseMatrix { SparseMatrix m{}; + + using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); + + 
std::vector heads{}; + std::vector tails{}; + for (const Chromosome& chrom1 : f.chromosomes()) { + for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); + ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + const auto sel = f.fetch(chrom1.name(), chrom2.name()); + auto first = sel.template begin(); + auto last = sel.template end(); + if (first != last) { + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + } + } + + internal::PixelMerger merger{heads, tails}; + while (true) { + auto pixel = merger.next(); + if (!pixel) { + break; + } + m.bin1_ids.push_back(pixel.bin1_id - bin_offset); + m.bin2_ids.push_back(pixel.bin2_id - bin_offset); + m.counts.push_back(pixel.count); + } + + m.bin1_ids.shrink_to_fit(); + m.bin2_ids.shrink_to_fit(); + m.counts.shrink_to_fit(); + return m; } inline void ICE::times_outer_product(nonstd::span bin1_ids, nonstd::span bin2_ids, nonstd::span counts, nonstd::span biases, - std::size_t bin_offset) { + std::size_t bin_offset, nonstd::span weights) { assert(bin1_ids.size() == counts.size()); assert(bin2_ids.size() == counts.size()); for (std::size_t i = 0; i < counts.size(); ++i) { const auto i1 = bin1_ids[i] - bin_offset; const auto i2 = bin2_ids[i] - bin_offset; - counts[i] *= biases[i1] * biases[i2]; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 
1 : weights[i2]; + counts[i] *= (w1 * biases[i1]) * (w2 * biases[i2]); } } @@ -284,8 +327,8 @@ inline void ICE::mad_max_filtering(nonstd::span chrom_offsets inline auto ICE::inner_loop(nonstd::span bin1_ids, nonstd::span bin2_ids, std::vector counts, nonstd::span biases, nonstd::span marg_buffer, - std::size_t bin_offset) -> Result { - times_outer_product(bin1_ids, bin2_ids, counts, biases, bin_offset); + std::size_t bin_offset, nonstd::span weights) -> Result { + times_outer_product(bin1_ids, bin2_ids, counts, biases, bin_offset, weights); marginalize(bin1_ids, bin2_ids, counts, marg_buffer, bin_offset); @@ -350,6 +393,25 @@ inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { return buff; } +inline std::vector ICE::compute_weights_from_chromosome_sizes( + const BinTable& bins, nonstd::span chrom_bin_offsets) { + std::vector weights(bins.size()); + for (std::uint32_t i = 1; i < chrom_bin_offsets.size(); ++i) { + const auto& chrom = bins.chromosomes().at(i - 1); + const auto i0 = chrom_bin_offsets[i - 1]; + const auto i1 = chrom_bin_offsets[i]; + + const auto nbins = static_cast(bins.size()); + const auto cnbins = + std::ceil(static_cast(chrom.size()) / static_cast(bins.bin_size())); + + for (std::size_t j = i0; j < i1; ++j) { + weights[j] = 1.0 / (1.0 - cnbins / nbins); + } + } + return weights; +} + inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { std::vector biases(_biases.size()); if (!rescale) { @@ -375,6 +437,27 @@ inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const return biases; } +inline void ICE::mask_cis_interactions(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, + nonstd::span chrom_bin_offsets) { + std::size_t j = 0; + for (std::size_t i = 1; i < chrom_bin_offsets.size(); ++i) { + const auto last_bin_id = chrom_bin_offsets[i]; + + while (j < counts.size()) { + if (bin1_ids[j] < last_bin_id) { + if (bin2_ids[j] < last_bin_id) { + counts[j] = 0; + } + } else { + 
break; + } + ++j; + } + } +} + inline std::vector ICE::scale() const noexcept { return _scale; } inline std::vector ICE::variance() const noexcept { return _variance; } diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index fd857d50..511f7b2d 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -61,20 +61,20 @@ class ICE { [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags, std::size_t bin_offset) -> SparseMatrix; template - [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, - std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix; + [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, std::size_t bin_offset) + -> SparseMatrix; [[nodiscard]] static auto inner_loop(nonstd::span bin1_ids, nonstd::span bin2_ids, std::vector counts, nonstd::span biases, - nonstd::span marg_buffer, std::size_t bin_offset = 0) - -> Result; + nonstd::span marg_buffer, std::size_t bin_offset = 0, + nonstd::span weights = {}) -> Result; static void times_outer_product(nonstd::span bin1_ids, nonstd::span bin2_ids, nonstd::span counts, nonstd::span biases, - std::size_t bin_offset = 0); + std::size_t bin_offset = 0, + nonstd::span weights = {}); static void marginalize(nonstd::span bin1_ids, nonstd::span bin2_ids, @@ -104,6 +104,14 @@ class ICE { nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max); [[nodiscard]] static std::vector read_chrom_bin_offsets(const BinTable& bins); + + [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( + const BinTable& bins, nonstd::span chrom_bin_offsets); + + static void mask_cis_interactions(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, + nonstd::span chrom_bin_offsets); }; } // namespace hictk::balancing diff --git 
a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index eafd22a2..37ae70d2 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "hictk/balancing/ice.hpp" #include "hictk/balancing/methods.hpp" @@ -18,6 +19,20 @@ inline const std::filesystem::path datadir{"test/data/"}; // NOLINT(cert-err58- namespace hictk::test::balancing { +[[nodiscard]] static std::vector read_weights(const std::filesystem::path& path, + char sep = '\n') { + assert(std::filesystem::exists(path)); + std::ifstream ifs(path); + std::string strbuf; + std::vector buffer{}; + + while (std::getline(ifs, strbuf, sep)) { + buffer.push_back(std::stod(strbuf)); + } + + return buffer; +} + static void compare_weights(const std::vector& weights, const std::vector& expected, double tol = 1.0e-6) { REQUIRE(weights.size() == expected.size()); @@ -75,9 +90,31 @@ TEST_CASE("Balancing: ICE", "[balancing][short]") { auto clr = hictk::cooler::File(path.string()); SECTION("INTRA") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + constexpr auto type = hictk::balancing::ICE::Type::cis; const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - compare_weights(weights, (*clr.read_weights("weight"))()); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } + SECTION("INTER") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } + SECTION("GW") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + + constexpr auto type = 
hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); } } From d520b14e77ea0d632d29fdbde1b576f438c78d47 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 20 Sep 2023 10:57:20 +0200 Subject: [PATCH 05/33] Perf optimizations - Perform times_outer_product and marginalization in one step - Avoid copying the vector of counts every iteration --- src/libhictk/balancing/ice_impl.hpp | 100 +++++++----------- .../balancing/include/hictk/balancing/ice.hpp | 38 ++++--- 2 files changed, 60 insertions(+), 78 deletions(-) diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp index ea28904b..7eae830f 100644 --- a/src/libhictk/balancing/ice_impl.hpp +++ b/src/libhictk/balancing/ice_impl.hpp @@ -17,6 +17,16 @@ namespace hictk::balancing { +inline bool SparseMatrix::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrix::size() const noexcept { return counts.size(); } + +inline void SparseMatrix::clear() noexcept { + bin1_ids.clear(); + bin2_ids.clear(); + counts.clear(); + chrom_offsets.clear(); +} + template inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, @@ -134,7 +144,6 @@ template m.chrom_offsets.push_back(m.bin1_ids.size()); } - m.bin1_ids.shrink_to_fit(); m.bin2_ids.shrink_to_fit(); m.counts.shrink_to_fit(); @@ -142,74 +151,44 @@ template return m; } -template -[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans(const File& f, std::size_t bin_offset) - -> SparseMatrix { - SparseMatrix m{}; +inline void ICE::marginalize(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span marg, + std::size_t bin_offset) { + std::fill(marg.begin(), marg.end(), 0); - using PixelIt = 
decltype(f.fetch("chr1", "chr2").template begin()); - - std::vector heads{}; - std::vector tails{}; - for (const Chromosome& chrom1 : f.chromosomes()) { - for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); - ++chrom2_id) { - const auto& chrom2 = f.chromosomes().at(chrom2_id); - const auto sel = f.fetch(chrom1.name(), chrom2.name()); - auto first = sel.template begin(); - auto last = sel.template end(); - if (first != last) { - heads.emplace_back(std::move(first)); - tails.emplace_back(std::move(last)); - } - } - } + for (std::size_t i = 0; i < counts.size(); ++i) { + const auto i1 = bin1_ids[i] - bin_offset; + const auto i2 = bin2_ids[i] - bin_offset; - internal::PixelMerger merger{heads, tails}; - while (true) { - auto pixel = merger.next(); - if (!pixel) { - break; - } - m.bin1_ids.push_back(pixel.bin1_id - bin_offset); - m.bin2_ids.push_back(pixel.bin2_id - bin_offset); - m.counts.push_back(pixel.count); + marg[i1] += counts[i]; + marg[i2] += counts[i]; } - - m.bin1_ids.shrink_to_fit(); - m.bin2_ids.shrink_to_fit(); - m.counts.shrink_to_fit(); - - return m; } -inline void ICE::times_outer_product(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - std::size_t bin_offset, nonstd::span weights) { +inline void ICE::times_outer_product_marg(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, + nonstd::span biases, + nonstd::span marg, std::size_t bin_offset, + nonstd::span weights) { assert(bin1_ids.size() == counts.size()); assert(bin2_ids.size() == counts.size()); - for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i] - bin_offset; - const auto i2 = bin2_ids[i] - bin_offset; - const auto w1 = weights.empty() ? 1 : weights[i1]; - const auto w2 = weights.empty() ? 
1 : weights[i2]; - counts[i] *= (w1 * biases[i1]) * (w2 * biases[i2]); - } -} -inline void ICE::marginalize(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span marg, - std::size_t bin_offset) { + assert(biases.size() == marg.size()); + assert(biases.size() == weights.size() || weights.empty()); + std::fill(marg.begin(), marg.end(), 0); for (std::size_t i = 0; i < counts.size(); ++i) { const auto i1 = bin1_ids[i] - bin_offset; const auto i2 = bin2_ids[i] - bin_offset; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 1 : weights[i2]; + const auto count = counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); - marg[i1] += counts[i]; - marg[i2] += counts[i]; + marg[i1] += count; + marg[i2] += count; } } @@ -325,12 +304,11 @@ inline void ICE::mad_max_filtering(nonstd::span chrom_offsets } inline auto ICE::inner_loop(nonstd::span bin1_ids, - nonstd::span bin2_ids, std::vector counts, - nonstd::span biases, nonstd::span marg_buffer, - std::size_t bin_offset, nonstd::span weights) -> Result { - times_outer_product(bin1_ids, bin2_ids, counts, biases, bin_offset, weights); - - marginalize(bin1_ids, bin2_ids, counts, marg_buffer, bin_offset); + nonstd::span bin2_ids, + nonstd::span counts, nonstd::span biases, + nonstd::span marg_buffer, std::size_t bin_offset, + nonstd::span weights) -> Result { + times_outer_product_marg(bin1_ids, bin2_ids, counts, biases, marg_buffer, bin_offset, weights); double marg_sum = 0.0; std::size_t nnz_marg{}; diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 511f7b2d..b748206f 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -12,6 +12,18 @@ namespace hictk::balancing { +struct SparseMatrix { + std::vector bin1_ids{}; // NOLINT + std::vector bin2_ids{}; // NOLINT + std::vector counts{}; // NOLINT + + std::vector 
chrom_offsets{}; // NOLINT + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + void clear() noexcept; +}; + class ICE { std::vector _chrom_bin_offsets{}; std::vector _biases{}; @@ -41,13 +53,6 @@ class ICE { [[nodiscard]] std::vector variance() const noexcept; private: - struct SparseMatrix { - std::vector bin1_ids{}; - std::vector bin2_ids{}; - std::vector counts{}; - - std::vector chrom_offsets{}; - }; template [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags, @@ -60,27 +65,26 @@ class ICE { template [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags, std::size_t bin_offset) -> SparseMatrix; - template - [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, std::size_t bin_offset) - -> SparseMatrix; [[nodiscard]] static auto inner_loop(nonstd::span bin1_ids, nonstd::span bin2_ids, - std::vector counts, nonstd::span biases, + nonstd::span counts, + nonstd::span biases, nonstd::span marg_buffer, std::size_t bin_offset = 0, nonstd::span weights = {}) -> Result; - static void times_outer_product(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - std::size_t bin_offset = 0, - nonstd::span weights = {}); - static void marginalize(nonstd::span bin1_ids, nonstd::span bin2_ids, nonstd::span counts, nonstd::span marg, std::size_t bin_offset = 0); + static void times_outer_product_marg(nonstd::span bin1_ids, + nonstd::span bin2_ids, + nonstd::span counts, + nonstd::span biases, nonstd::span marg, + std::size_t bin_offset, + nonstd::span weights = {}); + static void marginalize_nnz(nonstd::span bin1_ids, nonstd::span bin2_ids, nonstd::span counts, nonstd::span marg, From 7d94262d1c3925101169f34a5ed28b197ac382a6 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 20 Sep 2023 11:53:20 +0200 Subject: [PATCH 06/33] 
Refactor --- src/libhictk/balancing/ice_impl.hpp | 71 ++++++++++++++----- .../balancing/include/hictk/balancing/ice.hpp | 7 +- 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp index 7eae830f..b8babb25 100644 --- a/src/libhictk/balancing/ice_impl.hpp +++ b/src/libhictk/balancing/ice_impl.hpp @@ -4,6 +4,8 @@ #pragma once +#include + #include #include #include @@ -37,31 +39,61 @@ inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, initialize_biases(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, _chrom_bin_offsets, min_nnz, min_count, mad_max); + switch (type) { + case Type::gw: + balance_gw(matrix, max_iters, tol); + break; + case Type::cis: + balance_cis(matrix, f.bins(), max_iters, tol); + break; + case Type::trans: + balance_trans(matrix, f.bins(), max_iters, tol); + } +} + +void ICE::balance_gw(const SparseMatrix& matrix, std::size_t max_iters, double tol) { std::vector margs(_biases.size()); - if (type != Type::cis) { - _variance.resize(1, 0); - _scale.resize(1, std::numeric_limits::quiet_NaN()); - - auto weights = type == Type::trans - ? 
compute_weights_from_chromosome_sizes(f.bins(), _chrom_bin_offsets) - : std::vector{}; - if (type == Type::trans) { - mask_cis_interactions(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _chrom_bin_offsets); + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; } + } +} - for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = - inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0, weights); - _variance[0] = res.variance; - _scale[0] = res.scale; - if (res.variance < tol) { - return; - } +void ICE::balance_trans(SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, + double tol) { + std::vector margs(_biases.size()); + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + const auto weights = compute_weights_from_chromosome_sizes(bins, _chrom_bin_offsets); + mask_cis_interactions(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _chrom_bin_offsets); + + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = + inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0, weights); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; } } +} +void ICE::balance_cis(const SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, + double tol) { _variance.resize(_chrom_bin_offsets.size() - 1, 0); _scale.resize(_chrom_bin_offsets.size() - 1, std::numeric_limits::quiet_NaN()); + + std::vector margs(_biases.size()); + for (std::size_t i = 1; i < _chrom_bin_offsets.size(); ++i) { const auto i0 = matrix.chrom_offsets[i - 1]; const 
auto i1 = matrix.chrom_offsets[i]; @@ -79,6 +111,9 @@ inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, auto margs_ = nonstd::span(margs).subspan(j0, j1 - j0); for (std::size_t k = 0; k < max_iters; ++k) { const auto res = inner_loop(bin1_ids_, bin2_ids_, counts_, biases_, margs_, j0); + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), + bins.chromosomes().at(static_cast(i - 1)).name(), k + 1, + res.variance); _variance[i - 1] = res.variance; _scale[i - 1] = res.scale; @@ -307,7 +342,7 @@ inline auto ICE::inner_loop(nonstd::span bin1_ids, nonstd::span bin2_ids, nonstd::span counts, nonstd::span biases, nonstd::span marg_buffer, std::size_t bin_offset, - nonstd::span weights) -> Result { + nonstd::span weights) -> Result { times_outer_product_marg(bin1_ids, bin2_ids, counts, biases, marg_buffer, bin_offset, weights); double marg_sum = 0.0; diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index b748206f..33f3bdd4 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -53,6 +53,11 @@ class ICE { [[nodiscard]] std::vector variance() const noexcept; private: + void balance_gw(const SparseMatrix& matrix, std::size_t max_iters, double tol); + void balance_cis(const SparseMatrix& matrix, [[maybe_unused]] const BinTable& bins, + std::size_t max_iters, double tol); + void balance_trans(SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, double tol); + template [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags, @@ -71,7 +76,7 @@ class ICE { nonstd::span counts, nonstd::span biases, nonstd::span marg_buffer, std::size_t bin_offset = 0, - nonstd::span weights = {}) -> Result; + nonstd::span weights = {}) -> Result; static void marginalize(nonstd::span bin1_ids, nonstd::span bin2_ids, From 44636085bf44011683876af0b8c8fca82bcb717e Mon 
Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:15:30 +0200 Subject: [PATCH 07/33] Implement chunked ICE balancing --- conanfile.txt | 1 + src/libhictk/balancing/CMakeLists.txt | 17 +- src/libhictk/balancing/ice_impl.hpp | 477 --------------- .../balancing/include/hictk/balancing/ice.hpp | 152 ++--- .../hictk/balancing/impl/ice_mem_impl.hpp | 542 ++++++++++++++++++ .../balancing/impl/sparse_matrix_impl.hpp | 437 ++++++++++++++ .../hictk/balancing/impl}/vc_impl.hpp | 0 .../include/hictk/balancing/sparse_matrix.hpp | 161 ++++++ .../balancing/include/hictk/balancing/vc.hpp | 2 +- .../pixel/include/hictk/impl/pixel_impl.hpp | 50 ++ src/libhictk/pixel/include/hictk/pixel.hpp | 23 + test/units/balancing/balancing_test.cpp | 83 ++- test/units/include/tmpdir.hpp | 1 + 13 files changed, 1369 insertions(+), 577 deletions(-) delete mode 100644 src/libhictk/balancing/ice_impl.hpp create mode 100644 src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp create mode 100644 src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp rename src/libhictk/balancing/{ => include/hictk/balancing/impl}/vc_impl.hpp (100%) create mode 100644 src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp diff --git a/conanfile.txt b/conanfile.txt index 48aa5ce4..58da529e 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -15,6 +15,7 @@ parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83 span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22 spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847 +zstd/1.5.5#93372fe14bb7883bd4de82914e0a1841 [generators] CMakeDeps diff --git a/src/libhictk/balancing/CMakeLists.txt b/src/libhictk/balancing/CMakeLists.txt index d4fce9b9..1cb6f0ea 100644 --- a/src/libhictk/balancing/CMakeLists.txt +++ b/src/libhictk/balancing/CMakeLists.txt @@ -4,21 +4,24 @@ find_package(phmap REQUIRED) find_package(span-lite REQUIRED) 
+find_package(zstd REQUIRED) add_library(balancing INTERFACE) add_library(hictk::balancing ALIAS balancing) target_sources( - balancing - INTERFACE FILE_SET - HEADERS - BASE_DIRS - "${CMAKE_CURRENT_SOURCE_DIR}/include") + balancing + INTERFACE FILE_SET + HEADERS + BASE_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/include") target_include_directories(balancing INTERFACE "$" - "$") + "$") target_link_libraries(balancing INTERFACE hictk::common hictk::pixel) -target_link_system_libraries(balancing INTERFACE nonstd::span-lite phmap) +target_link_system_libraries(balancing INTERFACE nonstd::span-lite phmap + "zstd::libzstd_$,shared,static>" +) target_compile_definitions(balancing INTERFACE span_FEATURE_MAKE_SPAN=1) diff --git a/src/libhictk/balancing/ice_impl.hpp b/src/libhictk/balancing/ice_impl.hpp deleted file mode 100644 index b8babb25..00000000 --- a/src/libhictk/balancing/ice_impl.hpp +++ /dev/null @@ -1,477 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -#include "hictk/cooler/cooler.hpp" -#include "hictk/pixel.hpp" -#include "hictk/type_traits.hpp" - -namespace hictk::balancing { - -inline bool SparseMatrix::empty() const noexcept { return size() == 0; } -inline std::size_t SparseMatrix::size() const noexcept { return counts.size(); } - -inline void SparseMatrix::clear() noexcept { - bin1_ids.clear(); - bin2_ids.clear(); - counts.clear(); - chrom_offsets.clear(); -} - -template -inline ICE::ICE(const File& f, Type type, double tol, std::size_t max_iters, - std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, - double mad_max) - : _chrom_bin_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { - auto matrix = construct_sparse_matrix(f, type, num_masked_diags); - - initialize_biases(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, _chrom_bin_offsets, - min_nnz, min_count, mad_max); - - switch 
(type) { - case Type::gw: - balance_gw(matrix, max_iters, tol); - break; - case Type::cis: - balance_cis(matrix, f.bins(), max_iters, tol); - break; - case Type::trans: - balance_trans(matrix, f.bins(), max_iters, tol); - } -} - -void ICE::balance_gw(const SparseMatrix& matrix, std::size_t max_iters, double tol) { - std::vector margs(_biases.size()); - _variance.resize(1, 0); - _scale.resize(1, std::numeric_limits::quiet_NaN()); - - for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0); - SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); - _variance[0] = res.variance; - _scale[0] = res.scale; - if (res.variance < tol) { - return; - } - } -} - -void ICE::balance_trans(SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, - double tol) { - std::vector margs(_biases.size()); - _variance.resize(1, 0); - _scale.resize(1, std::numeric_limits::quiet_NaN()); - const auto weights = compute_weights_from_chromosome_sizes(bins, _chrom_bin_offsets); - mask_cis_interactions(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _chrom_bin_offsets); - - for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = - inner_loop(matrix.bin1_ids, matrix.bin2_ids, matrix.counts, _biases, margs, 0, weights); - SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); - _variance[0] = res.variance; - _scale[0] = res.scale; - if (res.variance < tol) { - return; - } - } -} - -void ICE::balance_cis(const SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, - double tol) { - _variance.resize(_chrom_bin_offsets.size() - 1, 0); - _scale.resize(_chrom_bin_offsets.size() - 1, std::numeric_limits::quiet_NaN()); - - std::vector margs(_biases.size()); - - for (std::size_t i = 1; i < _chrom_bin_offsets.size(); ++i) { - const auto i0 = matrix.chrom_offsets[i - 1]; - const auto i1 = matrix.chrom_offsets[i]; - - auto bin1_ids_ = 
nonstd::span(matrix.bin1_ids).subspan(i0, i1 - i0); - auto bin2_ids_ = nonstd::span(matrix.bin2_ids).subspan(i0, i1 - i0); - std::vector counts_{}; - std::copy(matrix.counts.begin() + static_cast(i0), - matrix.counts.begin() + static_cast(i1), std::back_inserter(counts_)); - - const auto j0 = _chrom_bin_offsets[i - 1]; - const auto j1 = _chrom_bin_offsets[i]; - - auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); - auto margs_ = nonstd::span(margs).subspan(j0, j1 - j0); - for (std::size_t k = 0; k < max_iters; ++k) { - const auto res = inner_loop(bin1_ids_, bin2_ids_, counts_, biases_, margs_, j0); - SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), - bins.chromosomes().at(static_cast(i - 1)).name(), k + 1, - res.variance); - _variance[i - 1] = res.variance; - _scale[i - 1] = res.scale; - - if (res.variance < tol) { - break; - } - } - } -} - -template -auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix { - switch (type) { - case Type::cis: - return construct_sparse_matrix_cis(f, num_masked_diags, bin_offset); - case Type::trans: - [[fallthrough]]; - case Type::gw: - return construct_sparse_matrix_gw(f, num_masked_diags, bin_offset); - } -} - -template -inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix { - SparseMatrix m{}; - - const auto sel = f.fetch(); - std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { - if (p.bin2_id - p.bin1_id >= num_masked_diags) { - m.bin1_ids.push_back(p.bin1_id - bin_offset); - m.bin2_ids.push_back(p.bin2_id - bin_offset); - m.counts.push_back(p.count); - } - }); - - m.bin1_ids.shrink_to_fit(); - m.bin2_ids.shrink_to_fit(); - m.counts.shrink_to_fit(); - - return m; -} - -template -[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, - std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix { - SparseMatrix 
m{}; - m.chrom_offsets.push_back(0); - - for (const Chromosome& chrom : f.chromosomes()) { - const auto sel = f.fetch(chrom.name()); - std::for_each(sel.template begin(), sel.template end(), - [&](const ThinPixel& p) { - if (p.bin2_id - p.bin1_id >= num_masked_diags) { - m.bin1_ids.push_back(p.bin1_id - bin_offset); - m.bin2_ids.push_back(p.bin2_id - bin_offset); - - m.counts.push_back(p.count); - } - }); - - m.chrom_offsets.push_back(m.bin1_ids.size()); - } - m.bin1_ids.shrink_to_fit(); - m.bin2_ids.shrink_to_fit(); - m.counts.shrink_to_fit(); - - return m; -} - -inline void ICE::marginalize(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span marg, - std::size_t bin_offset) { - std::fill(marg.begin(), marg.end(), 0); - - for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i] - bin_offset; - const auto i2 = bin2_ids[i] - bin_offset; - - marg[i1] += counts[i]; - marg[i2] += counts[i]; - } -} - -inline void ICE::times_outer_product_marg(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, - nonstd::span biases, - nonstd::span marg, std::size_t bin_offset, - nonstd::span weights) { - assert(bin1_ids.size() == counts.size()); - assert(bin2_ids.size() == counts.size()); - - assert(biases.size() == marg.size()); - assert(biases.size() == weights.size() || weights.empty()); - - std::fill(marg.begin(), marg.end(), 0); - - for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i] - bin_offset; - const auto i2 = bin2_ids[i] - bin_offset; - const auto w1 = weights.empty() ? 1 : weights[i1]; - const auto w2 = weights.empty() ? 
1 : weights[i2]; - const auto count = counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); - - marg[i1] += count; - marg[i2] += count; - } -} - -inline void ICE::marginalize_nnz(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span marg, - std::size_t bin_offset) { - std::fill(marg.begin(), marg.end(), 0); - - for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i] - bin_offset; - const auto i2 = bin2_ids[i] - bin_offset; - - marg[i1] += counts[i] != 0; - marg[i2] += counts[i] != 0; - } -} - -inline void ICE::min_nnz_filtering(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - std::size_t min_nnz, nonstd::span marg_buff, - std::size_t bin_offset) { - marginalize_nnz(bin1_ids, bin2_ids, counts, marg_buff, bin_offset); - for (std::size_t i = 0; i < biases.size(); ++i) { - if (marg_buff[i] < static_cast(min_nnz)) { - biases[i] = 0; - } - } -} - -inline void ICE::min_count_filtering(nonstd::span biases, std::size_t min_count, - const nonstd::span marg) { - for (std::size_t i = 0; i < biases.size(); ++i) { - if (marg[i] < static_cast(min_count)) { - biases[i] = 0; - } - } -} - -inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, - nonstd::span biases, std::vector marg, - double mad_max) { - auto median = [](auto v) { - assert(!v.empty()); - - const auto size = static_cast(v.size()); - auto first = v.begin(); - auto mid = first + (size / 2); - auto last = v.end(); - - std::nth_element(first, mid, last); - - if (size % 2 != 0) { - return *mid; - } - - const auto n1 = *mid; - std::nth_element(first, --mid, last); - const auto n2 = *mid; - - return (n1 + n2) / 2; - }; - - auto mad = [&](const auto vin) { - const auto median_ = median(vin); - auto vout = vin; - - std::transform(vout.begin(), vout.end(), vout.begin(), - [&](const auto n) { return std::abs(n - median_); }); - - return median(vout); - }; - - assert(chrom_offsets.size() > 1); - std::vector cmarg{}; - for 
(std::size_t i = 1; i < chrom_offsets.size(); ++i) { - const auto i0 = static_cast(chrom_offsets[i - 1] - chrom_offsets.front()); - const auto i1 = static_cast(chrom_offsets[i] - chrom_offsets.front()); - - cmarg.clear(); - std::copy_if(marg.begin() + i0, marg.begin() + i1, std::back_inserter(cmarg), - [](const auto n) { return n > 0; }); - - if (!cmarg.empty()) { - const auto median_ = median(cmarg); - std::transform(marg.begin() + i0, marg.begin() + i1, marg.begin() + i0, - [&](const auto n) { return n / median_; }); - } - } - - std::vector log_nz_marg{}; - for (const auto n : marg) { - if (n > 0) { - log_nz_marg.push_back(std::log(n)); - } - } - - if (log_nz_marg.empty()) { - return; - } - - const auto median_log_nz_marg = median(log_nz_marg); - const auto dev_log_nz_marg = mad(log_nz_marg); - - const auto cutoff = std::exp(median_log_nz_marg - mad_max * dev_log_nz_marg); - - for (std::size_t i = 0; i < marg.size(); ++i) { - if (marg[i] < cutoff) { - biases[i] = 0.0; - } - } -} - -inline auto ICE::inner_loop(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - nonstd::span marg_buffer, std::size_t bin_offset, - nonstd::span weights) -> Result { - times_outer_product_marg(bin1_ids, bin2_ids, counts, biases, marg_buffer, bin_offset, weights); - - double marg_sum = 0.0; - std::size_t nnz_marg{}; - for (const auto& n : marg_buffer) { - marg_sum += n; - nnz_marg += n != 0; - } - - if (nnz_marg == 0) { - std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); - return {std::numeric_limits::quiet_NaN(), 0.0}; - } - - const auto avg_nzmarg = (marg_sum / static_cast(nnz_marg)); - for (std::size_t i = 0; i < biases.size(); ++i) { - const auto n = marg_buffer[i] / avg_nzmarg; - if (n != 0) { - biases[i] /= n; - } - } - - double ssq_nzmarg = 0.0; - for (const auto n : marg_buffer) { - if (n != 0) { - ssq_nzmarg += std::pow(n - avg_nzmarg, 2); - } - } - const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); 
- - return {avg_nzmarg, var_nzmarg}; -} - -inline void ICE::initialize_biases(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - nonstd::span chrom_bin_offsets, - std::size_t min_nnz, std::size_t min_count, double mad_max) { - std::vector margs(biases.size()); - if (min_nnz != 0) { - min_nnz_filtering(bin1_ids, bin2_ids, counts, biases, min_nnz, margs); - } - - marginalize(bin1_ids, bin2_ids, counts, margs); - if (min_count != 0) { - min_count_filtering(biases, min_count, margs); - } - - if (mad_max != 0) { - mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); - } -} - -inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { - std::vector buff{0}; - for (const Chromosome& chrom : bins.chromosomes()) { - const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - buff.push_back(buff.back() + nbins); - } - - return buff; -} - -inline std::vector ICE::compute_weights_from_chromosome_sizes( - const BinTable& bins, nonstd::span chrom_bin_offsets) { - std::vector weights(bins.size()); - for (std::uint32_t i = 1; i < chrom_bin_offsets.size(); ++i) { - const auto& chrom = bins.chromosomes().at(i - 1); - const auto i0 = chrom_bin_offsets[i - 1]; - const auto i1 = chrom_bin_offsets[i]; - - const auto nbins = static_cast(bins.size()); - const auto cnbins = - std::ceil(static_cast(chrom.size()) / static_cast(bins.bin_size())); - - for (std::size_t j = i0; j < i1; ++j) { - weights[j] = 1.0 / (1.0 - cnbins / nbins); - } - } - return weights; -} - -inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { - std::vector biases(_biases.size()); - if (!rescale) { - return biases; - } - - if (_scale.size() == 1) { - const auto scale = std::sqrt(_scale[0]); - std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { - return n == 0 ? 
std::numeric_limits::quiet_NaN() : n / scale; - }); - } else { - for (std::size_t i = 1; i < _chrom_bin_offsets.size(); ++i) { - const auto i0 = static_cast(_chrom_bin_offsets[i - 1]); - const auto i1 = static_cast(_chrom_bin_offsets[i]); - const auto scale = std::sqrt(_scale[i - 1]); - std::transform(_biases.begin() + i0, _biases.begin() + i1, biases.begin() + i0, - [&](const auto n) { - return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; - }); - } - } - return biases; -} - -inline void ICE::mask_cis_interactions(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, - nonstd::span chrom_bin_offsets) { - std::size_t j = 0; - for (std::size_t i = 1; i < chrom_bin_offsets.size(); ++i) { - const auto last_bin_id = chrom_bin_offsets[i]; - - while (j < counts.size()) { - if (bin1_ids[j] < last_bin_id) { - if (bin2_ids[j] < last_bin_id) { - counts[j] = 0; - } - } else { - break; - } - ++j; - } - } -} - -inline std::vector ICE::scale() const noexcept { return _scale; } -inline std::vector ICE::variance() const noexcept { return _variance; } - -} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 33f3bdd4..2729aac3 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -8,121 +8,129 @@ #include #include +#include "hictk/balancing/sparse_matrix.hpp" #include "hictk/bin_table.hpp" namespace hictk::balancing { -struct SparseMatrix { - std::vector bin1_ids{}; // NOLINT - std::vector bin2_ids{}; // NOLINT - std::vector counts{}; // NOLINT - - std::vector chrom_offsets{}; // NOLINT - - [[nodiscard]] bool empty() const noexcept; - [[nodiscard]] std::size_t size() const noexcept; - void clear() noexcept; -}; - class ICE { - std::vector _chrom_bin_offsets{}; + std::vector _chrom_offsets{}; std::vector _biases{}; std::vector _variance{}; std::vector _scale{}; struct Result 
{ - double scale; - double variance; + double scale{}; + double variance{}; }; public: enum Type { cis, trans, gw }; - template - ICE(const File& f, Type type = Type::gw, double tol = 1.0e-5, std::size_t max_iters = 200, - std::size_t num_masked_diags = 2, std::size_t min_nnz = 10, std::size_t min_count = 0, - double mad_max = 5.0); + struct Params { + double tol{1.0e-6}; + std::size_t max_iters{200}; + std::size_t num_masked_diags{2}; + std::size_t min_nnz{10}; + std::size_t min_count{0}; + double mad_max{5.0}; + std::filesystem::path tmpfile{}; + std::size_t chunk_size{10'000'000}; + }; - template - ICE(PixelIt first_pixel, PixelIt last_pixel, const BinTable& bins, double tol = 1.0e-5, - std::size_t max_iters = 200, std::size_t num_masked_diags = 2, std::size_t min_nnz = 10, - std::size_t min_count = 0, double mad_max = 5.0); + inline static const Params DefaultParams{1.0e-6, 200, 2, 10, 0, 5.0, "", 10'000'000}; // NOLINT + + template + explicit ICE(const File& f, Type type = Type::gw, const Params& params = DefaultParams); [[nodiscard]] std::vector get_weights(bool rescale = true) const; [[nodiscard]] std::vector scale() const noexcept; [[nodiscard]] std::vector variance() const noexcept; private: - void balance_gw(const SparseMatrix& matrix, std::size_t max_iters, double tol); - void balance_cis(const SparseMatrix& matrix, [[maybe_unused]] const BinTable& bins, - std::size_t max_iters, double tol); - void balance_trans(SparseMatrix& matrix, const BinTable& bins, std::size_t max_iters, double tol); + template + void balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, + double mad_max); + + template + void balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, + double mad_max, const std::filesystem::path& tmpfile, + std::size_t chunk_size); + + template + void 
balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol); + + template + void balance_cis(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, double tol); + + template + void balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, + double tol); template [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, - std::size_t num_masked_diags, - std::size_t bin_offset = 0) -> SparseMatrix; + std::size_t num_masked_diags) -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) + -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags) + -> SparseMatrix; + template + [[nodiscard]] static auto construct_sparse_matrix_trans(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix; template - [[nodiscard]] static auto construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix; + [[nodiscard]] static auto construct_sparse_matrix_chunked(const File& f, Type type, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked; + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_gw(const File& f, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked; + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_cis( + const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked; template - [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags, - std::size_t bin_offset) -> SparseMatrix; - - [[nodiscard]] static auto inner_loop(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, - nonstd::span biases, - nonstd::span 
marg_buffer, std::size_t bin_offset = 0, + [[nodiscard]] static auto construct_sparse_matrix_chunked_trans( + const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked; + + template + [[nodiscard]] static auto inner_loop(const MatrixT& matrix, nonstd::span biases, nonstd::span weights = {}) -> Result; - static void marginalize(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span marg, - std::size_t bin_offset = 0); - - static void times_outer_product_marg(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, - nonstd::span biases, nonstd::span marg, - std::size_t bin_offset, - nonstd::span weights = {}); - - static void marginalize_nnz(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span marg, - std::size_t bin_offset = 0); - - static void min_nnz_filtering(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, - std::size_t min_nnz, nonstd::span marg_buff, - std::size_t bin_offset = 0); + template + static void min_nnz_filtering(const MatrixT& matrix, nonstd::span biases, + std::size_t min_nnz); + static void min_count_filtering(nonstd::span biases, std::size_t min_count, - nonstd::span marg); + nonstd::span marg); static void mad_max_filtering(nonstd::span chrom_offsets, - nonstd::span biases, std::vector marg, + nonstd::span biases, nonstd::span marg, double mad_max); - static void initialize_biases(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, nonstd::span biases, + template + static void initialize_biases(const MatrixT& matrix, nonstd::span biases, nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max); + [[nodiscard]] static std::vector read_chrom_bin_offsets(const BinTable& bins); [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( const BinTable& bins, nonstd::span chrom_bin_offsets); - - 
static void mask_cis_interactions(nonstd::span bin1_ids, - nonstd::span bin2_ids, - nonstd::span counts, - nonstd::span chrom_bin_offsets); }; } // namespace hictk::balancing -#include "../../../ice_impl.hpp" +#include "./impl/ice_mem_impl.hpp" diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp new file mode 100644 index 00000000..a0eb98ff --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp @@ -0,0 +1,542 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/cooler/cooler.hpp" +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +template +inline ICE::ICE(const File& f, Type type, const Params& params) + : _chrom_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { + if (params.tmpfile.empty()) { + balance_in_memory(f, type, params.tol, params.max_iters, params.num_masked_diags, + params.min_nnz, params.min_count, params.mad_max); + } else { + balance_chunked(f, type, params.tol, params.max_iters, params.num_masked_diags, params.min_nnz, + params.min_count, params.mad_max, params.tmpfile, params.chunk_size); + } +} + +template +inline void ICE::balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, + std::size_t min_count, double mad_max) { + auto matrix = construct_sparse_matrix(f, type, num_masked_diags); + + initialize_biases(matrix.view(), _biases, _chrom_offsets, min_nnz, min_count, mad_max); + + switch (type) { + case Type::gw: + balance_gw(matrix.view(), max_iters, tol); + break; + case Type::cis: + balance_cis(matrix, f.bins(), max_iters, tol); + break; + case Type::trans: + matrix = construct_sparse_matrix_trans(f, num_masked_diags); + 
balance_trans(matrix.view(), f.bins(), max_iters, tol); + } +} + +template +inline void ICE::balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, + std::size_t num_masked_diags, std::size_t min_nnz, + std::size_t min_count, double mad_max, + const std::filesystem::path& tmpfile, std::size_t chunk_size) { + auto matrix = construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); + + initialize_biases(matrix.view(), _biases, _chrom_offsets, min_nnz, min_count, mad_max); + + switch (type) { + case Type::gw: + balance_gw(matrix.view(), max_iters, tol); + break; + case Type::cis: + balance_cis(matrix, f.bins(), max_iters, tol); + break; + case Type::trans: + balance_trans(construct_sparse_matrix_trans(f, num_masked_diags).view(), f.bins(), max_iters, + tol); + } +} + +template +inline void ICE::balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol) { + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix, _biases); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; + } + } +} + +template +inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, + double tol) { + _variance.resize(1, 0); + _scale.resize(1, std::numeric_limits::quiet_NaN()); + const auto weights = compute_weights_from_chromosome_sizes(bins, _chrom_offsets); + + for (std::size_t i = 0; i < max_iters; ++i) { + const auto res = inner_loop(matrix, _biases, weights); + SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); + _variance[0] = res.variance; + _scale[0] = res.scale; + if (res.variance < tol) { + return; + } + } +} + +template +inline void ICE::balance_cis(const MatrixT& matrix, [[maybe_unused]] const BinTable& bins, + std::size_t max_iters, double tol) { + 
_variance.resize(_chrom_offsets.size() - 1, 0); + _scale.resize(_chrom_offsets.size() - 1, std::numeric_limits::quiet_NaN()); + + std::vector margs(_biases.size()); + + for (std::uint32_t chrom_id = 0; chrom_id < _chrom_offsets.size() - 1; ++chrom_id) { + const auto cis_matrix = matrix.subset(chrom_id); + + const auto j0 = _chrom_offsets[chrom_id]; + const auto j1 = _chrom_offsets[chrom_id + 1]; + + auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); + + for (std::size_t k = 0; k < max_iters; ++k) { + const auto res = inner_loop(cis_matrix, biases_); + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), bins.chromosomes().at(chrom_id).name(), + k + 1, res.variance); + _variance[chrom_id] = res.variance; + _scale[chrom_id] = res.scale; + + if (res.variance < tol) { + break; + } + } + } +} + +template +auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags) + -> SparseMatrix { + switch (type) { + case Type::cis: + return construct_sparse_matrix_cis(f, num_masked_diags); + case Type::trans: + [[fallthrough]]; + case Type::gw: + return construct_sparse_matrix_gw(f, num_masked_diags); + } +} + +template +inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m(f.bins()); + + const auto sel = f.fetch(); + std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m(f.bins()); + + for (const Chromosome& chrom : f.chromosomes()) { + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + 
}); + } + m.finalize(); + + return m; +} + +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_trans(const File& f, + std::size_t num_masked_diags) + -> SparseMatrix { + using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); + std::vector heads{}; + std::vector tails{}; + + for (const Chromosome& chrom1 : f.chromosomes()) { + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); + ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + + const auto sel = f.fetch(chrom1.name(), chrom2.name()); + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + } + + [[maybe_unused]] internal::PixelMerger merger{heads, tails}; + + SparseMatrix m(f.bins()); + std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { + // TODO: this filtering step is wrong when done on trans matrices, as it will + // remove the first and last few pixels from trans matrices of adjacent chromosomes. 
+ // Remove the filtering once this bug has been fixed in cooler + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.shrink_to_fit(); + + return m; +} + +template +auto ICE::construct_sparse_matrix_chunked(const File& f, Type type, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked { + switch (type) { + case Type::cis: + return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); + case Type::trans: + [[fallthrough]]; + case Type::gw: + return construct_sparse_matrix_chunked_gw(f, num_masked_diags, tmpfile, chunk_size); + } +} + +template +inline auto ICE::construct_sparse_matrix_chunked_gw(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) -> SparseMatrixChunked { + SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + + const auto sel = f.fetch(); + std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + return m; +} + +template +inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + + for (const Chromosome& chrom : f.chromosomes()) { + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + } + m.finalize(); + return m; +} + +template +inline auto ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + using 
PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); + std::vector heads{}; + std::vector tails{}; + + for (const Chromosome& chrom1 : f.chromosomes()) { + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1.id() + 1; chrom2_id < f.chromosomes().size(); + ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + + const auto sel = f.fetch(chrom1.name(), chrom2.name()); + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + } + + internal::PixelMerger merger{heads, tails}; + + SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { + // TODO: this filtering step is wrong when done on trans matrices, as it will + // remove the first and last few pixels from trans matrices of adjacent chromosomes. + // Remove the filtering once this bug has been fixed in cooler + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + }); + + m.finalize(); + return m; +} + +template +inline void ICE::min_nnz_filtering(const MatrixT& matrix, nonstd::span biases, + std::size_t min_nnz) { + const auto& marg = matrix.marginalize_nnz(); + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg[i] < static_cast(min_nnz)) { + biases[i] = 0; + } + } +} + +inline void ICE::min_count_filtering(nonstd::span biases, std::size_t min_count, + nonstd::span marg) { + for (std::size_t i = 0; i < biases.size(); ++i) { + if (marg[i] < static_cast(min_count)) { + biases[i] = 0; + } + } +} + +inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, + nonstd::span biases, nonstd::span marg, + double mad_max) { + auto median = [](auto v) { + assert(!v.empty()); + + const auto size = static_cast(v.size()); + auto first = v.begin(); + auto mid = first + (size / 2); + auto last = v.end(); + + std::nth_element(first, mid, last); + + if (size % 2 != 0) { + return 
*mid; + } + + const auto n1 = *mid; + std::nth_element(first, --mid, last); + const auto n2 = *mid; + + return (n1 + n2) / 2; + }; + + auto mad = [&](const auto vin) { + const auto median_ = median(vin); + auto vout = vin; + + std::transform(vout.begin(), vout.end(), vout.begin(), + [&](const auto n) { return std::abs(n - median_); }); + + return median(vout); + }; + + assert(chrom_offsets.size() > 1); + std::vector cmarg{}; + for (std::size_t i = 1; i < chrom_offsets.size(); ++i) { + const auto i0 = static_cast(chrom_offsets[i - 1] - chrom_offsets.front()); + const auto i1 = static_cast(chrom_offsets[i] - chrom_offsets.front()); + + cmarg.clear(); + std::copy_if(marg.begin() + i0, marg.begin() + i1, std::back_inserter(cmarg), + [](const auto n) { return n > 0; }); + + if (!cmarg.empty()) { + const auto median_ = median(cmarg); + std::transform(marg.begin() + i0, marg.begin() + i1, marg.begin() + i0, + [&](const auto n) { return n / median_; }); + } + } + + std::vector log_nz_marg{}; + for (const auto n : marg) { + if (n > 0) { + log_nz_marg.push_back(std::log(n)); + } + } + + if (log_nz_marg.empty()) { + return; + } + + const auto median_log_nz_marg = median(log_nz_marg); + const auto dev_log_nz_marg = mad(log_nz_marg); + + const auto cutoff = std::exp(median_log_nz_marg - mad_max * dev_log_nz_marg); + + for (std::size_t i = 0; i < marg.size(); ++i) { + if (marg[i] < cutoff) { + biases[i] = 0.0; + } + } +} + +template +inline auto ICE::inner_loop(const MatrixT& matrix, nonstd::span biases, + nonstd::span weights) -> Result { + if (matrix.empty()) { + std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); + return {std::numeric_limits::quiet_NaN(), 0.0}; + } + const auto& marg = matrix.times_outer_product_marg(biases, weights); + + double marg_sum = 0.0; + std::size_t nnz_marg{}; + for (const auto& n : marg) { + marg_sum += n; + nnz_marg += n != 0; + } + + if (nnz_marg == 0) { + std::fill(biases.begin(), biases.end(), 
std::numeric_limits::quiet_NaN()); + return {std::numeric_limits::quiet_NaN(), 0.0}; + } + + const auto avg_nzmarg = (marg_sum / static_cast(nnz_marg)); + for (std::size_t i = 0; i < biases.size(); ++i) { + const auto n = marg[i] / avg_nzmarg; + if (n != 0) { + biases[i] /= n; + } + } + + double ssq_nzmarg = 0.0; + for (const auto n : marg) { + if (n != 0) { + ssq_nzmarg += std::pow(n - avg_nzmarg, 2); + } + } + const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); + + return {avg_nzmarg, var_nzmarg}; +} + +template +inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span biases, + nonstd::span chrom_bin_offsets, + std::size_t min_nnz, std::size_t min_count, double mad_max) { + if (min_nnz != 0) { + min_nnz_filtering(matrix, biases, min_nnz); + } + + if (min_count != 0 || mad_max != 0) { + matrix.marginalize(); + } + if (min_count != 0) { + min_count_filtering(biases, min_count, matrix.margs()); + } + + if (mad_max != 0) { + auto margs = std::vector{matrix.margs()}; + mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); + } +} + +inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { + std::vector buff{0}; + for (const Chromosome& chrom : bins.chromosomes()) { + const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); + buff.push_back(buff.back() + nbins); + } + + return buff; +} + +inline std::vector ICE::compute_weights_from_chromosome_sizes( + const BinTable& bins, nonstd::span chrom_bin_offsets) { + std::vector weights(bins.size()); + for (std::uint32_t i = 1; i < chrom_bin_offsets.size(); ++i) { + const auto& chrom = bins.chromosomes().at(i - 1); + if (chrom.is_all()) { + continue; + } + const auto i0 = chrom_bin_offsets[i - 1]; + const auto i1 = chrom_bin_offsets[i]; + + const auto nbins = static_cast(bins.size()); + const auto cnbins = + std::ceil(static_cast(chrom.size()) / static_cast(bins.bin_size())); + + for (std::size_t j = i0; j < i1; ++j) { + weights[j] = 1.0 / (1.0 - cnbins / 
nbins); + } + } + return weights; +} + +inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { + std::vector biases(_biases.size()); + if (!rescale) { + return biases; + } + + if (_scale.size() == 1) { + const auto scale = std::sqrt(_scale[0]); + std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } else { + for (std::size_t i = 1; i < _chrom_offsets.size(); ++i) { + const auto i0 = static_cast(_chrom_offsets[i - 1]); + const auto i1 = static_cast(_chrom_offsets[i]); + const auto scale = std::sqrt(_scale[i - 1]); + std::transform(_biases.begin() + i0, _biases.begin() + i1, biases.begin() + i0, + [&](const auto n) { + return n == 0 ? std::numeric_limits::quiet_NaN() : n / scale; + }); + } + } + return biases; +} + +inline std::vector ICE::scale() const noexcept { return _scale; } +inline std::vector ICE::variance() const noexcept { return _variance; } + +} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp new file mode 100644 index 00000000..382c8314 --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -0,0 +1,437 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "hictk/cooler/cooler.hpp" +#include "hictk/pixel.hpp" +#include "hictk/type_traits.hpp" + +namespace hictk::balancing { + +inline SparseMatrix::SparseMatrix(const hictk::BinTable& bins, std::uint32_t chrom_id) + : _chrom_id(chrom_id == _gw_id ? 0 : chrom_id), + _marg(chrom_id == _gw_id ? 
bins.size() : bins.subset(chrom_id).size()) { + _chrom_offsets.push_back(0); + for (const Chromosome& chrom : bins.chromosomes()) { + const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); + _chrom_offsets.push_back(_chrom_offsets.back() + nbins); + } + _bin1_offsets.resize(_chrom_offsets.size(), 0); +} + +inline bool SparseMatrix::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } + +inline void SparseMatrix::clear() noexcept { + _bin1_ids.clear(); + _bin2_ids.clear(); + _counts.clear(); +} + +inline void SparseMatrix::shrink_to_fit() noexcept { + _bin1_ids.shrink_to_fit(); + _bin2_ids.shrink_to_fit(); + _counts.shrink_to_fit(); + _chrom_offsets.shrink_to_fit(); +} + +inline void SparseMatrix::finalize() { + shrink_to_fit(); + for (std::size_t i = 1; i < _bin1_offsets.size(); ++i) { + if (_bin1_offsets[i] == 0) { + _bin1_offsets[i] = _bin1_offsets[i - 1]; + } + } +} + +inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } +inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } +inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } +inline const std::vector& SparseMatrix::margs() const noexcept { return _marg; } +inline const std::vector& SparseMatrix::chrom_offsets() const noexcept { + return _chrom_offsets; +} + +inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { + if (!empty() && bin1_id >= _chrom_offsets[_chrom_id + 1]) { + _chrom_id = static_cast( + std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - + _chrom_offsets.begin()); + _bin1_offsets[_chrom_id] = size(); + } + + _bin1_ids.push_back(bin1_id); + _bin2_ids.push_back(bin2_id); + _counts.push_back(count); +} + +inline SparseMatrixView SparseMatrix::subset(std::uint32_t chrom_id) const { + assert(chrom_id + 1 < chrom_offsets().size()); + 
const auto i0 = _bin1_offsets[chrom_id]; + const auto i1 = _bin1_offsets[chrom_id + 1]; + + const auto bin1_ids_ = nonstd::span(bin1_ids()).subspan(i0, i1 - i0); + const auto bin2_ids_ = nonstd::span(bin2_ids()).subspan(i0, i1 - i0); + const auto counts_ = nonstd::span(counts()).subspan(i0, i1 - i0); + + const auto j0 = chrom_offsets()[chrom_id]; + const auto j1 = chrom_offsets()[chrom_id + 1]; + + return {bin1_ids_, bin2_ids_, counts_, j0, j1 - j0}; +} + +inline SparseMatrixView SparseMatrix::view() const { + const auto bin1_ids_ = nonstd::span(bin1_ids()); + const auto bin2_ids_ = nonstd::span(bin2_ids()); + const auto counts_ = nonstd::span(counts()); + + return {bin1_ids_, bin2_ids_, counts_, 0, _marg.size()}; +} + +void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { + const auto size_ = size(); + fs.write(reinterpret_cast(&size_), sizeof(std::size_t)); + + const auto tmpbuff_size = ZSTD_compressBound(size() * sizeof(std::uint64_t)); + std::string tmpbuff(tmpbuff_size, '\0'); + + std::size_t compressed_size = + ZSTD_compress(reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin1_ids.data()), size() * sizeof(std::uint64_t), + compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); + + compressed_size = + ZSTD_compress(reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin2_ids.data()), size() * sizeof(std::uint64_t), + compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); + + compressed_size = ZSTD_compress( + reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), + 
reinterpret_cast(_counts.data()), size() * sizeof(double), compression_lvl); + if (ZSTD_isError(compressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(compressed_size)); + } + + fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + fs.write(tmpbuff.data(), static_cast(compressed_size)); +} + +void SparseMatrix::deserialize(std::fstream& fs) { + std::size_t size{}; + fs.read(reinterpret_cast(&size), sizeof(std::size_t)); + + _bin1_ids.resize(size); + _bin2_ids.resize(size); + _counts.resize(size); + + std::string tmpbuff{}; + std::size_t compressed_size{}; + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + std::size_t decompressed_size = + ZSTD_decompress(reinterpret_cast(_bin1_ids.data()), size * sizeof(std::uint64_t), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } + + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + decompressed_size = + ZSTD_decompress(reinterpret_cast(_bin2_ids.data()), size * sizeof(std::uint64_t), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } + + fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); + tmpbuff.resize(compressed_size); + fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); + decompressed_size = + ZSTD_decompress(reinterpret_cast(_counts.data()), size * sizeof(double), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); + if (ZSTD_isError(decompressed_size)) { + throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); + } +} + +inline SparseMatrixChunked::SparseMatrixChunked(const BinTable& bins, + 
std::filesystem::path tmp_file, + std::size_t chunk_size, int compression_lvl) + : _matrix(bins), + _path(std::move(tmp_file)), + _marg(bins.size()), + _chrom_offsets(_matrix.chrom_offsets()), + _bin1_offsets(_chrom_offsets.size(), 0), + _chunk_size(chunk_size), + _compression_lvl(compression_lvl) { + _fs.exceptions(std::ios::badbit); + _fs.open(_path, std::ios::out); + + _chrom_index.emplace(0, std::make_pair(std::size_t{}, std::size_t{})); +} + +inline SparseMatrixChunked::~SparseMatrixChunked() noexcept { + try { + if (!_path.empty() && std::filesystem::exists(_path)) { + std::filesystem::remove(_path); + } + } catch (...) { + } +} + +inline bool SparseMatrixChunked::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrixChunked::size() const noexcept { return _size; } + +inline const std::vector& SparseMatrixChunked::margs() const noexcept { return _marg; } +inline const std::vector& SparseMatrixChunked::chrom_offsets() const noexcept { + return _chrom_offsets; +} + +inline void SparseMatrixChunked::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, + double count) { + const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; + + if (beginning_of_new_chromosome) { + finalize_chromosome(_chrom_id); + _chrom_id = static_cast( + std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _matrix.bin1_ids().back()) - + _chrom_offsets.begin()); + } + + if (_matrix.size() == _chunk_size || beginning_of_new_chromosome) { + write_chunk(); + } + + _matrix.push_back(bin1_id, bin2_id, count); + ++_size; +} + +inline void SparseMatrixChunked::finalize() { + if (!_matrix.empty()) { + write_chunk(); + } + _fs.close(); + _fs.open(_path, std::ios::in); +} + +inline void SparseMatrixChunked::finalize_chromosome(std::uint32_t chrom_id) { + _bin1_offsets[chrom_id + 1] = size(); + auto [it, inserted] = + _chrom_index.try_emplace(chrom_id, std::make_pair(_index.size(), _index.size())); + if (!inserted) { + it->second.second = 
_index.size() + 1; + } +} + +inline SparseMatrixChunkedView SparseMatrixChunked::view() const { + return {_path, _index, 0, _marg.size()}; +} + +inline SparseMatrixChunkedView SparseMatrixChunked::subset(std::uint32_t chrom_id) const { + auto it = _chrom_index.find(chrom_id); + if (it == _chrom_index.end()) { + return {}; + } + const auto& [first_offset, last_offset] = it->second; + const auto i0 = chrom_offsets()[chrom_id]; + const auto i1 = chrom_offsets()[chrom_id + 1]; + + return {_path, nonstd::span(_index).subspan(first_offset, last_offset - first_offset), i0, + i1 - i0}; +} + +inline void SparseMatrixChunked::read_chunk(std::size_t chunk_id, SparseMatrix& buffer) { + assert(chunk_id < _index.size()); + const auto offset = _index[chunk_id]; + + std::fstream fs; + _fs.exceptions(std::ios::badbit); + fs.open(_path, std::ios::in); + fs.seekg(offset); + + buffer.deserialize(fs); +} + +inline void SparseMatrixChunked::write_chunk() { + assert(!_matrix.empty()); + _index.push_back(_fs.tellg()); + _matrix.serialize(_fs, _compression_lvl); + _matrix.clear(); + _chrom_index.try_emplace(_chrom_id, std::make_pair(_index.size(), _index.size())); +} + +inline SparseMatrixView::SparseMatrixView(nonstd::span bin1_ids_, + nonstd::span bin2_ids_, + nonstd::span counts_, + std::size_t bin1_offset, std::size_t num_bins) + : _marg(num_bins), + _bin1_offset(bin1_offset), + bin1_ids(bin1_ids_), + bin2_ids(bin2_ids_), + counts(counts_) {} + +inline bool SparseMatrixView::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrixView::size() const noexcept { return counts.size(); } + +inline const std::vector& SparseMatrixView::margs() const noexcept { return _marg; } + +inline const std::vector& SparseMatrixView::marginalize() const { + std::fill(_marg.begin(), _marg.end(), 0); + for (std::size_t i = 0; i < size(); ++i) { + const auto i1 = bin1_ids[i] - _bin1_offset; + const auto i2 = bin2_ids[i] - _bin1_offset; + + _marg[i1] += counts[i]; + _marg[i2] += 
counts[i]; + } + + return _marg; +} + +inline const std::vector& SparseMatrixView::marginalize_nnz() const { + std::fill(_marg.begin(), _marg.end(), 0); + + for (std::size_t i = 0; i < counts.size(); ++i) { + const auto i1 = bin1_ids[i] - _bin1_offset; + const auto i2 = bin2_ids[i] - _bin1_offset; + + _marg[i1] += counts[i] != 0; + _marg[i2] += counts[i] != 0; + } + + return _marg; +} + +inline const std::vector& SparseMatrixView::times_outer_product_marg( + nonstd::span biases, nonstd::span weights) const { + assert(biases.size() == _marg.size()); + assert(biases.size() == weights.size() || weights.empty()); + + std::fill(_marg.begin(), _marg.end(), 0); + for (std::size_t i = 0; i < size(); ++i) { + const auto i1 = bin1_ids[i] - _bin1_offset; + const auto i2 = bin2_ids[i] - _bin1_offset; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 1 : weights[i2]; + const auto count = counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); + + _marg[i1] += count; + _marg[i2] += count; + } + return _marg; +} + +inline SparseMatrixChunkedView::SparseMatrixChunkedView(const std::filesystem::path& path, + nonstd::span index, + std::size_t bin1_offset, + std::size_t num_bins) + : _fs(path, std::ios::in), + _index(index.begin(), index.end()), + _marg(num_bins), + _bin1_offset(bin1_offset) {} + +inline bool SparseMatrixChunkedView::empty() const noexcept { return _index.empty(); } + +inline const std::vector& SparseMatrixChunkedView::margs() const noexcept { return _marg; } + +inline const std::vector& SparseMatrixChunkedView::marginalize() const { + std::fill(_marg.begin(), _marg.end(), 0); + + for (const auto offset : _index) { + _fs.seekg(offset); + _matrix.deserialize(_fs); + + for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { + const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; + const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; + + _marg[i1] += _matrix.counts()[i]; + _marg[i2] += _matrix.counts()[i]; + } + if (_fs.peek() && 
_fs.eof()) { + break; + } + } + + return _marg; +} + +inline const std::vector& SparseMatrixChunkedView::marginalize_nnz() const { + std::fill(_marg.begin(), _marg.end(), 0); + + for (const auto offset : _index) { + _fs.seekg(offset); + _matrix.deserialize(_fs); + + for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { + const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; + const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; + + _marg[i1] += _matrix.counts()[i] != 0; + _marg[i2] += _matrix.counts()[i] != 0; + } + if (_fs.peek() && _fs.eof()) { + break; + } + } + + return _marg; +} + +inline const std::vector& SparseMatrixChunkedView::times_outer_product_marg( + nonstd::span biases, nonstd::span weights) const { + assert(biases.size() == _marg.size()); + assert(biases.size() == weights.size() || weights.empty()); + + std::fill(_marg.begin(), _marg.end(), 0); + + for (const auto offset : _index) { + _fs.seekg(offset); + _matrix.deserialize(_fs); + + for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { + const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; + const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 
1 : weights[i2]; + const auto count = _matrix.counts()[i] * (w1 * biases[i1]) * (w2 * biases[i2]); + + _marg[i1] += count; + _marg[i2] += count; + } + if (_fs.peek() && _fs.eof()) { + break; + } + } + return _marg; +} + +} // namespace hictk::balancing diff --git a/src/libhictk/balancing/vc_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp similarity index 100% rename from src/libhictk/balancing/vc_impl.hpp rename to src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp new file mode 100644 index 00000000..fc042eb8 --- /dev/null +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -0,0 +1,161 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include + +#include "hictk/bin_table.hpp" + +namespace hictk::balancing { + +class SparseMatrixView; +class SparseMatrix { + std::vector _bin1_ids{}; + std::vector _bin2_ids{}; + std::vector _counts{}; + + std::uint32_t _chrom_id{}; // ID of the chromosome that is being procesed + std::vector _chrom_offsets{}; + std::vector _bin1_offsets{}; + mutable std::vector _marg{}; + + static constexpr auto _gw_id = std::numeric_limits::max(); + + public: + SparseMatrix() = default; + explicit SparseMatrix(const BinTable& bins, std::uint32_t chrom_id = _gw_id); + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + void clear() noexcept; + void shrink_to_fit() noexcept; + void finalize(); + + [[nodiscard]] const std::vector& bin1_ids() const noexcept; + [[nodiscard]] const std::vector& bin2_ids() const noexcept; + [[nodiscard]] const std::vector& counts() const noexcept; + [[nodiscard]] const std::vector& margs() const noexcept; + [[nodiscard]] const std::vector& chrom_offsets() const noexcept; + + void 
push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count); + + [[nodiscard]] SparseMatrixView subset(std::uint32_t chrom_id) const; + [[nodiscard]] SparseMatrixView view() const; + + void serialize(std::fstream& fs, int compression_lvl = 3) const; + void deserialize(std::fstream& fs); +}; + +class SparseMatrixChunkedView; +class SparseMatrixChunked { + mutable SparseMatrix _matrix{}; + mutable std::string _buff{}; + std::filesystem::path _path{}; + mutable std::fstream _fs{}; + + std::vector _index{}; + + // chrom_id, + phmap::flat_hash_map> _chrom_index{}; + std::uint32_t _chrom_id{}; // id of the chromosome that is currently being processed; + std::size_t _size{}; + + mutable std::vector _marg{}; + std::vector _chrom_offsets{}; + std::vector _bin1_offsets{}; + + std::size_t _chunk_size{}; + int _compression_lvl{}; + + public: + SparseMatrixChunked() = default; + SparseMatrixChunked(const BinTable& bins, std::filesystem::path tmp_file, std::size_t chunk_size, + int compression_lvl = 3); + + SparseMatrixChunked(const SparseMatrixChunked& other) = delete; + SparseMatrixChunked(SparseMatrixChunked&& other) noexcept = default; + + ~SparseMatrixChunked() noexcept; + + SparseMatrixChunked& operator=(const SparseMatrixChunked& other) = delete; + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept = default; + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + + [[nodiscard]] const std::vector& margs() const noexcept; + [[nodiscard]] const std::vector& chrom_offsets() const noexcept; + + void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count); + void finalize(); + void finalize_chromosome(std::uint32_t chrom_id); + + [[nodiscard]] SparseMatrixChunkedView subset(std::uint32_t chrom_id) const; + [[nodiscard]] SparseMatrixChunkedView view() const; + + void read_chunk(std::size_t chunk_id, SparseMatrix& buffer); + + private: + void write_chunk(); +}; + +class SparseMatrixView { + 
mutable std::vector _marg{}; + std::size_t _bin1_offset{}; + + public: + nonstd::span bin1_ids{}; // NOLINT + nonstd::span bin2_ids{}; // NOLINT + nonstd::span counts{}; // NOLINT + + SparseMatrixView() = default; + SparseMatrixView(nonstd::span bin1_ids_, + nonstd::span bin2_ids_, nonstd::span counts_, + std::size_t bin1_offset, std::size_t num_bins); + + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; + + [[nodiscard]] const std::vector& margs() const noexcept; + + const std::vector& marginalize() const; + const std::vector& marginalize_nnz() const; + const std::vector& times_outer_product_marg(nonstd::span biases, + nonstd::span weights) const; +}; + +class SparseMatrixChunkedView { + mutable SparseMatrix _matrix{}; + mutable std::string _buff{}; + mutable std::fstream _fs{}; + + std::vector _index{}; + + mutable std::vector _marg{}; + std::size_t _bin1_offset{}; + + public: + SparseMatrixChunkedView() = default; + SparseMatrixChunkedView(const std::filesystem::path& path, + nonstd::span index, std::size_t bin1_offset, + std::size_t num_bins); + + [[nodiscard]] bool empty() const noexcept; + + [[nodiscard]] const std::vector& margs() const noexcept; + + const std::vector& marginalize() const; + const std::vector& marginalize_nnz() const; + const std::vector& times_outer_product_marg(nonstd::span biases, + nonstd::span weights) const; +}; + +} // namespace hictk::balancing + +#include "./impl/sparse_matrix_impl.hpp" diff --git a/src/libhictk/balancing/include/hictk/balancing/vc.hpp b/src/libhictk/balancing/include/hictk/balancing/vc.hpp index 2f37def5..5a6affc1 100644 --- a/src/libhictk/balancing/include/hictk/balancing/vc.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/vc.hpp @@ -24,4 +24,4 @@ class VC { }; } // namespace hictk::balancing -#include "../../../vc_impl.hpp" +#include "./impl/vc_impl.hpp" diff --git a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp 
b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp index 607a2f1e..f40579ee 100644 --- a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp +++ b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp @@ -401,6 +401,15 @@ inline PixelMerger::PixelMerger(std::vector heads, std::vector } } } +template +inline auto PixelMerger::begin() -> iterator { + return iterator{*this}; +} + +template +inline auto PixelMerger::end() const noexcept -> iterator { + return {}; +} template inline void PixelMerger::replace_top_node(std::size_t i) { @@ -429,8 +438,49 @@ inline auto PixelMerger::next() -> ThinPixel { current_node.pixel.count += next_node.pixel.count; replace_top_node(next_node.i); } + ++_i; return current_node.pixel; } + +template +inline PixelMerger::iterator::iterator(PixelMerger &merger) + : _merger(&merger), _value(merger.next()) {} + +template +inline bool PixelMerger::iterator::operator==(const iterator &other) const noexcept { + if (!_merger || !other._merger) { + return _merger == other._merger; + } + + return _merger == other._merger && _merger->_i == other._merger->_i; +} + +template +inline bool PixelMerger::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +inline auto PixelMerger::iterator::operator*() const noexcept -> const ThinPixel & { + return _value; +} + +template +inline auto PixelMerger::iterator::operator->() const noexcept -> const ThinPixel * { + return &(**this); +} + +template +inline auto PixelMerger::iterator::operator++() -> iterator & { + assert(!!_merger); + _value = _merger->next(); + if (!_value) { + _merger = nullptr; + } + + return *this; +} + } // namespace internal } // namespace hictk diff --git a/src/libhictk/pixel/include/hictk/pixel.hpp b/src/libhictk/pixel/include/hictk/pixel.hpp index 167ebf5b..c4c596b7 100644 --- a/src/libhictk/pixel/include/hictk/pixel.hpp +++ b/src/libhictk/pixel/include/hictk/pixel.hpp @@ -114,12 +114,35 @@ class PixelMerger { std::vector _heads{}; 
std::vector _tails{}; + std::size_t _i{}; public: + class iterator; + PixelMerger() = delete; PixelMerger(std::vector head, std::vector tail); + + auto begin() -> iterator; + auto end() const noexcept -> iterator; [[nodiscard]] auto next() -> ThinPixel; + class iterator { + PixelMerger *_merger{}; + ThinPixel _value{}; + + public: + iterator() = default; + explicit iterator(PixelMerger &merger); + + [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] bool operator!=(const iterator &other) const noexcept; + + auto operator*() const noexcept -> const ThinPixel &; + auto operator->() const noexcept -> const ThinPixel *; + + [[nodiscard]] auto operator++() -> iterator &; + }; + private: void replace_top_node(std::size_t i); }; diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 37ae70d2..c13e9600 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -12,6 +12,7 @@ #include "hictk/balancing/vc.hpp" #include "hictk/cooler.hpp" #include "hictk/hic.hpp" +#include "tmpdir.hpp" namespace hictk::test { inline const std::filesystem::path datadir{"test/data/"}; // NOLINT(cert-err58-cpp) @@ -84,37 +85,79 @@ TEST_CASE("Balancing: VC", "[balancing][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Balancing: ICE", "[balancing][short]") { +TEST_CASE("Balancing: ICE", "[balancing][long]") { const auto path = datadir / "cooler/ENCFF993FGR.2500000.cool"; + const auto tmpfile = testdir() / "balancing_ice.tmp"; auto clr = hictk::cooler::File(path.string()); - SECTION("INTRA") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + SECTION("in-memory") { + SECTION("INTRA") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; - constexpr auto type = hictk::balancing::ICE::Type::cis; - const auto weights = hictk::balancing::ICE(clr, 
type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } - SECTION("INTER") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + compare_weights(weights, expected_weights); + } - constexpr auto type = hictk::balancing::ICE::Type::trans; - const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + SECTION("INTER") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } - compare_weights(weights, expected_weights); + SECTION("GW") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(clr, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } } - SECTION("GW") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; - constexpr auto type = hictk::balancing::ICE::Type::gw; - const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + + SECTION("INTRA") { + const auto path_intra_weights = datadir / 
"cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; - compare_weights(weights, expected_weights); + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } + + SECTION("INTER") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } + + SECTION("GW") { + const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); + + compare_weights(weights, expected_weights); + } } } diff --git a/test/units/include/tmpdir.hpp b/test/units/include/tmpdir.hpp index 7beddef5..939951f2 100644 --- a/test/units/include/tmpdir.hpp +++ b/test/units/include/tmpdir.hpp @@ -22,6 +22,7 @@ inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert- } // namespace cooler::test::attribute namespace cooler::test::balancing { +inline const auto& testdir = hictk::test::testdir; inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace cooler::test::balancing From d39230c6f6a08bb2567a3604b8488d1e2a70dc0d Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 22 Sep 2023 12:07:38 +0200 Subject: [PATCH 08/33] Bugfix --- src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 931453e9..c8b04a1c 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -105,7 +105,8 @@ inline std::size_t BinTable::size() const noexcept { if (_num_bins_prefix_sum.empty()) { return 0; } - return static_cast(_num_bins_prefix_sum.back() - _num_bins_prefix_sum.front()); + return conditional_static_cast(_num_bins_prefix_sum.back() - + _num_bins_prefix_sum.front()); } inline bool BinTable::empty() const noexcept { return size() == 0; } From 5c7840c1f6038dd814c02388493f06c9935532f2 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 22 Sep 2023 19:08:45 +0200 Subject: [PATCH 09/33] Bugfix --- .../balancing/include/hictk/balancing/ice.hpp | 2 +- .../impl/{ice_mem_impl.hpp => ice_impl.hpp} | 39 +++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) rename src/libhictk/balancing/include/hictk/balancing/impl/{ice_mem_impl.hpp => ice_impl.hpp} (94%) diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 2729aac3..44005f6f 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -133,4 +133,4 @@ class ICE { } // namespace hictk::balancing -#include "./impl/ice_mem_impl.hpp" +#include "./impl/ice_impl.hpp" diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp similarity index 94% rename from src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp rename to src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index a0eb98ff..f1be680b 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_mem_impl.hpp +++ 
b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -70,8 +70,9 @@ inline void ICE::balance_chunked(const File& f, Type type, double tol, std::size balance_cis(matrix, f.bins(), max_iters, tol); break; case Type::trans: - balance_trans(construct_sparse_matrix_trans(f, num_masked_diags).view(), f.bins(), max_iters, - tol); + balance_trans( + construct_sparse_matrix_chunked_trans(f, num_masked_diags, tmpfile, chunk_size).view(), + f.bins(), max_iters, tol); } } @@ -193,10 +194,10 @@ template [[nodiscard]] inline auto ICE::construct_sparse_matrix_trans(const File& f, std::size_t num_masked_diags) -> SparseMatrix { + using SelectorT = decltype(f.fetch("chr1", "chr2")); using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); - std::vector heads{}; - std::vector tails{}; + std::vector selectors{}; for (const Chromosome& chrom1 : f.chromosomes()) { if (chrom1.is_all()) { continue; @@ -208,13 +209,24 @@ template continue; } - const auto sel = f.fetch(chrom1.name(), chrom2.name()); + selectors.emplace_back(f.fetch(chrom1.name(), chrom2.name())); + } + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { heads.emplace_back(sel.template begin()); tails.emplace_back(sel.template end()); } } - [[maybe_unused]] internal::PixelMerger merger{heads, tails}; + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + + internal::PixelMerger merger{heads, tails}; SparseMatrix m(f.bins()); std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { @@ -287,10 +299,10 @@ inline auto ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_ const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { + using SelectorT = decltype(f.fetch("chr1", "chr2")); using PixelIt = decltype(f.fetch("chr1", "chr2").template begin()); - std::vector heads{}; - std::vector tails{}; + 
std::vector selectors{}; for (const Chromosome& chrom1 : f.chromosomes()) { if (chrom1.is_all()) { continue; @@ -302,12 +314,17 @@ inline auto ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_ continue; } - const auto sel = f.fetch(chrom1.name(), chrom2.name()); - heads.emplace_back(sel.template begin()); - tails.emplace_back(sel.template end()); + selectors.emplace_back(f.fetch(chrom1.name(), chrom2.name())); } } + std::vector heads{}; + std::vector tails{}; + for (const auto& sel : selectors) { + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } + internal::PixelMerger merger{heads, tails}; SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); From de16e6c8611a3f0caaa1c33c64a007668ed4ebac Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 23 Sep 2023 16:24:04 +0200 Subject: [PATCH 10/33] Bugfix --- .../balancing/include/hictk/balancing/ice.hpp | 2 - .../include/hictk/balancing/impl/ice_impl.hpp | 50 +++++---- .../balancing/impl/sparse_matrix_impl.hpp | 19 ++-- .../include/hictk/balancing/impl/vc_impl.hpp | 101 ------------------ .../balancing/include/hictk/balancing/vc.hpp | 27 ----- 5 files changed, 34 insertions(+), 165 deletions(-) delete mode 100644 src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp delete mode 100644 src/libhictk/balancing/include/hictk/balancing/vc.hpp diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 44005f6f..ce08b580 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -125,8 +125,6 @@ class ICE { nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max); - [[nodiscard]] static std::vector read_chrom_bin_offsets(const BinTable& bins); - [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( const BinTable& 
bins, nonstd::span chrom_bin_offsets); }; diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index f1be680b..ff806751 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -22,7 +22,7 @@ namespace hictk::balancing { template inline ICE::ICE(const File& f, Type type, const Params& params) - : _chrom_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { + : _chrom_offsets(f.bins().num_bin_prefix_sum()), _biases(f.bins().size(), 1.0) { if (params.tmpfile.empty()) { balance_in_memory(f, type, params.tol, params.max_iters, params.num_masked_diags, params.min_nnz, params.min_count, params.mad_max); @@ -111,27 +111,29 @@ inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std: } template -inline void ICE::balance_cis(const MatrixT& matrix, [[maybe_unused]] const BinTable& bins, - std::size_t max_iters, double tol) { - _variance.resize(_chrom_offsets.size() - 1, 0); - _scale.resize(_chrom_offsets.size() - 1, std::numeric_limits::quiet_NaN()); +inline void ICE::balance_cis(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, + double tol) { + _variance.resize(bins.chromosomes().size(), 0); + _scale.resize(bins.chromosomes().size(), std::numeric_limits::quiet_NaN()); std::vector margs(_biases.size()); - for (std::uint32_t chrom_id = 0; chrom_id < _chrom_offsets.size() - 1; ++chrom_id) { - const auto cis_matrix = matrix.subset(chrom_id); + for (const auto& chrom : bins.chromosomes()) { + if (chrom.is_all()) { + continue; + } + const auto cis_matrix = matrix.subset(chrom.id()); - const auto j0 = _chrom_offsets[chrom_id]; - const auto j1 = _chrom_offsets[chrom_id + 1]; + const auto j0 = _chrom_offsets[chrom.id()]; + const auto j1 = _chrom_offsets[chrom.id() + 1]; auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); for 
(std::size_t k = 0; k < max_iters; ++k) { const auto res = inner_loop(cis_matrix, biases_); - SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), bins.chromosomes().at(chrom_id).name(), - k + 1, res.variance); - _variance[chrom_id] = res.variance; - _scale[chrom_id] = res.scale; + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), chrom.name(), k + 1, res.variance); + _variance[chrom.id()] = res.variance; + _scale[chrom.id()] = res.scale; if (res.variance < tol) { break; @@ -177,6 +179,9 @@ template SparseMatrix m(f.bins()); for (const Chromosome& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } const auto sel = f.fetch(chrom.name()); std::for_each(sel.template begin(), sel.template end(), [&](const ThinPixel& p) { @@ -282,6 +287,9 @@ inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, std::size_t SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); for (const Chromosome& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } const auto sel = f.fetch(chrom.name()); std::for_each(sel.template begin(), sel.template end(), [&](const ThinPixel& p) { @@ -496,16 +504,6 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b } } -inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { - std::vector buff{0}; - for (const Chromosome& chrom : bins.chromosomes()) { - const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - buff.push_back(buff.back() + nbins); - } - - return buff; -} - inline std::vector ICE::compute_weights_from_chromosome_sizes( const BinTable& bins, nonstd::span chrom_bin_offsets) { std::vector weights(bins.size()); @@ -528,12 +526,12 @@ inline std::vector ICE::compute_weights_from_chromosome_sizes( return weights; } -inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { - std::vector biases(_biases.size()); +inline std::vector ICE::get_weights(bool rescale) const { if (!rescale) { - return biases; + return _biases; } + std::vector 
biases(_biases.size()); if (_scale.size() == 1) { const auto scale = std::sqrt(_scale[0]); std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 382c8314..1a663ccc 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -20,16 +20,11 @@ namespace hictk::balancing { -inline SparseMatrix::SparseMatrix(const hictk::BinTable& bins, std::uint32_t chrom_id) +inline SparseMatrix::SparseMatrix(const BinTable& bins, std::uint32_t chrom_id) : _chrom_id(chrom_id == _gw_id ? 0 : chrom_id), - _marg(chrom_id == _gw_id ? bins.size() : bins.subset(chrom_id).size()) { - _chrom_offsets.push_back(0); - for (const Chromosome& chrom : bins.chromosomes()) { - const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - _chrom_offsets.push_back(_chrom_offsets.back() + nbins); - } - _bin1_offsets.resize(_chrom_offsets.size(), 0); -} + _chrom_offsets(bins.num_bin_prefix_sum()), + _bin1_offsets(_chrom_offsets.size(), 0), + _marg(chrom_id == _gw_id ? 
bins.size() : bins.subset(chrom_id).size()) {} inline bool SparseMatrix::empty() const noexcept { return size() == 0; } inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } @@ -65,6 +60,12 @@ inline const std::vector& SparseMatrix::chrom_offsets() const noexc } inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { + if (empty()) { + _chrom_id = static_cast( + std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - + _chrom_offsets.begin()); + } + if (!empty() && bin1_id >= _chrom_offsets[_chrom_id + 1]) { _chrom_id = static_cast( std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp deleted file mode 100644 index 88b08e8a..00000000 --- a/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include - -#include "hictk/pixel.hpp" -#include "hictk/type_traits.hpp" - -namespace hictk::balancing { - -template -template -inline VC::VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, - std::size_t bin_id_offset) { - if constexpr (std::is_floating_point_v) { - _rowsum = std::vector(num_rows, 0); - _sum = 0.0; - } else { - _rowsum = std::vector(num_rows, 0); - _sum = std::int64_t(0); - } - - // Compute rowsum and matrix sum - std::visit( - [&](auto& sum) { - using T = remove_cvref_t; - auto& rowsum = std::get>(_rowsum); - std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { - if constexpr (std::is_floating_point_v) { - if (std::isnan(p.count)) { - return; - } - } - const auto bin1_id = p.bin1_id - bin_id_offset; - const auto bin2_id = p.bin2_id - bin_id_offset; - const auto count = conditional_static_cast(p.count); - - 
rowsum[bin1_id] += count; - - if (bin1_id != bin2_id) { - rowsum[bin2_id] += count; - } - }); - }, - _sum); - - std::visit( - [&](auto& sum) { - using T = remove_cvref_t; - const auto& rowsum = std::get>(_rowsum); - std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { - if constexpr (std::is_floating_point_v) { - if (std::isnan(p.count)) { - return; - } - } - const auto bin1_id = p.bin1_id - bin_id_offset; - const auto bin2_id = p.bin2_id - bin_id_offset; - - const auto rs1 = conditional_static_cast(rowsum[bin1_id]); - const auto rs2 = conditional_static_cast(rowsum[bin2_id]); - if (rs1 == 0 || rs2 == 0) { - return; - } - - const auto count = conditional_static_cast(bin1_id == bin2_id ? p.count : 2 * p.count); - sum += count; - _norm_sum += conditional_static_cast(count) / (rs1 * rs2); - }); - }, - _sum); -} - -template -inline std::vector VC::get_weights() const { - std::vector weights; - - const auto scaling_factor = std::visit( - [&](const auto& sum) { return std::sqrt(_norm_sum / conditional_static_cast(sum)); }, - _sum); - - std::visit( - [&](const auto& rowsum) { - weights.reserve(rowsum.size()); - - for (const auto rs : rowsum) { - weights.push_back(conditional_static_cast(rs) * scaling_factor); - } - }, - _rowsum); - - return weights; -} - -} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/vc.hpp b/src/libhictk/balancing/include/hictk/balancing/vc.hpp deleted file mode 100644 index 5a6affc1..00000000 --- a/src/libhictk/balancing/include/hictk/balancing/vc.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include - -namespace hictk::balancing { - -template -class VC { - std::variant, std::vector> _rowsum{}; - std::variant _sum{}; - double _norm_sum{}; - - public: - template - VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, std::size_t binid_offset = 0); - - [[nodiscard]] std::vector 
get_weights() const; -}; -} // namespace hictk::balancing - -#include "./impl/vc_impl.hpp" From 60a50f7d8c78cd74e7c4b5a6c486dee71e3f4672 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:15:07 +0200 Subject: [PATCH 11/33] Bugfix --- .../balancing/impl/sparse_matrix_impl.hpp | 73 ++++++++++++++++--- .../include/hictk/balancing/sparse_matrix.hpp | 4 + 2 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 1a663ccc..7aeda4be 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -44,6 +44,10 @@ inline void SparseMatrix::shrink_to_fit() noexcept { inline void SparseMatrix::finalize() { shrink_to_fit(); + if (_chrom_id + 1 < _bin1_offsets.size()) { + _bin1_offsets[_chrom_id + 1] = size(); + } + for (std::size_t i = 1; i < _bin1_offsets.size(); ++i) { if (_bin1_offsets[i] == 0) { _bin1_offsets[i] = _bin1_offsets[i - 1]; @@ -62,15 +66,16 @@ inline const std::vector& SparseMatrix::chrom_offsets() const noexc inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { if (empty()) { _chrom_id = static_cast( - std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - + std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - _chrom_offsets.begin()); } - if (!empty() && bin1_id >= _chrom_offsets[_chrom_id + 1]) { + const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; + if (!empty() && beginning_of_new_chromosome) { _chrom_id = static_cast( - std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - + std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - 1 - 
_chrom_offsets.begin()); - _bin1_offsets[_chrom_id] = size(); + _bin1_offsets[_chrom_id + 1] = size(); } _bin1_ids.push_back(bin1_id); @@ -139,6 +144,8 @@ void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); fs.write(tmpbuff.data(), static_cast(compressed_size)); + + fs.flush(); } void SparseMatrix::deserialize(std::fstream& fs) { @@ -218,13 +225,13 @@ inline const std::vector& SparseMatrixChunked::chrom_offsets() cons inline void SparseMatrixChunked::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { - const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; + if (empty()) { + initialize_index(bin1_id); + } - if (beginning_of_new_chromosome) { - finalize_chromosome(_chrom_id); - _chrom_id = static_cast( - std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _matrix.bin1_ids().back()) - - _chrom_offsets.begin()); + const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; + if (!empty() && beginning_of_new_chromosome) { + update_index(bin1_id); } if (_matrix.size() == _chunk_size || beginning_of_new_chromosome) { @@ -236,20 +243,51 @@ inline void SparseMatrixChunked::push_back(std::uint64_t bin1_id, std::uint64_t } inline void SparseMatrixChunked::finalize() { + finalize_chromosome(_chrom_id); + + for (std::size_t i = 1; i < _bin1_offsets.size(); ++i) { + if (_bin1_offsets[i] == 0) { + _bin1_offsets[i] = _bin1_offsets[i - 1]; + } + } + if (!_matrix.empty()) { write_chunk(); } - _fs.close(); _fs.open(_path, std::ios::in); } inline void SparseMatrixChunked::finalize_chromosome(std::uint32_t chrom_id) { - _bin1_offsets[chrom_id + 1] = size(); + // Finalize current chromosome auto [it, inserted] = _chrom_index.try_emplace(chrom_id, std::make_pair(_index.size(), _index.size())); if (!inserted) { it->second.second = _index.size() + 1; } + + // Initialize next chromosome + if (chrom_id + 1 < 
_bin1_offsets.size()) { + _bin1_offsets[chrom_id + 1] = size(); + _chrom_index.emplace(chrom_id + 1, std::make_pair(_index.size() + 1, _index.size() + 1)); + } +} + +inline void SparseMatrixChunked::initialize_index(std::uint64_t bin1_id) { + assert(empty()); + _chrom_id = static_cast( + std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - + _chrom_offsets.begin()); + for (std::uint32_t i = 0; i <= _chrom_id; ++i) { + _chrom_index.emplace(i, std::make_pair(std::size_t{}, std::size_t{})); + } +} + +inline void SparseMatrixChunked::update_index(std::uint64_t bin1_id) { + assert(!empty()); + finalize_chromosome(_chrom_id); + _chrom_id = static_cast( + std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - + _chrom_offsets.begin()); } inline SparseMatrixChunkedView SparseMatrixChunked::view() const { @@ -360,6 +398,17 @@ inline SparseMatrixChunkedView::SparseMatrixChunkedView(const std::filesystem::p _bin1_offset(bin1_offset) {} inline bool SparseMatrixChunkedView::empty() const noexcept { return _index.empty(); } +inline std::size_t SparseMatrixChunkedView::size() { + std::size_t size_ = 0; + + for (const auto& idx : _index) { + _fs.seekg(idx); + std::size_t chunk_size{}; + _fs.read(reinterpret_cast(&chunk_size), sizeof(std::size_t)); + size_ += chunk_size; + } + return size_; +} inline const std::vector& SparseMatrixChunkedView::margs() const noexcept { return _marg; } diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index fc042eb8..b5b21b21 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -96,6 +96,9 @@ class SparseMatrixChunked { void finalize(); void finalize_chromosome(std::uint32_t chrom_id); + void initialize_index(std::uint64_t bin1_id); + void update_index(std::uint64_t bin1_id); + [[nodiscard]] 
SparseMatrixChunkedView subset(std::uint32_t chrom_id) const; [[nodiscard]] SparseMatrixChunkedView view() const; @@ -147,6 +150,7 @@ class SparseMatrixChunkedView { std::size_t num_bins); [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t size(); [[nodiscard]] const std::vector& margs() const noexcept; From a138acdfd602067005197b9d365a13a0f73efb22 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:15:39 +0200 Subject: [PATCH 12/33] Add more tests --- test/units/balancing/CMakeLists.txt | 2 +- test/units/balancing/balancing_test.cpp | 262 +++++++++++++++++------- 2 files changed, 186 insertions(+), 78 deletions(-) diff --git a/test/units/balancing/CMakeLists.txt b/test/units/balancing/CMakeLists.txt index 600f1d55..ae8d8d96 100644 --- a/test/units/balancing/CMakeLists.txt +++ b/test/units/balancing/CMakeLists.txt @@ -14,7 +14,7 @@ target_sources(hictk_balancing_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/balanci target_link_libraries( hictk_balancing_tests PRIVATE hictk_project_warnings hictk_project_options - PUBLIC hictk::balancing hictk::hic hictk::cooler) + PUBLIC hictk::balancing hictk::file) target_link_system_libraries( hictk_balancing_tests diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index c13e9600..8f5f1c89 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -9,9 +9,7 @@ #include "hictk/balancing/ice.hpp" #include "hictk/balancing/methods.hpp" -#include "hictk/balancing/vc.hpp" -#include "hictk/cooler.hpp" -#include "hictk/hic.hpp" +#include "hictk/file.hpp" #include "tmpdir.hpp" namespace hictk::test { @@ -47,116 +45,226 @@ static void compare_weights(const std::vector& weights, const std::vecto } } +template +static void compare_vectors(const std::vector& v1, const std::vector& v2) { + REQUIRE(v1.size() == v2.size()); + + for (std::size_t i = 0; i < v1.size(); ++i) { + 
CHECK(v1[i] == v2[i]); + } +} + // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Balancing: VC", "[balancing][short]") { - const auto path = datadir / "hic/ENCFF993FGR.hic"; +TEST_CASE("Balancing: SparseMatrix") { + using SparseMatrix = hictk::balancing::SparseMatrix; + const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, + Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, + 50}; + // clang-format off + const std::vector> pixels{ + {1, 0, 1}, {1, 1, 2}, {2, 1, 3}, // chr1 + {3, 0, 4}, {3, 1, 5}}; // chr2 + // clang-format on + + SECTION("accessors") { + CHECK(SparseMatrix{}.empty()); + CHECK(SparseMatrix{bins}.empty()); + } + + SECTION("push_back") { + SparseMatrix m{bins}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + CHECK(m.size() == pixels.size()); + + m.clear(); + CHECK(m.empty()); + } + + SECTION("subset") { + SparseMatrix m{bins}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + + CHECK(m.subset(0).empty()); + CHECK(m.subset(1).size() == 3); + CHECK(m.subset(2).size() == 2); + CHECK(m.subset(3).empty()); + } - auto hf = hictk::hic::File(path.string(), 2500000); + SECTION("serde") { + const auto tmpfile = testdir() / "sparse_matrix_serde.bin"; + + SECTION("empty matrix") { + std::fstream f{}; + f.open(tmpfile, std::ios::in | std::ios::out | std::ios::trunc); + f.exceptions(std::ios::badbit | std::ios::failbit); + + SparseMatrix m1{}; + SparseMatrix m2{}; + m1.finalize(); + m1.serialize(f); + f.seekg(std::ios::beg); + m2.deserialize(f); + + compare_vectors(m1.bin1_ids(), m2.bin1_ids()); + compare_vectors(m1.bin2_ids(), m2.bin2_ids()); + compare_vectors(m1.counts(), m2.counts()); + compare_vectors(m1.chrom_offsets(), m2.chrom_offsets()); + } - SECTION("INTRA") { - for (const auto& chrom : hf.chromosomes()) { - if (chrom.is_all()) { - continue; + SECTION("full matrix") { + SparseMatrix m1{bins}; + 
for (const auto& p : pixels) { + m1.push_back(p.bin1_id, p.bin2_id, p.count); } - auto sel1 = hf.fetch(chrom.name()); + m1.finalize(); - const auto num_bins = hf.bins().subset(chrom).size(); - const auto bin_id_offset = hf.bins().at(chrom.name(), 0).id(); - const auto weights = - hictk::balancing::VC(sel1.begin(), sel1.end(), - num_bins, bin_id_offset) - .get_weights(); + std::fstream f{}; + f.open(tmpfile, std::ios::in | std::ios::out | std::ios::trunc); + f.exceptions(std::ios::badbit | std::ios::failbit); - auto sel2 = hf.fetch(chrom.name(), hictk::balancing::Method::VC()); - compare_weights(weights, sel2.weights1()()); + SparseMatrix m2{bins}; + m1.serialize(f); + f.seekg(std::ios::beg); + m2.deserialize(f); + + compare_vectors(m1.bin1_ids(), m2.bin1_ids()); + compare_vectors(m1.bin2_ids(), m2.bin2_ids()); + compare_vectors(m1.counts(), m2.counts()); + compare_vectors(m1.chrom_offsets(), m2.chrom_offsets()); } } +} - SECTION("GW") { - const auto num_bins = hf.bins().size(); - auto sel = hf.fetch(); - const auto weights = hictk::balancing::VC(sel.begin(), - sel.end(), num_bins) - .get_weights(); +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: SparseMatrixChunked") { + using SparseMatrixChunked = hictk::balancing::SparseMatrixChunked; + const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, + Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, + 50}; + // clang-format off + const std::vector> pixels{ + {1, 0, 1}, {1, 1, 2}, {2, 1, 3}, // chr1 + {3, 0, 4}, {3, 1, 5}}; // chr2 + // clang-format on + const auto tmpfile = testdir() / "sparse_matrix_chunked.tmp"; + + SECTION("accessors") { CHECK(SparseMatrixChunked{bins, tmpfile, 2, 0}.empty()); } + + SECTION("push_back") { + SparseMatrixChunked m{bins, tmpfile, 2, 0}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + + CHECK(m.size() == pixels.size()); + } - const auto expected = 
hf.fetch(hictk::balancing::Method::GW_VC()).weights(); - compare_weights(weights, expected); + SECTION("subset") { + SparseMatrixChunked m{bins, tmpfile, 2, 0}; + for (const auto& p : pixels) { + m.push_back(p.bin1_id, p.bin2_id, p.count); + } + m.finalize(); + + CHECK(m.subset(0).empty()); + CHECK(m.subset(1).size() == 3); + CHECK(m.subset(2).size() == 2); + CHECK(m.subset(3).empty()); } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Balancing: ICE", "[balancing][long]") { - const auto path = datadir / "cooler/ENCFF993FGR.2500000.cool"; + const std::array, 2> files{ + std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; + const auto tmpfile = testdir() / "balancing_ice.tmp"; - auto clr = hictk::cooler::File(path.string()); + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); - SECTION("in-memory") { - SECTION("INTRA") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + SECTION("in-memory") { + SECTION("INTRA") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; - constexpr auto type = hictk::balancing::ICE::Type::cis; - const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); + } - SECTION("INTER") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + SECTION("INTER") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; - constexpr auto type = 
hictk::balancing::ICE::Type::trans; - const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); + } - SECTION("GW") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + SECTION("GW") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; - constexpr auto type = hictk::balancing::ICE::Type::gw; - const auto weights = hictk::balancing::ICE(clr, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } - } + compare_weights(weights, expected_weights); + } + } - SECTION("chunked") { - auto params = hictk::balancing::ICE::DefaultParams; - params.tmpfile = tmpfile; + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; - SECTION("INTRA") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + SECTION("INTRA") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; - constexpr auto type = hictk::balancing::ICE::Type::cis; - const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type, 
params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); + } - SECTION("INTER") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + SECTION("INTER") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; - constexpr auto type = hictk::balancing::ICE::Type::trans; - const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); + } - SECTION("GW") { - const auto path_intra_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + SECTION("GW") { + const auto path_intra_weights = + datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; - constexpr auto type = hictk::balancing::ICE::Type::gw; - const auto weights = hictk::balancing::ICE(clr, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_intra_weights); - compare_weights(weights, expected_weights); + compare_weights(weights, expected_weights); + } + } } } } From 4a624fe5f0faab6451dade056bf74e04d7d710dc Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 13:28:44 +0200 Subject: [PATCH 13/33] Bugfix --- .../balancing/include/hictk/balancing/ice.hpp | 2 +- .../include/hictk/balancing/impl/ice_impl.hpp | 20 
++++++------------- .../include/hictk/balancing/sparse_matrix.hpp | 4 ++-- .../cooler/include/hictk/cooler/cooler.hpp | 10 ++++------ 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index ce08b580..4c05d2f1 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -14,7 +14,7 @@ namespace hictk::balancing { class ICE { - std::vector _chrom_offsets{}; + std::vector _chrom_offsets{}; std::vector _biases{}; std::vector _variance{}; std::vector _scale{}; diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index ff806751..4a29cb79 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -145,14 +145,10 @@ inline void ICE::balance_cis(const MatrixT& matrix, const BinTable& bins, std::s template auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags) -> SparseMatrix { - switch (type) { - case Type::cis: - return construct_sparse_matrix_cis(f, num_masked_diags); - case Type::trans: - [[fallthrough]]; - case Type::gw: - return construct_sparse_matrix_gw(f, num_masked_diags); + if (type == Type::cis) { + return construct_sparse_matrix_cis(f, num_masked_diags); } + return construct_sparse_matrix_gw(f, num_masked_diags); } template @@ -252,14 +248,10 @@ template auto ICE::construct_sparse_matrix_chunked(const File& f, Type type, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { - switch (type) { - case Type::cis: - return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); - case Type::trans: - [[fallthrough]]; - case Type::gw: - return 
construct_sparse_matrix_chunked_gw(f, num_masked_diags, tmpfile, chunk_size); + if (type == Type::cis) { + return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); } + return construct_sparse_matrix_chunked_gw(f, num_masked_diags, tmpfile, chunk_size); } template diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index b5b21b21..4bcecf3e 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -21,8 +21,8 @@ class SparseMatrix { std::vector _counts{}; std::uint32_t _chrom_id{}; // ID of the chromosome that is being procesed - std::vector _chrom_offsets{}; - std::vector _bin1_offsets{}; + std::vector _chrom_offsets{}; + std::vector _bin1_offsets{}; mutable std::vector _marg{}; static constexpr auto _gw_id = std::numeric_limits::max(); diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 9737b07a..064a4178 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -118,12 +118,10 @@ class File { File(File &&other) noexcept(noexcept_move_ctor()) = default; // NOLINT // Simple constructor. Open file in read-only mode. 
Automatically detects pixel count type - [[nodiscard]] explicit File(std::string_view uri, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, - bool validate = true); - [[nodiscard]] explicit File(RootGroup entrypoint, - std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, - bool validate = true); + explicit File(std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + bool validate = true); + explicit File(RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, + bool validate = true); [[nodiscard]] static File open_random_access( std::string_view uri, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, From 071d9d9d31bc93b13f1239bcc6ec63393c412133 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:31:54 +0200 Subject: [PATCH 14/33] Improve logging --- .../balancing/include/hictk/balancing/impl/ice_impl.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index 4a29cb79..7cbaa349 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -145,6 +145,7 @@ inline void ICE::balance_cis(const MatrixT& matrix, const BinTable& bins, std::s template auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_masked_diags) -> SparseMatrix { + SPDLOG_INFO(FMT_STRING("Reading interactions into memory...")); if (type == Type::cis) { return construct_sparse_matrix_cis(f, num_masked_diags); } @@ -479,7 +480,9 @@ template inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span biases, nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max) { + SPDLOG_INFO(FMT_STRING("Initializing bias vector...")); if (min_nnz != 0) { + SPDLOG_INFO(FMT_STRING("Masking columns 
with fewer than {} nnz entries..."), min_nnz); min_nnz_filtering(matrix, biases, min_nnz); } @@ -487,10 +490,12 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b matrix.marginalize(); } if (min_count != 0) { + SPDLOG_INFO(FMT_STRING("Masking columns with fewer than {} interactions..."), min_count); min_count_filtering(biases, min_count, matrix.margs()); } if (mad_max != 0) { + SPDLOG_INFO(FMT_STRING("Masking columns using mad_max={}..."), mad_max); auto margs = std::vector{matrix.margs()}; mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); } From 6660df4b9d580763f56e1fa926d494487bb40ef0 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:32:35 +0200 Subject: [PATCH 15/33] Update serde code to use zstd contexts --- .../balancing/impl/sparse_matrix_impl.hpp | 57 ++++++++++--------- .../include/hictk/balancing/sparse_matrix.hpp | 23 +++++++- test/units/balancing/balancing_test.cpp | 10 ++-- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 7aeda4be..5cc41742 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -106,17 +106,17 @@ inline SparseMatrixView SparseMatrix::view() const { return {bin1_ids_, bin2_ids_, counts_, 0, _marg.size()}; } -void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { +void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl) const { const auto size_ = size(); fs.write(reinterpret_cast(&size_), sizeof(std::size_t)); const auto tmpbuff_size = ZSTD_compressBound(size() * sizeof(std::uint64_t)); std::string tmpbuff(tmpbuff_size, '\0'); - std::size_t compressed_size = - 
ZSTD_compress(reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), - reinterpret_cast(_bin1_ids.data()), size() * sizeof(std::uint64_t), - compression_lvl); + std::size_t compressed_size = ZSTD_compressCCtx(&ctx, reinterpret_cast(tmpbuff.data()), + tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin1_ids.data()), + size() * sizeof(std::uint64_t), compression_lvl); if (ZSTD_isError(compressed_size)) { throw std::runtime_error(ZSTD_getErrorName(compressed_size)); } @@ -124,10 +124,10 @@ void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); fs.write(tmpbuff.data(), static_cast(compressed_size)); - compressed_size = - ZSTD_compress(reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), - reinterpret_cast(_bin2_ids.data()), size() * sizeof(std::uint64_t), - compression_lvl); + compressed_size = ZSTD_compressCCtx(&ctx, reinterpret_cast(tmpbuff.data()), + tmpbuff.size() * sizeof(char), + reinterpret_cast(_bin2_ids.data()), + size() * sizeof(std::uint64_t), compression_lvl); if (ZSTD_isError(compressed_size)) { throw std::runtime_error(ZSTD_getErrorName(compressed_size)); } @@ -135,8 +135,8 @@ void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { fs.write(reinterpret_cast(&compressed_size), sizeof(std::size_t)); fs.write(tmpbuff.data(), static_cast(compressed_size)); - compressed_size = ZSTD_compress( - reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), + compressed_size = ZSTD_compressCCtx( + &ctx, reinterpret_cast(tmpbuff.data()), tmpbuff.size() * sizeof(char), reinterpret_cast(_counts.data()), size() * sizeof(double), compression_lvl); if (ZSTD_isError(compressed_size)) { throw std::runtime_error(ZSTD_getErrorName(compressed_size)); @@ -148,7 +148,7 @@ void SparseMatrix::serialize(std::fstream& fs, int compression_lvl) const { fs.flush(); } -void SparseMatrix::deserialize(std::fstream& fs) { +void 
SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { std::size_t size{}; fs.read(reinterpret_cast(&size), sizeof(std::size_t)); @@ -162,9 +162,9 @@ void SparseMatrix::deserialize(std::fstream& fs) { tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); - std::size_t decompressed_size = - ZSTD_decompress(reinterpret_cast(_bin1_ids.data()), size * sizeof(std::uint64_t), - tmpbuff.data(), tmpbuff.size() * sizeof(char)); + std::size_t decompressed_size = ZSTD_decompressDCtx( + &ctx, reinterpret_cast(_bin1_ids.data()), size * sizeof(std::uint64_t), tmpbuff.data(), + tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); } @@ -172,9 +172,9 @@ void SparseMatrix::deserialize(std::fstream& fs) { fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); - decompressed_size = - ZSTD_decompress(reinterpret_cast(_bin2_ids.data()), size * sizeof(std::uint64_t), - tmpbuff.data(), tmpbuff.size() * sizeof(char)); + decompressed_size = ZSTD_decompressDCtx(&ctx, reinterpret_cast(_bin2_ids.data()), + size * sizeof(std::uint64_t), tmpbuff.data(), + tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); } @@ -183,8 +183,8 @@ void SparseMatrix::deserialize(std::fstream& fs) { tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); decompressed_size = - ZSTD_decompress(reinterpret_cast(_counts.data()), size * sizeof(double), - tmpbuff.data(), tmpbuff.size() * sizeof(char)); + ZSTD_decompressDCtx(&ctx, reinterpret_cast(_counts.data()), size * sizeof(double), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); } @@ -199,7 
+199,9 @@ inline SparseMatrixChunked::SparseMatrixChunked(const BinTable& bins, _chrom_offsets(_matrix.chrom_offsets()), _bin1_offsets(_chrom_offsets.size(), 0), _chunk_size(chunk_size), - _compression_lvl(compression_lvl) { + _compression_lvl(compression_lvl), + _zstd_cctx(ZSTD_createCCtx()), + _zstd_dctx(ZSTD_createDCtx()) { _fs.exceptions(std::ios::badbit); _fs.open(_path, std::ios::out); @@ -316,13 +318,13 @@ inline void SparseMatrixChunked::read_chunk(std::size_t chunk_id, SparseMatrix& fs.open(_path, std::ios::in); fs.seekg(offset); - buffer.deserialize(fs); + buffer.deserialize(fs, *_zstd_dctx); } inline void SparseMatrixChunked::write_chunk() { assert(!_matrix.empty()); _index.push_back(_fs.tellg()); - _matrix.serialize(_fs, _compression_lvl); + _matrix.serialize(_fs, *_zstd_cctx, _compression_lvl); _matrix.clear(); _chrom_index.try_emplace(_chrom_id, std::make_pair(_index.size(), _index.size())); } @@ -395,7 +397,8 @@ inline SparseMatrixChunkedView::SparseMatrixChunkedView(const std::filesystem::p : _fs(path, std::ios::in), _index(index.begin(), index.end()), _marg(num_bins), - _bin1_offset(bin1_offset) {} + _bin1_offset(bin1_offset), + _zstd_dctx(ZSTD_createDCtx()) {} inline bool SparseMatrixChunkedView::empty() const noexcept { return _index.empty(); } inline std::size_t SparseMatrixChunkedView::size() { @@ -417,7 +420,7 @@ inline const std::vector& SparseMatrixChunkedView::marginalize() const { for (const auto offset : _index) { _fs.seekg(offset); - _matrix.deserialize(_fs); + _matrix.deserialize(_fs, *_zstd_dctx); for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; @@ -439,7 +442,7 @@ inline const std::vector& SparseMatrixChunkedView::marginalize_nnz() con for (const auto offset : _index) { _fs.seekg(offset); - _matrix.deserialize(_fs); + _matrix.deserialize(_fs, *_zstd_dctx); for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; @@ 
-465,7 +468,7 @@ inline const std::vector& SparseMatrixChunkedView::times_outer_product_m for (const auto offset : _index) { _fs.seekg(offset); - _matrix.deserialize(_fs); + _matrix.deserialize(_fs, *_zstd_dctx); for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index 4bcecf3e..85514f2d 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -5,13 +5,28 @@ #pragma once #include +#include #include +#include +#include #include #include #include "hictk/bin_table.hpp" +namespace std { +template <> +struct default_delete { + void operator()(ZSTD_CCtx_s* ctx) const { ZSTD_freeCCtx(ctx); } // NOLINT +}; + +template <> +struct default_delete { + void operator()(ZSTD_DCtx_s* ctx) const { ZSTD_freeDCtx(ctx); } // NOLINT +}; +} // namespace std + namespace hictk::balancing { class SparseMatrixView; @@ -48,8 +63,8 @@ class SparseMatrix { [[nodiscard]] SparseMatrixView subset(std::uint32_t chrom_id) const; [[nodiscard]] SparseMatrixView view() const; - void serialize(std::fstream& fs, int compression_lvl = 3) const; - void deserialize(std::fstream& fs); + void serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl = 3) const; + void deserialize(std::fstream& fs, ZSTD_DCtx& ctx); }; class SparseMatrixChunkedView; @@ -73,6 +88,9 @@ class SparseMatrixChunked { std::size_t _chunk_size{}; int _compression_lvl{}; + std::unique_ptr _zstd_cctx{}; + std::unique_ptr _zstd_dctx{}; + public: SparseMatrixChunked() = default; SparseMatrixChunked(const BinTable& bins, std::filesystem::path tmp_file, std::size_t chunk_size, @@ -142,6 +160,7 @@ class SparseMatrixChunkedView { mutable std::vector _marg{}; std::size_t _bin1_offset{}; + std::unique_ptr _zstd_dctx{}; public: 
SparseMatrixChunkedView() = default; diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 8f5f1c89..a06ba3c0 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -98,6 +98,8 @@ TEST_CASE("Balancing: SparseMatrix") { SECTION("serde") { const auto tmpfile = testdir() / "sparse_matrix_serde.bin"; + std::unique_ptr zstd_cctx{ZSTD_createCCtx()}; + std::unique_ptr zstd_dctx{ZSTD_createDCtx()}; SECTION("empty matrix") { std::fstream f{}; @@ -107,9 +109,9 @@ TEST_CASE("Balancing: SparseMatrix") { SparseMatrix m1{}; SparseMatrix m2{}; m1.finalize(); - m1.serialize(f); + m1.serialize(f, *zstd_cctx); f.seekg(std::ios::beg); - m2.deserialize(f); + m2.deserialize(f, *zstd_dctx); compare_vectors(m1.bin1_ids(), m2.bin1_ids()); compare_vectors(m1.bin2_ids(), m2.bin2_ids()); @@ -129,9 +131,9 @@ TEST_CASE("Balancing: SparseMatrix") { f.exceptions(std::ios::badbit | std::ios::failbit); SparseMatrix m2{bins}; - m1.serialize(f); + m1.serialize(f, *zstd_cctx); f.seekg(std::ios::beg); - m2.deserialize(f); + m2.deserialize(f, *zstd_dctx); compare_vectors(m1.bin1_ids(), m2.bin1_ids()); compare_vectors(m1.bin2_ids(), m2.bin2_ids()); From f004eb96d1ed83ae4de755611d1e9ea3c7e32618 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:33:37 +0200 Subject: [PATCH 16/33] Refactor tests --- test/units/balancing/balancing_test.cpp | 119 ++++++++++++++---------- 1 file changed, 69 insertions(+), 50 deletions(-) diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index a06ba3c0..4c0fa465 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -183,50 +183,60 @@ TEST_CASE("Balancing: SparseMatrixChunked") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Balancing: ICE", "[balancing][long]") { +TEST_CASE("Balancing: ICE (intra)", 
"[balancing][short]") { const std::array, 2> files{ std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; - const auto tmpfile = testdir() / "balancing_ice.tmp"; + const auto tmpfile = testdir() / "balancing_ice_intra.tmp"; + const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; for (const auto& [label, path] : files) { SECTION(label) { const hictk::File f(path.string(), 2'500'000); SECTION("in-memory") { - SECTION("INTRA") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); - constexpr auto type = hictk::balancing::ICE::Type::cis; - const auto weights = hictk::balancing::ICE(f, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + compare_weights(weights, expected_weights); + } - compare_weights(weights, expected_weights); - } + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; - SECTION("INTER") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + constexpr auto type = hictk::balancing::ICE::Type::cis; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); + + compare_weights(weights, expected_weights); + } + } + } +} - constexpr auto type = hictk::balancing::ICE::Type::trans; - const auto weights = hictk::balancing::ICE(f, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE (inter)", "[balancing][medium]") { + const std::array, 2> files{ + 
std::make_pair("cooler", datadir / "cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; - compare_weights(weights, expected_weights); - } + const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; + const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; - SECTION("GW") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); - constexpr auto type = hictk::balancing::ICE::Type::gw; - const auto weights = hictk::balancing::ICE(f, type).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + SECTION("in-memory") { + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); } SECTION("chunked") { @@ -234,38 +244,47 @@ TEST_CASE("Balancing: ICE", "[balancing][long]") { params.tmpfile = tmpfile; params.chunk_size = 1000; - SECTION("INTRA") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + constexpr auto type = hictk::balancing::ICE::Type::trans; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); - constexpr auto type = hictk::balancing::ICE::Type::cis; - const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + compare_weights(weights, expected_weights); + } + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Balancing: ICE (gw)", "[balancing][medium]") { + const std::array, 2> files{ + std::make_pair("cooler", datadir / 
"cooler/ENCFF993FGR.2500000.cool"), + std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; - compare_weights(weights, expected_weights); - } + const auto tmpfile = testdir() / "balancing_ice_gw.tmp"; + const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; - SECTION("INTER") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + for (const auto& [label, path] : files) { + SECTION(label) { + const hictk::File f(path.string(), 2'500'000); - constexpr auto type = hictk::balancing::ICE::Type::trans; - const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + SECTION("in-memory") { + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type).get_weights(); + const auto expected_weights = read_weights(path_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); + } - SECTION("GW") { - const auto path_intra_weights = - datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + SECTION("chunked") { + auto params = hictk::balancing::ICE::DefaultParams; + params.tmpfile = tmpfile; + params.chunk_size = 1000; - constexpr auto type = hictk::balancing::ICE::Type::gw; - const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); - const auto expected_weights = read_weights(path_intra_weights); + constexpr auto type = hictk::balancing::ICE::Type::gw; + const auto weights = hictk::balancing::ICE(f, type, params).get_weights(); + const auto expected_weights = read_weights(path_weights); - compare_weights(weights, expected_weights); - } + compare_weights(weights, expected_weights); } } } From d9514c38eba51ec3fb6e6725b2132cfb5eb26c18 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 24 Sep 2023 14:40:44 +0200 Subject: [PATCH 17/33] Bugfix ---
.../balancing/include/hictk/balancing/impl/ice_impl.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index 7cbaa349..c3766273 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -249,6 +249,7 @@ template auto ICE::construct_sparse_matrix_chunked(const File& f, Type type, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { + SPDLOG_INFO(FMT_STRING("Reading interactions into memory...")); if (type == Type::cis) { return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); } @@ -482,7 +483,7 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b std::size_t min_nnz, std::size_t min_count, double mad_max) { SPDLOG_INFO(FMT_STRING("Initializing bias vector...")); if (min_nnz != 0) { - SPDLOG_INFO(FMT_STRING("Masking columns with fewer than {} nnz entries..."), min_nnz); + SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} nnz entries..."), min_nnz); min_nnz_filtering(matrix, biases, min_nnz); } @@ -490,12 +491,12 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b matrix.marginalize(); } if (min_count != 0) { - SPDLOG_INFO(FMT_STRING("Masking columns with fewer than {} interactions..."), min_count); + SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} interactions..."), min_count); min_count_filtering(biases, min_count, matrix.margs()); } if (mad_max != 0) { - SPDLOG_INFO(FMT_STRING("Masking columns using mad_max={}..."), mad_max); + SPDLOG_INFO(FMT_STRING("Masking rows using mad_max={}..."), mad_max); auto margs = std::vector{matrix.margs()}; mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); } From c2aac06a74b7d0c13e970eea190310bbb39e2936 Mon Sep 17 00:00:00 
2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 26 Sep 2023 16:36:53 +0200 Subject: [PATCH 18/33] Implement parallel ICE balancing --- CMakeLists.txt | 2 + conanfile.txt | 3 + src/libhictk/balancing/CMakeLists.txt | 25 +- .../balancing/include/hictk/balancing/ice.hpp | 48 +- .../include/hictk/balancing/impl/ice_impl.hpp | 323 ++++++--- .../balancing/impl/sparse_matrix_impl.hpp | 667 ++++++++++-------- .../include/hictk/balancing/sparse_matrix.hpp | 153 ++-- test/units/balancing/CMakeLists.txt | 78 +- test/units/balancing/balancing_test.cpp | 45 +- 9 files changed, 757 insertions(+), 587 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50d31243..0e43db82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,6 +119,8 @@ target_compile_features(hictk_project_options INTERFACE "cxx_std_${CMAKE_CXX_STA target_compile_definitions(hictk_project_options INTERFACE FMT_HEADER_ONLY FMT_ENFORCE_COMPILE_STRING) # Tweak spdlog target_compile_definitions(hictk_project_options INTERFACE SPDLOG_FMT_EXTERNAL) +# Tweak xxHash +target_compile_definitions(hictk_project_options INTERFACE XXH_INLINE_ALL) if(WIN32) target_compile_definitions(hictk_project_options INTERFACE NOMINMAX _CRT_SECURE_NO_WARNINGS) diff --git a/conanfile.txt b/conanfile.txt index 58da529e..87e5db24 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -4,6 +4,7 @@ [requires] boost/1.83.0#f0c3932db7f65b606ed78357ecbdcbef +bshoshany-thread-pool/3.5.0#3c9fd1e21a688432b7f31b40d2d168ee cli11/2.3.2#1424b9b1d9e3682a7122f415b078b4d7 eigen/3.4.0#2e192482a8acff96fe34766adca2b24c fast_float/5.2.0#9bf1a3fac625789f2b571d43efb8013b @@ -15,6 +16,7 @@ parallel-hashmap/1.3.11#719aed501c271a34e2347a7731ab3bfb readerwriterqueue/1.0.6#aaa5ff6fac60c2aee591e9e51b063b83 span-lite/0.10.3#1967d71abb32b314387c2ab9c558dd22 spdlog/1.12.0#248c215bc5f0718402fbf1de126ef847 +xxhash/0.8.1#7bf1cd1fe3d31f1fbcc758d93c907e8d zstd/1.5.5#93372fe14bb7883bd4de82914e0a1841 [generators] @@ -71,3 +73,4 @@
highfive*:with_eigen=False highfive*:with_opencv=False highfive*:with_xtensor=False spdlog*:header_only=True +xxhash*:utility=False diff --git a/src/libhictk/balancing/CMakeLists.txt b/src/libhictk/balancing/CMakeLists.txt index 1cb6f0ea..fefed0f1 100644 --- a/src/libhictk/balancing/CMakeLists.txt +++ b/src/libhictk/balancing/CMakeLists.txt @@ -2,26 +2,33 @@ # # SPDX-License-Identifier: MIT +find_package(bshoshany-thread-pool REQUIRED) find_package(phmap REQUIRED) find_package(span-lite REQUIRED) +find_package(xxHash REQUIRED) find_package(zstd REQUIRED) add_library(balancing INTERFACE) add_library(hictk::balancing ALIAS balancing) target_sources( - balancing - INTERFACE FILE_SET - HEADERS - BASE_DIRS - "${CMAKE_CURRENT_SOURCE_DIR}/include") + balancing + INTERFACE FILE_SET + HEADERS + BASE_DIRS + "${CMAKE_CURRENT_SOURCE_DIR}/include") target_include_directories(balancing INTERFACE "$" - "$") + "$") target_link_libraries(balancing INTERFACE hictk::common hictk::pixel) -target_link_system_libraries(balancing INTERFACE nonstd::span-lite phmap - "zstd::libzstd_$,shared,static>" -) +target_link_system_libraries( + balancing + INTERFACE + bshoshany-thread-pool::bshoshany-thread-pool + nonstd::span-lite + phmap + xxHash::xxhash + "zstd::libzstd_$,shared,static>") target_compile_definitions(balancing INTERFACE span_FEATURE_MAKE_SPAN=1) diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 4c05d2f1..91739ae9 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -36,9 +36,11 @@ class ICE { double mad_max{5.0}; std::filesystem::path tmpfile{}; std::size_t chunk_size{10'000'000}; + std::size_t threads{1}; }; - inline static const Params DefaultParams{1.0e-6, 200, 2, 10, 0, 5.0, "", 10'000'000}; // NOLINT + inline static const Params DefaultParams{1.0e-6, 200, 2, 10, 0, + 5.0, "", 10'000'000, 1}; // NOLINT template explicit 
ICE(const File& f, Type type = Type::gw, const Params& params = DefaultParams); @@ -51,23 +53,24 @@ class ICE { template void balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, - double mad_max); + double mad_max, BS::thread_pool* tpool); template void balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, - double mad_max, const std::filesystem::path& tmpfile, - std::size_t chunk_size); + double mad_max, const std::filesystem::path& tmpfile, std::size_t chunk_size, + BS::thread_pool* tpool); template - void balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol); + void balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol, BS::thread_pool* tpool); template - void balance_cis(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, double tol); + void balance_cis(const MatrixT& matrix, const Chromosome& chrom, std::size_t max_iters, + double tol, BS::thread_pool* tpool); template - void balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, - double tol); + void balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, double tol, + BS::thread_pool* tpool); template [[nodiscard]] static auto construct_sparse_matrix(const File& f, Type type, @@ -76,6 +79,11 @@ class ICE { [[nodiscard]] static auto construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) -> SparseMatrix; template + [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + std::size_t num_masked_diags) + -> SparseMatrix; + template [[nodiscard]] static auto construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags) -> SparseMatrix; template @@ -95,6 +103,11 @@ class ICE { const std::filesystem::path& tmpfile, std::size_t 
chunk_size) -> SparseMatrixChunked; + + template + [[nodiscard]] static auto construct_sparse_matrix_chunked_cis( + const File& f, const Chromosome& chrom, std::size_t bin_offset, std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked; template [[nodiscard]] static auto construct_sparse_matrix_chunked_cis( const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, @@ -107,11 +120,21 @@ class ICE { template [[nodiscard]] static auto inner_loop(const MatrixT& matrix, nonstd::span biases, - nonstd::span weights = {}) -> Result; + MargsVector& marg, nonstd::span weights = {}, + BS::thread_pool* tpool = nullptr) -> Result; + [[nodiscard]] static std::pair aggregate_marg( + nonstd::span marg, BS::thread_pool* tpool); + + static void update_biases(nonstd::span marg, nonstd::span biases, + double avg_nzmarg, BS::thread_pool* tpool); + + [[nodiscard]] static double compute_ssq_nzmarg(nonstd::span marg, double avg_nzmarg, + BS::thread_pool* tpool); template - static void min_nnz_filtering(const MatrixT& matrix, nonstd::span biases, - std::size_t min_nnz); + static void min_nnz_filtering(MargsVector& marg, const MatrixT& matrix, + nonstd::span biases, std::size_t min_nnz, + BS::thread_pool* tpool); static void min_count_filtering(nonstd::span biases, std::size_t min_count, nonstd::span marg); @@ -123,7 +146,8 @@ class ICE { template static void initialize_biases(const MatrixT& matrix, nonstd::span biases, nonstd::span chrom_bin_offsets, - std::size_t min_nnz, std::size_t min_count, double mad_max); + std::size_t min_nnz, std::size_t min_count, double mad_max, + BS::thread_pool* tpool); [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( const BinTable& bins, nonstd::span chrom_bin_offsets); diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index c3766273..5b0f7a37 100644 --- 
a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -22,34 +22,51 @@ namespace hictk::balancing { template inline ICE::ICE(const File& f, Type type, const Params& params) - : _chrom_offsets(f.bins().num_bin_prefix_sum()), _biases(f.bins().size(), 1.0) { + : _chrom_offsets(f.bins().num_bin_prefix_sum()), + _biases(f.bins().size(), 1.0), + _variance(f.chromosomes().size(), 0), + _scale(f.chromosomes().size(), std::numeric_limits::quiet_NaN()) { + std::unique_ptr tpool{}; + if (params.threads != 1) { + tpool = std::make_unique(params.threads); + } if (params.tmpfile.empty()) { balance_in_memory(f, type, params.tol, params.max_iters, params.num_masked_diags, - params.min_nnz, params.min_count, params.mad_max); + params.min_nnz, params.min_count, params.mad_max, tpool.get()); } else { balance_chunked(f, type, params.tol, params.max_iters, params.num_masked_diags, params.min_nnz, - params.min_count, params.mad_max, params.tmpfile, params.chunk_size); + params.min_count, params.mad_max, params.tmpfile, params.chunk_size, + tpool.get()); } } template inline void ICE::balance_in_memory(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, - std::size_t min_count, double mad_max) { + std::size_t min_count, double mad_max, BS::thread_pool* tpool) { auto matrix = construct_sparse_matrix(f, type, num_masked_diags); - initialize_biases(matrix.view(), _biases, _chrom_offsets, min_nnz, min_count, mad_max); + initialize_biases(matrix, _biases, _chrom_offsets, min_nnz, min_count, mad_max, tpool); - switch (type) { - case Type::gw: - balance_gw(matrix.view(), max_iters, tol); - break; - case Type::cis: - balance_cis(matrix, f.bins(), max_iters, tol); - break; - case Type::trans: - matrix = construct_sparse_matrix_trans(f, num_masked_diags); - balance_trans(matrix.view(), f.bins(), max_iters, tol); + if (type == Type::gw) { + return 
balance_gw(matrix, max_iters, tol, tpool); + } + + if (type == Type::trans) { + matrix.clear(true); + matrix = construct_sparse_matrix_trans(f, num_masked_diags); + return balance_trans(matrix, f.bins(), max_iters, tol, tpool); + } + + assert(type == Type::cis); + matrix.clear(true); + for (std::uint32_t i = 0; i < f.chromosomes().size(); ++i) { + const Chromosome& chrom = f.chromosomes().at(i); + if (chrom.is_all()) { + continue; + } + matrix = construct_sparse_matrix_cis(f, chrom, _chrom_offsets[i], num_masked_diags); + balance_cis(matrix, chrom, max_iters, tol, tpool); } } @@ -57,32 +74,45 @@ template inline void ICE::balance_chunked(const File& f, Type type, double tol, std::size_t max_iters, std::size_t num_masked_diags, std::size_t min_nnz, std::size_t min_count, double mad_max, - const std::filesystem::path& tmpfile, std::size_t chunk_size) { - auto matrix = construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); + const std::filesystem::path& tmpfile, std::size_t chunk_size, + BS::thread_pool* tpool) { + auto matrix = + construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); - initialize_biases(matrix.view(), _biases, _chrom_offsets, min_nnz, min_count, mad_max); + initialize_biases(matrix, _biases, _chrom_offsets, min_nnz, min_count, mad_max, tpool); - switch (type) { - case Type::gw: - balance_gw(matrix.view(), max_iters, tol); - break; - case Type::cis: - balance_cis(matrix, f.bins(), max_iters, tol); - break; - case Type::trans: - balance_trans( - construct_sparse_matrix_chunked_trans(f, num_masked_diags, tmpfile, chunk_size).view(), - f.bins(), max_iters, tol); + if (type == Type::gw) { + return balance_gw(matrix, max_iters, tol, tpool); + } + + if (type == Type::trans) { + matrix.clear(true); + matrix = construct_sparse_matrix_chunked_trans(f, num_masked_diags, tmpfile, chunk_size); + return balance_trans(matrix, f.bins(), max_iters, tol, tpool); + } + + assert(type == Type::cis); + matrix.clear(true); 
+ for (std::uint32_t i = 0; i < f.chromosomes().size(); ++i) { + const Chromosome& chrom = f.chromosomes().at(i); + if (chrom.is_all()) { + continue; + } + matrix = construct_sparse_matrix_chunked_cis(f, chrom, _chrom_offsets[i], num_masked_diags, + tmpfile, chunk_size); + balance_cis(matrix, chrom, max_iters, tol, tpool); } } template -inline void ICE::balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol) { +inline void ICE::balance_gw(const MatrixT& matrix, std::size_t max_iters, double tol, + BS::thread_pool* tpool) { _variance.resize(1, 0); _scale.resize(1, std::numeric_limits::quiet_NaN()); + MargsVector marg(_biases.size()); for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = inner_loop(matrix, _biases); + const auto res = inner_loop(matrix, _biases, marg, {}, tpool); SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); _variance[0] = res.variance; _scale[0] = res.scale; @@ -94,13 +124,14 @@ inline void ICE::balance_gw(const MatrixT& matrix, std::size_t max_iters, double template inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, - double tol) { + double tol, BS::thread_pool* tpool) { _variance.resize(1, 0); _scale.resize(1, std::numeric_limits::quiet_NaN()); const auto weights = compute_weights_from_chromosome_sizes(bins, _chrom_offsets); + MargsVector marg(_biases.size()); for (std::size_t i = 0; i < max_iters; ++i) { - const auto res = inner_loop(matrix, _biases, weights); + const auto res = inner_loop(matrix, _biases, marg, weights, tpool); SPDLOG_INFO(FMT_STRING("Iteration {}: {}"), i + 1, res.variance); _variance[0] = res.variance; _scale[0] = res.scale; @@ -111,33 +142,21 @@ inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std: } template -inline void ICE::balance_cis(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, - double tol) { - _variance.resize(bins.chromosomes().size(), 0); - 
_scale.resize(bins.chromosomes().size(), std::numeric_limits::quiet_NaN()); - - std::vector margs(_biases.size()); - - for (const auto& chrom : bins.chromosomes()) { - if (chrom.is_all()) { - continue; - } - const auto cis_matrix = matrix.subset(chrom.id()); - - const auto j0 = _chrom_offsets[chrom.id()]; - const auto j1 = _chrom_offsets[chrom.id() + 1]; +inline void ICE::balance_cis(const MatrixT& matrix, const Chromosome& chrom, std::size_t max_iters, + double tol, BS::thread_pool* tpool) { + const auto i0 = _chrom_offsets[chrom.id()]; + const auto i1 = _chrom_offsets[chrom.id() + 1]; + auto biases_ = nonstd::span(_biases).subspan(i0, i1 - i0); + + MargsVector marg(biases_.size()); + for (std::size_t k = 0; k < max_iters; ++k) { + const auto res = inner_loop(matrix, biases_, marg, {}, tpool); + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), chrom.name(), k + 1, res.variance); + _variance[chrom.id()] = res.variance; + _scale[chrom.id()] = res.scale; - auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); - - for (std::size_t k = 0; k < max_iters; ++k) { - const auto res = inner_loop(cis_matrix, biases_); - SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), chrom.name(), k + 1, res.variance); - _variance[chrom.id()] = res.variance; - _scale[chrom.id()] = res.scale; - - if (res.variance < tol) { - break; - } + if (res.variance < tol) { + break; } } } @@ -155,7 +174,7 @@ auto ICE::construct_sparse_matrix(const File& f, Type type, std::size_t num_mask template inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_masked_diags) -> SparseMatrix { - SparseMatrix m(f.bins()); + SparseMatrix m{}; const auto sel = f.fetch(); std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { @@ -169,17 +188,36 @@ inline auto ICE::construct_sparse_matrix_gw(const File& f, std::size_t num_maske return m; } +template +[[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + 
std::size_t num_masked_diags) + -> SparseMatrix { + SparseMatrix m{}; + + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count, bin_offset); + } + }); + m.finalize(); + + return m; +} + template [[nodiscard]] inline auto ICE::construct_sparse_matrix_cis(const File& f, std::size_t num_masked_diags) -> SparseMatrix { - SparseMatrix m(f.bins()); + SparseMatrix m{}; - for (const Chromosome& chrom : f.chromosomes()) { + for (const auto& chrom : f.chromosomes()) { if (chrom.is_all()) { continue; } - const auto sel = f.fetch(chrom.name()); + auto sel = f.fetch(chrom.name()); std::for_each(sel.template begin(), sel.template end(), [&](const ThinPixel& p) { if (p.bin2_id - p.bin1_id >= num_masked_diags) { @@ -230,7 +268,7 @@ template internal::PixelMerger merger{heads, tails}; - SparseMatrix m(f.bins()); + SparseMatrix m{}; std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { // TODO: this filtering step is wrong when done on trans matrices, as it will // remove the first and last few pixels from trans matrices of adjacent chromosomes. 
@@ -249,7 +287,7 @@ template auto ICE::construct_sparse_matrix_chunked(const File& f, Type type, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { - SPDLOG_INFO(FMT_STRING("Reading interactions into memory...")); + SPDLOG_INFO(FMT_STRING("Writing interactions to temporary file {}..."), tmpfile); if (type == Type::cis) { return construct_sparse_matrix_chunked_cis(f, num_masked_diags, tmpfile, chunk_size); } @@ -260,7 +298,7 @@ template inline auto ICE::construct_sparse_matrix_chunked_gw(const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { - SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + SparseMatrixChunked m(tmpfile, chunk_size); const auto sel = f.fetch(); std::for_each(sel.template begin(), sel.template end(), [&](const auto& p) { @@ -273,12 +311,32 @@ inline auto ICE::construct_sparse_matrix_chunked_gw(const File& f, std::size_t n return m; } +template +inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, const Chromosome& chrom, + std::size_t bin_offset, + std::size_t num_masked_diags, + const std::filesystem::path& tmpfile, + std::size_t chunk_size) + -> SparseMatrixChunked { + SparseMatrixChunked m(tmpfile, chunk_size); + + const auto sel = f.fetch(chrom.name()); + std::for_each(sel.template begin(), sel.template end(), + [&](const ThinPixel& p) { + if (p.bin2_id - p.bin1_id >= num_masked_diags) { + m.push_back(p.bin1_id, p.bin2_id, p.count, bin_offset); + } + }); + m.finalize(); + return m; +} + template inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, std::size_t num_masked_diags, const std::filesystem::path& tmpfile, std::size_t chunk_size) -> SparseMatrixChunked { - SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + SparseMatrixChunked m(tmpfile, chunk_size); for (const Chromosome& chrom : f.chromosomes()) { if (chrom.is_all()) { @@ -329,7 +387,7 @@ inline auto 
ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_ internal::PixelMerger merger{heads, tails}; - SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); + SparseMatrixChunked m(tmpfile, chunk_size); std::for_each(merger.begin(), merger.end(), [&](const ThinPixel& p) { // TODO: this filtering step is wrong when done on trans matrices, as it will // remove the first and last few pixels from trans matrices of adjacent chromosomes. @@ -344,11 +402,12 @@ inline auto ICE::construct_sparse_matrix_chunked_trans(const File& f, std::size_ } template -inline void ICE::min_nnz_filtering(const MatrixT& matrix, nonstd::span biases, - std::size_t min_nnz) { - const auto& marg = matrix.marginalize_nnz(); +inline void ICE::min_nnz_filtering(MargsVector& marg, const MatrixT& matrix, + nonstd::span biases, std::size_t min_nnz, + BS::thread_pool* tpool) { + matrix.marginalize_nnz(marg, tpool); for (std::size_t i = 0; i < biases.size(); ++i) { - if (marg[i] < static_cast(min_nnz)) { + if (marg()[i] < static_cast(min_nnz)) { biases[i] = 0; } } @@ -438,20 +497,16 @@ inline void ICE::mad_max_filtering(nonstd::span chrom_offsets } template -inline auto ICE::inner_loop(const MatrixT& matrix, nonstd::span biases, - nonstd::span weights) -> Result { +inline auto ICE::inner_loop(const MatrixT& matrix, nonstd::span biases, MargsVector& marg, + nonstd::span weights, BS::thread_pool* tpool) -> Result { if (matrix.empty()) { std::fill(biases.begin(), biases.end(), std::numeric_limits::quiet_NaN()); return {std::numeric_limits::quiet_NaN(), 0.0}; } - const auto& marg = matrix.times_outer_product_marg(biases, weights); - double marg_sum = 0.0; - std::size_t nnz_marg{}; - for (const auto& n : marg) { - marg_sum += n; - nnz_marg += n != 0; - } + marg.resize(biases.size()); + matrix.times_outer_product_marg(marg, biases, weights, tpool); + const auto [marg_sum, nnz_marg] = aggregate_marg(marg(), tpool); if (nnz_marg == 0) { std::fill(biases.begin(), biases.end(), 
std::numeric_limits::quiet_NaN()); @@ -459,46 +514,120 @@ inline auto ICE::inner_loop(const MatrixT& matrix, nonstd::span biases, } const auto avg_nzmarg = (marg_sum / static_cast(nnz_marg)); - for (std::size_t i = 0; i < biases.size(); ++i) { - const auto n = marg[i] / avg_nzmarg; - if (n != 0) { - biases[i] /= n; + update_biases(marg(), biases, avg_nzmarg, tpool); + + const auto ssq_nzmarg = compute_ssq_nzmarg(marg(), avg_nzmarg, tpool); + const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); + + return {avg_nzmarg, var_nzmarg}; +} + +inline std::pair ICE::aggregate_marg(nonstd::span marg, + BS::thread_pool* tpool) { + double marg_sum = 0.0; + std::size_t nnz_marg{}; + + std::mutex mtx{}; + + auto aggregate_marg_impl = [&](std::size_t istart, std::size_t iend) { + double marg_sum_ = 0.0; + std::size_t nnz_marg_{}; + + for (auto i = istart; i < iend; ++i) { + marg_sum_ += marg[i]; + nnz_marg_ += marg[i] != 0; + } + + [[maybe_unused]] const std::scoped_lock lck(mtx); + marg_sum += marg_sum_; + nnz_marg += nnz_marg_; + }; + + if (marg.size() < 10'000 || !tpool) { + aggregate_marg_impl(0, marg.size()); + return std::make_pair(marg_sum, nnz_marg); + } + + tpool->push_loop(0, marg.size(), aggregate_marg_impl); + tpool->wait_for_tasks(); + + return std::make_pair(marg_sum, nnz_marg); +} + +inline void ICE::update_biases(nonstd::span marg, nonstd::span biases, + double avg_nzmarg, BS::thread_pool* tpool) { + auto update_biases_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto n = marg[i] / avg_nzmarg; + if (n != 0) { + biases[i] /= n; + } } + }; + + if (marg.size() < 10'000 || !tpool) { + return update_biases_impl(0, marg.size()); } - double ssq_nzmarg = 0.0; - for (const auto n : marg) { - if (n != 0) { - ssq_nzmarg += std::pow(n - avg_nzmarg, 2); + tpool->push_loop(0, marg.size(), update_biases_impl); + tpool->wait_for_tasks(); +} + +inline double ICE::compute_ssq_nzmarg(nonstd::span marg, double 
avg_nzmarg, + BS::thread_pool* tpool) { + std::mutex mtx{}; + double ssq_nzmarg = 0; + + auto compute_ssq_nzmarg_impl = [&](std::size_t istart, std::size_t iend) { + double ssq_nzmarg_ = 0.0; + for (auto i = istart; i < iend; ++i) { + const auto& n = marg[i]; + if (n != 0) { + ssq_nzmarg_ += std::pow(n - avg_nzmarg, 2); + } } + [[maybe_unused]] const std::scoped_lock lck(mtx); + ssq_nzmarg += ssq_nzmarg_; + }; + + if (marg.size() < 10'000 || !tpool) { + compute_ssq_nzmarg_impl(0, marg.size()); + return ssq_nzmarg; } - const auto var_nzmarg = ssq_nzmarg / static_cast(nnz_marg - 1); - return {avg_nzmarg, var_nzmarg}; + tpool->push_loop(0, marg.size(), compute_ssq_nzmarg_impl); + tpool->wait_for_tasks(); + return ssq_nzmarg; } template inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span biases, nonstd::span chrom_bin_offsets, - std::size_t min_nnz, std::size_t min_count, double mad_max) { + std::size_t min_nnz, std::size_t min_count, double mad_max, + BS::thread_pool* tpool) { + if (min_nnz == 0 && min_count == 0 && mad_max == 0) { + return; + } + SPDLOG_INFO(FMT_STRING("Initializing bias vector...")); + MargsVector marg(biases.size()); if (min_nnz != 0) { SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} nnz entries..."), min_nnz); - min_nnz_filtering(matrix, biases, min_nnz); + min_nnz_filtering(marg, matrix, biases, min_nnz, tpool); } if (min_count != 0 || mad_max != 0) { - matrix.marginalize(); + matrix.marginalize(marg, tpool); } + if (min_count != 0) { SPDLOG_INFO(FMT_STRING("Masking rows with fewer than {} interactions..."), min_count); - min_count_filtering(biases, min_count, matrix.margs()); + min_count_filtering(biases, min_count, marg()); } if (mad_max != 0) { SPDLOG_INFO(FMT_STRING("Masking rows using mad_max={}..."), mad_max); - auto margs = std::vector{matrix.margs()}; - mad_max_filtering(chrom_bin_offsets, biases, margs, mad_max); + mad_max_filtering(chrom_bin_offsets, biases, marg(), mad_max); } } diff --git 
a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 5cc41742..412b8440 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -20,94 +21,137 @@ namespace hictk::balancing { -inline SparseMatrix::SparseMatrix(const BinTable& bins, std::uint32_t chrom_id) - : _chrom_id(chrom_id == _gw_id ? 0 : chrom_id), - _chrom_offsets(bins.num_bin_prefix_sum()), - _bin1_offsets(_chrom_offsets.size(), 0), - _marg(chrom_id == _gw_id ? bins.size() : bins.subset(chrom_id).size()) {} +inline MargsVector::MargsVector(std::size_t size_) + : _margs(size_, 0), _mtxes(compute_number_of_mutexes(size_)) {} -inline bool SparseMatrix::empty() const noexcept { return size() == 0; } -inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } +inline MargsVector::MargsVector(const MargsVector& other) + : _margs(other._margs.begin(), other._margs.end()), _mtxes(other.size()) {} -inline void SparseMatrix::clear() noexcept { - _bin1_ids.clear(); - _bin2_ids.clear(); - _counts.clear(); -} +inline MargsVector& MargsVector::operator=(const MargsVector& other) { + if (this == &other) { + return *this; + } -inline void SparseMatrix::shrink_to_fit() noexcept { - _bin1_ids.shrink_to_fit(); - _bin2_ids.shrink_to_fit(); - _counts.shrink_to_fit(); - _chrom_offsets.shrink_to_fit(); + _margs = other._margs; + _mtxes = std::vector{other.size()}; + + return *this; } -inline void SparseMatrix::finalize() { - shrink_to_fit(); - if (_chrom_id + 1 < _bin1_offsets.size()) { - _bin1_offsets[_chrom_id + 1] = size(); - } +inline double MargsVector::operator[](std::size_t i) const noexcept { + assert(i < size()); + return _margs[i]; +} - for (std::size_t i = 1; i < _bin1_offsets.size(); ++i) { - if 
(_bin1_offsets[i] == 0) { - _bin1_offsets[i] = _bin1_offsets[i - 1]; - } - } +inline double& MargsVector::operator[](std::size_t i) noexcept { + assert(i < size()); + return _margs[i]; } -inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } -inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } -inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } -inline const std::vector& SparseMatrix::margs() const noexcept { return _marg; } -inline const std::vector& SparseMatrix::chrom_offsets() const noexcept { - return _chrom_offsets; +inline void MargsVector::add(std::size_t i, double n) noexcept { + assert(i < size()); + [[maybe_unused]] std::scoped_lock lck(_mtxes[get_mutex_idx(i)]); + _margs[i] += n; } -inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { - if (empty()) { - _chrom_id = static_cast( - std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - - _chrom_offsets.begin()); +inline const std::vector& MargsVector::operator()() const noexcept { return _margs; } +inline std::vector& MargsVector::operator()() noexcept { return _margs; } + +inline void MargsVector::fill(double n) noexcept { std::fill(_margs.begin(), _margs.end(), n); } +inline void MargsVector::resize(std::size_t size_) { + if (size_ != size()) { + _margs.resize(size_); + std::vector v(size_); + std::swap(v, _mtxes); } +} - const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; - if (!empty() && beginning_of_new_chromosome) { - _chrom_id = static_cast( - std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - 1 - - _chrom_offsets.begin()); - _bin1_offsets[_chrom_id + 1] = size(); +inline std::size_t MargsVector::size() const noexcept { return _margs.size(); } +inline bool MargsVector::empty() const noexcept { return size() == 0; } + +constexpr std::size_t 
MargsVector::compute_number_of_mutexes(std::size_t size) noexcept { + if (size == 0) { + return 0; } + const auto nthreads = static_cast(std::thread::hardware_concurrency()); + // Clamping to 2-n is needed because get_pixel_mutex_idx expects the number of + // mutexes to be a multiple of 2 + return next_pow2(std::clamp(size, std::size_t(2), 5000 * nthreads)); +} - _bin1_ids.push_back(bin1_id); - _bin2_ids.push_back(bin2_id); - _counts.push_back(count); +template +constexpr I MargsVector::next_pow2(I n) noexcept { + using ull = unsigned long long; + if constexpr (std::is_signed_v) { + assert(n >= 0); + return conditional_static_cast(next_pow2(static_cast(n))); + } else { + auto m = conditional_static_cast(n); +#ifndef __GNUC__ + // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --m; + m |= m >> 1; + m |= m >> 2; + m |= m >> 4; + m |= m >> 8; + m |= m >> 16; + m |= m >> 32; + return conditional_static_cast(m + 1); +#else + // https://jameshfisher.com/2018/03/30/round-up-power-2/ + // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html + + return conditional_static_cast( + m <= 1 ? 
m + : std::uint64_t(1) << (std::uint64_t(64) - std::uint64_t(__builtin_clzll(m - 1)))); +#endif + } } -inline SparseMatrixView SparseMatrix::subset(std::uint32_t chrom_id) const { - assert(chrom_id + 1 < chrom_offsets().size()); - const auto i0 = _bin1_offsets[chrom_id]; - const auto i1 = _bin1_offsets[chrom_id + 1]; +inline std::size_t MargsVector::get_mutex_idx(std::size_t i) const noexcept { + assert(!_mtxes.empty()); + assert(_mtxes.size() % 2 == 0); + i = XXH3_64bits(&i, sizeof(std::size_t)); + // equivalent to i % _mtxes.size() when _mtxes.size() % 2 == 0 + return i & (_mtxes.size() - 1); +} - const auto bin1_ids_ = nonstd::span(bin1_ids()).subspan(i0, i1 - i0); - const auto bin2_ids_ = nonstd::span(bin2_ids()).subspan(i0, i1 - i0); - const auto counts_ = nonstd::span(counts()).subspan(i0, i1 - i0); +inline bool SparseMatrix::empty() const noexcept { return size() == 0; } +inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } - const auto j0 = chrom_offsets()[chrom_id]; - const auto j1 = chrom_offsets()[chrom_id + 1]; +inline void SparseMatrix::clear(bool shrink_to_fit_) noexcept { + _bin1_ids.clear(); + _bin2_ids.clear(); + _counts.clear(); + if (shrink_to_fit_) { + shrink_to_fit(); + } +} - return {bin1_ids_, bin2_ids_, counts_, j0, j1 - j0}; +inline void SparseMatrix::shrink_to_fit() noexcept { + _bin1_ids.shrink_to_fit(); + _bin2_ids.shrink_to_fit(); + _counts.shrink_to_fit(); } -inline SparseMatrixView SparseMatrix::view() const { - const auto bin1_ids_ = nonstd::span(bin1_ids()); - const auto bin2_ids_ = nonstd::span(bin2_ids()); - const auto counts_ = nonstd::span(counts()); +inline void SparseMatrix::finalize() { shrink_to_fit(); } + +inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } +inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } +inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } - return {bin1_ids_, bin2_ids_, 
counts_, 0, _marg.size()}; +inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset) { + assert(bin1_id >= bin_offset); + assert(bin2_id >= bin1_id); + + _bin1_ids.push_back(bin1_id - bin_offset); + _bin2_ids.push_back(bin2_id - bin_offset); + _counts.push_back(count); } void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl) const { - const auto size_ = size(); + auto size_ = size(); fs.write(reinterpret_cast(&size_), sizeof(std::size_t)); const auto tmpbuff_size = ZSTD_compressBound(size() * sizeof(std::uint64_t)); @@ -149,12 +193,12 @@ void SparseMatrix::serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_l } void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { - std::size_t size{}; - fs.read(reinterpret_cast(&size), sizeof(std::size_t)); + std::size_t size_{}; + fs.read(reinterpret_cast(&size_), sizeof(std::size_t)); - _bin1_ids.resize(size); - _bin2_ids.resize(size); - _counts.resize(size); + _bin1_ids.resize(size_); + _bin2_ids.resize(size_); + _counts.resize(size_); std::string tmpbuff{}; std::size_t compressed_size{}; @@ -163,8 +207,8 @@ void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); std::size_t decompressed_size = ZSTD_decompressDCtx( - &ctx, reinterpret_cast(_bin1_ids.data()), size * sizeof(std::uint64_t), tmpbuff.data(), - tmpbuff.size() * sizeof(char)); + &ctx, reinterpret_cast(_bin1_ids.data()), _bin1_ids.size() * sizeof(std::uint64_t), + tmpbuff.data(), tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); } @@ -173,7 +217,7 @@ void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); decompressed_size = ZSTD_decompressDCtx(&ctx, 
reinterpret_cast(_bin2_ids.data()), - size * sizeof(std::uint64_t), tmpbuff.data(), + _bin2_ids.size() * sizeof(std::uint64_t), tmpbuff.data(), tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); @@ -182,30 +226,128 @@ void SparseMatrix::deserialize(std::fstream& fs, ZSTD_DCtx& ctx) { fs.read(reinterpret_cast(&compressed_size), sizeof(std::size_t)); tmpbuff.resize(compressed_size); fs.read(tmpbuff.data(), static_cast(tmpbuff.size() * sizeof(char))); - decompressed_size = - ZSTD_decompressDCtx(&ctx, reinterpret_cast(_counts.data()), size * sizeof(double), - tmpbuff.data(), tmpbuff.size() * sizeof(char)); + decompressed_size = ZSTD_decompressDCtx(&ctx, reinterpret_cast(_counts.data()), + _counts.size() * sizeof(double), tmpbuff.data(), + tmpbuff.size() * sizeof(char)); if (ZSTD_isError(decompressed_size)) { throw std::runtime_error(ZSTD_getErrorName(decompressed_size)); } } -inline SparseMatrixChunked::SparseMatrixChunked(const BinTable& bins, - std::filesystem::path tmp_file, +inline void SparseMatrix::marginalize(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); + } + + auto marginalize_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + + if (tpool) { + if (_counts[i] != 0) { + marg.add(i1, _counts[i]); + marg.add(i2, _counts[i]); + } + } else { + marg[i1] += _counts[i]; + marg[i2] += _counts[i]; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + marginalize_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), marginalize_impl); + tpool->wait_for_tasks(); +} + +inline void SparseMatrix::marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + if (init_buffer) { + marg.fill(0); + } + + auto marginalize_nnz_impl = [&](std::size_t istart, std::size_t 
iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + + if (tpool) { + if (_counts[i] != 0) { + marg.add(i1, _counts[i] != 0); + marg.add(i2, _counts[i] != 0); + } + } else { + marg[i1] += _counts[i] != 0; + marg[i2] += _counts[i] != 0; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + marginalize_nnz_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), marginalize_nnz_impl); + tpool->wait_for_tasks(); +} + +inline void SparseMatrix::times_outer_product_marg(MargsVector& marg, + nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool, bool init_buffer) const { + assert(biases.size() == weights.size() || weights.empty()); + marg.resize(biases.size()); + + if (init_buffer) { + marg.fill(0); + } + + auto times_outer_product_marg_impl = [&](std::size_t istart, std::size_t iend) { + for (auto i = istart; i < iend; ++i) { + const auto i1 = _bin1_ids[i]; + const auto i2 = _bin2_ids[i]; + const auto w1 = weights.empty() ? 1 : weights[i1]; + const auto w2 = weights.empty() ? 
1 : weights[i2]; + const auto count = _counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); + + if (tpool) { + if (count != 0) { + marg.add(i1, count); + marg.add(i2, count); + } + } else { + marg[i1] += count; + marg[i2] += count; + } + } + }; + + if (size() < 1'000'000 || !tpool) { + times_outer_product_marg_impl(0, size()); + return; + } + + tpool->push_loop(0, size(), times_outer_product_marg_impl); + tpool->wait_for_tasks(); +} + +inline SparseMatrixChunked::SparseMatrixChunked(std::filesystem::path tmp_file, std::size_t chunk_size, int compression_lvl) - : _matrix(bins), - _path(std::move(tmp_file)), - _marg(bins.size()), - _chrom_offsets(_matrix.chrom_offsets()), - _bin1_offsets(_chrom_offsets.size(), 0), + : _path(std::move(tmp_file)), _chunk_size(chunk_size), _compression_lvl(compression_lvl), _zstd_cctx(ZSTD_createCCtx()), _zstd_dctx(ZSTD_createDCtx()) { _fs.exceptions(std::ios::badbit); _fs.open(_path, std::ios::out); - - _chrom_index.emplace(0, std::make_pair(std::size_t{}, std::size_t{})); } inline SparseMatrixChunked::~SparseMatrixChunked() noexcept { @@ -219,272 +361,197 @@ inline SparseMatrixChunked::~SparseMatrixChunked() noexcept { inline bool SparseMatrixChunked::empty() const noexcept { return size() == 0; } inline std::size_t SparseMatrixChunked::size() const noexcept { return _size; } - -inline const std::vector& SparseMatrixChunked::margs() const noexcept { return _marg; } -inline const std::vector& SparseMatrixChunked::chrom_offsets() const noexcept { - return _chrom_offsets; +inline void SparseMatrixChunked::clear(bool shrink_to_fit_) { + _index.clear(); + _fs.close(); + std::filesystem::remove(_path); + _path = ""; + _size = 0; + _matrix.clear(shrink_to_fit_); } inline void SparseMatrixChunked::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, - double count) { - if (empty()) { - initialize_index(bin1_id); - } - - const auto beginning_of_new_chromosome = bin1_id >= _chrom_offsets[_chrom_id + 1]; - if (!empty() && 
beginning_of_new_chromosome) { - update_index(bin1_id); - } - - if (_matrix.size() == _chunk_size || beginning_of_new_chromosome) { + double count, std::size_t bin_offset) { + if (_matrix.size() == _chunk_size) { write_chunk(); } - _matrix.push_back(bin1_id, bin2_id, count); + _matrix.push_back(bin1_id, bin2_id, count, bin_offset); ++_size; } inline void SparseMatrixChunked::finalize() { - finalize_chromosome(_chrom_id); - - for (std::size_t i = 1; i < _bin1_offsets.size(); ++i) { - if (_bin1_offsets[i] == 0) { - _bin1_offsets[i] = _bin1_offsets[i - 1]; - } - } - if (!_matrix.empty()) { write_chunk(); } _fs.open(_path, std::ios::in); } -inline void SparseMatrixChunked::finalize_chromosome(std::uint32_t chrom_id) { - // Finalize current chromosome - auto [it, inserted] = - _chrom_index.try_emplace(chrom_id, std::make_pair(_index.size(), _index.size())); - if (!inserted) { - it->second.second = _index.size() + 1; - } +inline void SparseMatrixChunked::marginalize(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + auto marginalize_impl = [&](std::size_t istart, std::size_t iend) { + std::unique_ptr zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.marginalize(marg_local, nullptr, false); + } - // Initialize next chromosome - if (chrom_id + 1 < _bin1_offsets.size()) { - _bin1_offsets[chrom_id + 1] = size(); - _chrom_index.emplace(chrom_id + 1, std::make_pair(_index.size() + 1, _index.size() + 1)); - } -} + for (std::size_t i = 0; i < marg_local.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } + } + }; -inline void SparseMatrixChunked::initialize_index(std::uint64_t bin1_id) { - assert(empty()); - _chrom_id = static_cast( - 
std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - - _chrom_offsets.begin()); - for (std::uint32_t i = 0; i <= _chrom_id; ++i) { - _chrom_index.emplace(i, std::make_pair(std::size_t{}, std::size_t{})); + assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); } -} -inline void SparseMatrixChunked::update_index(std::uint64_t bin1_id) { - assert(!empty()); - finalize_chromosome(_chrom_id); - _chrom_id = static_cast( - std::upper_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - 1 - - _chrom_offsets.begin()); -} - -inline SparseMatrixChunkedView SparseMatrixChunked::view() const { - return {_path, _index, 0, _marg.size()}; -} - -inline SparseMatrixChunkedView SparseMatrixChunked::subset(std::uint32_t chrom_id) const { - auto it = _chrom_index.find(chrom_id); - if (it == _chrom_index.end()) { - return {}; + if (_index.size() == 1 || !tpool) { + marginalize_impl(0, _index.size()); + return; } - const auto& [first_offset, last_offset] = it->second; - const auto i0 = chrom_offsets()[chrom_id]; - const auto i1 = chrom_offsets()[chrom_id + 1]; - return {_path, nonstd::span(_index).subspan(first_offset, last_offset - first_offset), i0, - i1 - i0}; -} + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); -inline void SparseMatrixChunked::read_chunk(std::size_t chunk_id, SparseMatrix& buffer) { - assert(chunk_id < _index.size()); - const auto offset = _index[chunk_id]; + for (std::size_t i = 1; i < offsets.size(); ++i) { + const auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; - std::fstream fs; - _fs.exceptions(std::ios::badbit); - fs.open(_path, std::ios::in); - fs.seekg(offset); - - buffer.deserialize(fs, *_zstd_dctx); -} - -inline void SparseMatrixChunked::write_chunk() { - assert(!_matrix.empty()); - _index.push_back(_fs.tellg()); - _matrix.serialize(_fs, *_zstd_cctx, _compression_lvl); - _matrix.clear(); - _chrom_index.try_emplace(_chrom_id, std::make_pair(_index.size(), _index.size())); 
-} - -inline SparseMatrixView::SparseMatrixView(nonstd::span bin1_ids_, - nonstd::span bin2_ids_, - nonstd::span counts_, - std::size_t bin1_offset, std::size_t num_bins) - : _marg(num_bins), - _bin1_offset(bin1_offset), - bin1_ids(bin1_ids_), - bin2_ids(bin2_ids_), - counts(counts_) {} - -inline bool SparseMatrixView::empty() const noexcept { return size() == 0; } -inline std::size_t SparseMatrixView::size() const noexcept { return counts.size(); } - -inline const std::vector& SparseMatrixView::margs() const noexcept { return _marg; } - -inline const std::vector& SparseMatrixView::marginalize() const { - std::fill(_marg.begin(), _marg.end(), 0); - for (std::size_t i = 0; i < size(); ++i) { - const auto i1 = bin1_ids[i] - _bin1_offset; - const auto i2 = bin2_ids[i] - _bin1_offset; - - _marg[i1] += counts[i]; - _marg[i2] += counts[i]; + tpool->push_task(marginalize_impl, i0, i1); } - - return _marg; + tpool->wait_for_tasks(); } -inline const std::vector& SparseMatrixView::marginalize_nnz() const { - std::fill(_marg.begin(), _marg.end(), 0); - - for (std::size_t i = 0; i < counts.size(); ++i) { - const auto i1 = bin1_ids[i] - _bin1_offset; - const auto i2 = bin2_ids[i] - _bin1_offset; +inline void SparseMatrixChunked::marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool, + bool init_buffer) const { + auto marginalize_nnz_impl = [&](std::size_t istart, std::size_t iend) { + std::unique_ptr zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.marginalize_nnz(marg, nullptr, false); + } + for (std::size_t i = 0; i < marg_local.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } + } + }; - _marg[i1] += counts[i] != 0; - _marg[i2] += counts[i] != 0; + 
assert(!marg.empty()); + if (init_buffer) { + marg.fill(0); } - return _marg; -} - -inline const std::vector& SparseMatrixView::times_outer_product_marg( - nonstd::span biases, nonstd::span weights) const { - assert(biases.size() == _marg.size()); - assert(biases.size() == weights.size() || weights.empty()); - - std::fill(_marg.begin(), _marg.end(), 0); - for (std::size_t i = 0; i < size(); ++i) { - const auto i1 = bin1_ids[i] - _bin1_offset; - const auto i2 = bin2_ids[i] - _bin1_offset; - const auto w1 = weights.empty() ? 1 : weights[i1]; - const auto w2 = weights.empty() ? 1 : weights[i2]; - const auto count = counts[i] * (w1 * biases[i1]) * (w2 * biases[i2]); - - _marg[i1] += count; - _marg[i2] += count; + if (_index.size() == 1 || !tpool) { + marginalize_nnz_impl(0, _index.size()); + return; } - return _marg; -} + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); + + for (std::size_t i = 1; i < offsets.size(); ++i) { + const auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; -inline SparseMatrixChunkedView::SparseMatrixChunkedView(const std::filesystem::path& path, - nonstd::span index, - std::size_t bin1_offset, - std::size_t num_bins) - : _fs(path, std::ios::in), - _index(index.begin(), index.end()), - _marg(num_bins), - _bin1_offset(bin1_offset), - _zstd_dctx(ZSTD_createDCtx()) {} - -inline bool SparseMatrixChunkedView::empty() const noexcept { return _index.empty(); } -inline std::size_t SparseMatrixChunkedView::size() { - std::size_t size_ = 0; - - for (const auto& idx : _index) { - _fs.seekg(idx); - std::size_t chunk_size{}; - _fs.read(reinterpret_cast(&chunk_size), sizeof(std::size_t)); - size_ += chunk_size; + tpool->push_task(marginalize_nnz_impl, i0, i1); } - return size_; + tpool->wait_for_tasks(); } -inline const std::vector& SparseMatrixChunkedView::margs() const noexcept { return _marg; } - -inline const std::vector& SparseMatrixChunkedView::marginalize() const { - std::fill(_marg.begin(), _marg.end(), 0); 
- - for (const auto offset : _index) { - _fs.seekg(offset); - _matrix.deserialize(_fs, *_zstd_dctx); - - for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { - const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; - const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; - - _marg[i1] += _matrix.counts()[i]; - _marg[i2] += _matrix.counts()[i]; +inline void SparseMatrixChunked::times_outer_product_marg(MargsVector& marg, + nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool, + bool init_buffer) const { + auto times_outer_product_marg_impl = [&](std::size_t istart, std::size_t iend) { + std::unique_ptr zstd_dctx(ZSTD_createDCtx()); + std::fstream fs{}; + fs.exceptions(_fs.exceptions()); + fs.open(_path, std::ios::in); + auto matrix = _matrix; + MargsVector marg_local(marg.size()); + for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { + fs.seekg(offset); + matrix.deserialize(fs, *zstd_dctx); + matrix.times_outer_product_marg(marg_local, biases, weights, nullptr, false); } - if (_fs.peek() && _fs.eof()) { - break; + for (std::size_t i = 0; i < marg.size(); ++i) { + if (marg_local[i] != 0) { + marg.add(i, marg_local[i]); + } } - } + }; - return _marg; -} + assert(biases.size() == weights.size() || weights.empty()); + marg.resize(biases.size()); + if (init_buffer) { + marg.fill(0); + } -inline const std::vector& SparseMatrixChunkedView::marginalize_nnz() const { - std::fill(_marg.begin(), _marg.end(), 0); + if (_index.size() == 1 || !tpool) { + times_outer_product_marg_impl(0, _index.size()); + return; + } - for (const auto offset : _index) { - _fs.seekg(offset); - _matrix.deserialize(_fs, *_zstd_dctx); + const auto offsets = compute_chunk_offsets(_index.size(), tpool->get_thread_count()); - for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { - const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; - const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; + for (std::size_t i = 1; i < offsets.size(); ++i) { + const 
auto i0 = offsets[i - 1]; + const auto i1 = offsets[i]; - _marg[i1] += _matrix.counts()[i] != 0; - _marg[i2] += _matrix.counts()[i] != 0; - } - if (_fs.peek() && _fs.eof()) { - break; - } + tpool->push_task(times_outer_product_marg_impl, i0, i1); } + tpool->wait_for_tasks(); - return _marg; + return; } -inline const std::vector& SparseMatrixChunkedView::times_outer_product_marg( - nonstd::span biases, nonstd::span weights) const { - assert(biases.size() == _marg.size()); - assert(biases.size() == weights.size() || weights.empty()); - - std::fill(_marg.begin(), _marg.end(), 0); - - for (const auto offset : _index) { - _fs.seekg(offset); - _matrix.deserialize(_fs, *_zstd_dctx); - - for (std::size_t i = 0; i < _matrix.counts().size(); ++i) { - const auto i1 = _matrix.bin1_ids()[i] - _bin1_offset; - const auto i2 = _matrix.bin2_ids()[i] - _bin1_offset; - const auto w1 = weights.empty() ? 1 : weights[i1]; - const auto w2 = weights.empty() ? 1 : weights[i2]; - const auto count = _matrix.counts()[i] * (w1 * biases[i1]) * (w2 * biases[i2]); +inline void SparseMatrixChunked::write_chunk() { + assert(!_matrix.empty()); + _index.push_back(_fs.tellg()); + _matrix.finalize(); + _matrix.serialize(_fs, *_zstd_cctx, _compression_lvl); + _matrix.clear(); +} - _marg[i1] += count; - _marg[i2] += count; - } - if (_fs.peek() && _fs.eof()) { - break; +inline std::vector SparseMatrixChunked::compute_chunk_offsets(std::size_t size, + std::size_t num_chunks) { + std::vector offsets{}; + if (size < num_chunks) { + offsets.resize(size + 1, 1); + offsets.front() = 0; + } else { + const auto n = size / num_chunks; + offsets.resize(num_chunks + 1, n); + offsets.front() = 0; + auto tot = n * num_chunks; + + for (std::size_t i = 1; i < offsets.size(); ++i) { + if (tot == size) { + break; + } + offsets[i]++; + tot++; } } - return _marg; + for (std::size_t i = 1; i < offsets.size(); ++i) { + offsets[i] += offsets[i - 1]; + } + + return offsets; } } // namespace hictk::balancing diff --git 
a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index 85514f2d..2f150ddc 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -29,45 +30,75 @@ struct default_delete { namespace hictk::balancing { -class SparseMatrixView; +class MargsVector { + std::vector _margs{}; + mutable std::vector _mtxes; + + public: + MargsVector() = default; + explicit MargsVector(std::size_t size_); + + MargsVector(const MargsVector& other); + MargsVector(MargsVector&& other) noexcept = default; + + ~MargsVector() = default; + + MargsVector& operator=(const MargsVector& other); + MargsVector& operator=(MargsVector&& other) noexcept = default; + + [[nodiscard]] double operator[](std::size_t i) const noexcept; + [[nodiscard]] double& operator[](std::size_t i) noexcept; + void add(std::size_t i, double n) noexcept; + + [[nodiscard]] const std::vector& operator()() const noexcept; + [[nodiscard]] std::vector& operator()() noexcept; + + void fill(double n = 0) noexcept; + void resize(std::size_t size_); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + + private: + static constexpr std::size_t compute_number_of_mutexes(std::size_t size) noexcept; + template >> + [[nodiscard]] static constexpr I next_pow2(I n) noexcept; + [[nodiscard]] std::size_t get_mutex_idx(std::size_t i) const noexcept; +}; + class SparseMatrix { std::vector _bin1_ids{}; std::vector _bin2_ids{}; std::vector _counts{}; - std::uint32_t _chrom_id{}; // ID of the chromosome that is being procesed - std::vector _chrom_offsets{}; - std::vector _bin1_offsets{}; - mutable std::vector _marg{}; - - static constexpr auto _gw_id = std::numeric_limits::max(); - public: SparseMatrix() = default; - explicit SparseMatrix(const BinTable& 
bins, std::uint32_t chrom_id = _gw_id); [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t size() const noexcept; - void clear() noexcept; + void clear(bool shrink_to_fit_ = false) noexcept; void shrink_to_fit() noexcept; void finalize(); [[nodiscard]] const std::vector& bin1_ids() const noexcept; [[nodiscard]] const std::vector& bin2_ids() const noexcept; [[nodiscard]] const std::vector& counts() const noexcept; - [[nodiscard]] const std::vector& margs() const noexcept; - [[nodiscard]] const std::vector& chrom_offsets() const noexcept; - - void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count); - [[nodiscard]] SparseMatrixView subset(std::uint32_t chrom_id) const; - [[nodiscard]] SparseMatrixView view() const; + void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset = 0); void serialize(std::fstream& fs, ZSTD_CCtx& ctx, int compression_lvl = 3) const; void deserialize(std::fstream& fs, ZSTD_DCtx& ctx); + + void marginalize(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void times_outer_product_marg(MargsVector& marg, nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool = nullptr, bool init_buffer = true) const; }; -class SparseMatrixChunkedView; class SparseMatrixChunked { mutable SparseMatrix _matrix{}; mutable std::string _buff{}; @@ -75,16 +106,7 @@ class SparseMatrixChunked { mutable std::fstream _fs{}; std::vector _index{}; - - // chrom_id, - phmap::flat_hash_map> _chrom_index{}; - std::uint32_t _chrom_id{}; // id of the chromosome that is currently being processed; std::size_t _size{}; - - mutable std::vector _marg{}; - std::vector _chrom_offsets{}; - std::vector _bin1_offsets{}; - std::size_t _chunk_size{}; int _compression_lvl{}; @@ -93,7 +115,7 @@ class SparseMatrixChunked { public: SparseMatrixChunked() = 
default; - SparseMatrixChunked(const BinTable& bins, std::filesystem::path tmp_file, std::size_t chunk_size, + SparseMatrixChunked(std::filesystem::path tmp_file, std::size_t chunk_size, int compression_lvl = 3); SparseMatrixChunked(const SparseMatrixChunked& other) = delete; @@ -106,77 +128,24 @@ class SparseMatrixChunked { [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t size() const noexcept; + void clear(bool shrink_to_fit_ = false); - [[nodiscard]] const std::vector& margs() const noexcept; - [[nodiscard]] const std::vector& chrom_offsets() const noexcept; - - void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count); + void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, + std::size_t bin_offset = 0); void finalize(); - void finalize_chromosome(std::uint32_t chrom_id); - - void initialize_index(std::uint64_t bin1_id); - void update_index(std::uint64_t bin1_id); - - [[nodiscard]] SparseMatrixChunkedView subset(std::uint32_t chrom_id) const; - [[nodiscard]] SparseMatrixChunkedView view() const; - void read_chunk(std::size_t chunk_id, SparseMatrix& buffer); + void marginalize(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void marginalize_nnz(MargsVector& marg, BS::thread_pool* tpool = nullptr, + bool init_buffer = true) const; + void times_outer_product_marg(MargsVector& marg, nonstd::span biases, + nonstd::span weights, + BS::thread_pool* tpool = nullptr, bool init_buffer = true) const; private: void write_chunk(); -}; - -class SparseMatrixView { - mutable std::vector _marg{}; - std::size_t _bin1_offset{}; - - public: - nonstd::span bin1_ids{}; // NOLINT - nonstd::span bin2_ids{}; // NOLINT - nonstd::span counts{}; // NOLINT - - SparseMatrixView() = default; - SparseMatrixView(nonstd::span bin1_ids_, - nonstd::span bin2_ids_, nonstd::span counts_, - std::size_t bin1_offset, std::size_t num_bins); - - [[nodiscard]] bool empty() const noexcept; - [[nodiscard]] 
std::size_t size() const noexcept; - - [[nodiscard]] const std::vector& margs() const noexcept; - - const std::vector& marginalize() const; - const std::vector& marginalize_nnz() const; - const std::vector& times_outer_product_marg(nonstd::span biases, - nonstd::span weights) const; -}; - -class SparseMatrixChunkedView { - mutable SparseMatrix _matrix{}; - mutable std::string _buff{}; - mutable std::fstream _fs{}; - - std::vector _index{}; - - mutable std::vector _marg{}; - std::size_t _bin1_offset{}; - std::unique_ptr _zstd_dctx{}; - - public: - SparseMatrixChunkedView() = default; - SparseMatrixChunkedView(const std::filesystem::path& path, - nonstd::span index, std::size_t bin1_offset, - std::size_t num_bins); - - [[nodiscard]] bool empty() const noexcept; - [[nodiscard]] std::size_t size(); - - [[nodiscard]] const std::vector& margs() const noexcept; - - const std::vector& marginalize() const; - const std::vector& marginalize_nnz() const; - const std::vector& times_outer_product_marg(nonstd::span biases, - nonstd::span weights) const; + [[nodiscard]] static std::vector compute_chunk_offsets(std::size_t size, + std::size_t num_chunks); }; } // namespace hictk::balancing diff --git a/test/units/balancing/CMakeLists.txt b/test/units/balancing/CMakeLists.txt index ae8d8d96..22804fdd 100644 --- a/test/units/balancing/CMakeLists.txt +++ b/test/units/balancing/CMakeLists.txt @@ -12,54 +12,54 @@ add_executable(hictk_balancing_tests) target_sources(hictk_balancing_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/balancing_test.cpp) target_link_libraries( - hictk_balancing_tests - PRIVATE hictk_project_warnings hictk_project_options - PUBLIC hictk::balancing hictk::file) + hictk_balancing_tests + PRIVATE hictk_project_warnings hictk_project_options + PUBLIC hictk::balancing hictk::file) target_link_system_libraries( - hictk_balancing_tests - PUBLIC - Catch2::Catch2WithMain - std::filesystem) + hictk_balancing_tests + PUBLIC + Catch2::Catch2WithMain + std::filesystem) 
file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/Testing/) # automatically discover tests that are defined in catch based test files you can modify the unittests. TEST_PREFIX to # whatever you want, or use different for different binaries catch_discover_tests( - hictk_balancing_tests - TEST_SPEC - "[short]" - TEST_SUFFIX - " - SHORT" - WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} - OUTPUT_DIR - ${CMAKE_CURRENT_SOURCE_DIR}/Testing/ - EXTRA_ARGS - --success - --skip-benchmarks) + hictk_balancing_tests + TEST_SPEC + "[short]" + TEST_SUFFIX + " - SHORT" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + OUTPUT_DIR + ${CMAKE_CURRENT_SOURCE_DIR}/Testing/ + EXTRA_ARGS + --success + --skip-benchmarks) catch_discover_tests( - hictk_balancing_tests - TEST_SPEC - "[medium]" - TEST_SUFFIX - " - MEDIUM" - WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} - EXTRA_ARGS - --success - --skip-benchmarks) + hictk_balancing_tests + TEST_SPEC + "[medium]" + TEST_SUFFIX + " - MEDIUM" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + EXTRA_ARGS + --success + --skip-benchmarks) catch_discover_tests( - hictk_balancing_tests - TEST_SPEC - "[long]" - TEST_SUFFIX - " - LONG" - WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} - EXTRA_ARGS - --success - --skip-benchmarks) + hictk_balancing_tests + TEST_SPEC + "[long]" + TEST_SUFFIX + " - LONG" + WORKING_DIRECTORY + ${PROJECT_SOURCE_DIR} + EXTRA_ARGS + --success + --skip-benchmarks) diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 4c0fa465..44b809ef 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2022 Roberto Rossini +// Copyright (C) 2023 Roberto Rossini // // SPDX-License-Identifier: MIT @@ -66,13 +66,10 @@ TEST_CASE("Balancing: SparseMatrix") { {3, 0, 4}, {3, 1, 5}}; // chr2 // clang-format on - SECTION("accessors") { - CHECK(SparseMatrix{}.empty()); - CHECK(SparseMatrix{bins}.empty()); - } + SECTION("accessors") { 
CHECK(SparseMatrix{}.empty()); } SECTION("push_back") { - SparseMatrix m{bins}; + SparseMatrix m{}; for (const auto& p : pixels) { m.push_back(p.bin1_id, p.bin2_id, p.count); } @@ -83,19 +80,6 @@ TEST_CASE("Balancing: SparseMatrix") { CHECK(m.empty()); } - SECTION("subset") { - SparseMatrix m{bins}; - for (const auto& p : pixels) { - m.push_back(p.bin1_id, p.bin2_id, p.count); - } - m.finalize(); - - CHECK(m.subset(0).empty()); - CHECK(m.subset(1).size() == 3); - CHECK(m.subset(2).size() == 2); - CHECK(m.subset(3).empty()); - } - SECTION("serde") { const auto tmpfile = testdir() / "sparse_matrix_serde.bin"; std::unique_ptr zstd_cctx{ZSTD_createCCtx()}; @@ -116,11 +100,10 @@ TEST_CASE("Balancing: SparseMatrix") { compare_vectors(m1.bin1_ids(), m2.bin1_ids()); compare_vectors(m1.bin2_ids(), m2.bin2_ids()); compare_vectors(m1.counts(), m2.counts()); - compare_vectors(m1.chrom_offsets(), m2.chrom_offsets()); } SECTION("full matrix") { - SparseMatrix m1{bins}; + SparseMatrix m1{}; for (const auto& p : pixels) { m1.push_back(p.bin1_id, p.bin2_id, p.count); } @@ -130,7 +113,7 @@ TEST_CASE("Balancing: SparseMatrix") { f.open(tmpfile, std::ios::in | std::ios::out | std::ios::trunc); f.exceptions(std::ios::badbit | std::ios::failbit); - SparseMatrix m2{bins}; + SparseMatrix m2{}; m1.serialize(f, *zstd_cctx); f.seekg(std::ios::beg); m2.deserialize(f, *zstd_dctx); @@ -138,7 +121,6 @@ TEST_CASE("Balancing: SparseMatrix") { compare_vectors(m1.bin1_ids(), m2.bin1_ids()); compare_vectors(m1.bin2_ids(), m2.bin2_ids()); compare_vectors(m1.counts(), m2.counts()); - compare_vectors(m1.chrom_offsets(), m2.chrom_offsets()); } } } @@ -156,10 +138,10 @@ TEST_CASE("Balancing: SparseMatrixChunked") { // clang-format on const auto tmpfile = testdir() / "sparse_matrix_chunked.tmp"; - SECTION("accessors") { CHECK(SparseMatrixChunked{bins, tmpfile, 2, 0}.empty()); } + SECTION("accessors") { CHECK(SparseMatrixChunked{tmpfile, 2, 0}.empty()); } SECTION("push_back") { - SparseMatrixChunked m{bins, 
tmpfile, 2, 0}; + SparseMatrixChunked m{tmpfile, 2, 0}; for (const auto& p : pixels) { m.push_back(p.bin1_id, p.bin2_id, p.count); } @@ -167,19 +149,6 @@ TEST_CASE("Balancing: SparseMatrixChunked") { CHECK(m.size() == pixels.size()); } - - SECTION("subset") { - SparseMatrixChunked m{bins, tmpfile, 2, 0}; - for (const auto& p : pixels) { - m.push_back(p.bin1_id, p.bin2_id, p.count); - } - m.finalize(); - - CHECK(m.subset(0).empty()); - CHECK(m.subset(1).size() == 3); - CHECK(m.subset(2).size() == 2); - CHECK(m.subset(3).empty()); - } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) From d7a9053e71277a135110f4b2675d7a980f7fe7bf Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 15:14:09 +0200 Subject: [PATCH 19/33] Initial implementation of hictk balance --- src/hictk/CMakeLists.txt | 2 + src/hictk/balance/balance.cpp | 213 ++++++++++++++++++ src/hictk/cli/cli.cpp | 13 +- src/hictk/cli/cli_balance.cpp | 178 +++++++++++++++ src/hictk/cli/cli_convert.cpp | 2 +- src/hictk/convert/cool_to_hic.cpp | 66 +----- src/hictk/include/hictk/tools/cli.hpp | 4 + src/hictk/include/hictk/tools/common.hpp | 15 ++ src/hictk/include/hictk/tools/config.hpp | 26 +++ .../include/hictk/tools/juicer_tools.hpp | 78 +++++++ src/hictk/include/hictk/tools/tools.hpp | 1 + src/hictk/main.cpp | 2 + .../balancing/include/hictk/balancing/ice.hpp | 6 +- .../include/hictk/balancing/impl/ice_impl.hpp | 3 +- .../cooler/include/hictk/cooler/cooler.hpp | 1 + .../hictk/cooler/impl/file_accessors_impl.hpp | 10 + .../hictk/cooler/impl/file_read_impl.hpp | 10 - src/libhictk/file/include/hictk/file.hpp | 1 + .../file/include/hictk/impl/file_impl.hpp | 3 + src/libhictk/hic/include/hictk/hic.hpp | 1 + .../include/hictk/hic/impl/hic_file_impl.hpp | 8 + 21 files changed, 564 insertions(+), 79 deletions(-) create mode 100644 src/hictk/balance/balance.cpp create mode 100644 src/hictk/cli/cli_balance.cpp create mode 100644 
src/hictk/include/hictk/tools/common.hpp create mode 100644 src/hictk/include/hictk/tools/juicer_tools.hpp diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index 3033984d..e2b8fb8d 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -16,12 +16,14 @@ target_sources( hictk PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_balance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_convert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_dump.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_merge.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_validate.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cli/cli_zoomify.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/balance/balance.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/convert.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp new file mode 100644 index 00000000..197ecefc --- /dev/null +++ b/src/hictk/balance/balance.cpp @@ -0,0 +1,213 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include + +#include "hictk/balancing/ice.hpp" +#include "hictk/balancing/methods.hpp" +#include "hictk/cooler.hpp" +#include "hictk/file.hpp" +#include "hictk/hic.hpp" +#include "hictk/tools/common.hpp" +#include "hictk/tools/config.hpp" +#include "hictk/tools/juicer_tools.hpp" + +namespace hictk::tools { + +static void write_weights_hic(const hic::File& hf, const BalanceConfig& c, + const std::vector& weights) { + auto tmpfile = c.tmp_dir / std::filesystem::path{hf.name()}.filename(); + for (std::size_t i = 0; i < 1024; ++i) { + if (!std::filesystem::exists(tmpfile)) { + break; + } + + tmpfile.replace_extension(".tmp" + std::to_string(i)); + } + + if (std::filesystem::exists(tmpfile)) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to create temporary file {}"), 
tmpfile)); + } + + try { + { + const std::unique_ptr f(std::fopen(tmpfile.string().c_str(), "ae")); + if (!bool(f)) { + throw fmt::system_error(errno, FMT_STRING("cannot open file {}"), tmpfile); + } + + std::ptrdiff_t i0 = 0; + + for (const auto& chrom : hf.chromosomes()) { + if (chrom.is_all()) { + continue; + } + fmt::print(f.get(), FMT_STRING("vector\t{}\t{}\t{}\tBP\n"), c.name, chrom.name(), + hf.bin_size()); + + const auto num_bins = (chrom.size() + hf.bin_size() - 1) / hf.bin_size(); + const auto i1 = i0 + static_cast(num_bins); + std::for_each(weights.begin() + i0, weights.begin() + i1, [&](const double w) { + std::isnan(w) ? fmt::print(f.get(), FMT_COMPILE(".\n")) + : fmt::print(f.get(), FMT_COMPILE("{}\n"), 1.0 / w); + if (!bool(f)) { // NOLINT + throw fmt::system_error( + errno, FMT_STRING("an error occurred while writing weights to file {}"), tmpfile); + } + }); + + i0 = i1; + } + } + + auto jt = run_juicer_tools_add_norm(c.juicer_tools_jar, tmpfile, hf.url(), c.juicer_tools_xmx); + jt->wait(); + if (jt->exit_code() != 0) { + throw std::runtime_error( + fmt::format(FMT_STRING("juicer_tools pre failed with exit code {}"), jt->exit_code())); + } + } catch (...) 
{ + std::error_code ec{}; + std::filesystem::remove(tmpfile, ec); + } + std::filesystem::remove(tmpfile); +} + +static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, + const std::vector& weights, + const std::vector& variance, + const std::vector& scale) { + const auto& [file, grp] = cooler::parse_cooler_uri(uri); + const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name); + SPDLOG_INFO(FMT_STRING("Writing weights to {}{}..."), uri, path); + + const HighFive::File clr(file, HighFive::File::ReadWrite); + + if (clr.exist(path)) { + assert(c.force); + clr.unlink(path); + } + + cooler::Dataset dset(cooler::RootGroup{clr.getGroup(grp)}, path, 0.0); + dset.append(weights); + + dset.write_attribute("cis_only", c.mode == "cis"); + dset.write_attribute("divisive_weights", false); + dset.write_attribute("ignore_diags", std::int64_t(c.masked_diags)); + dset.write_attribute("mad_max", std::int64_t(c.mad_max)); + dset.write_attribute("min_count", std::int64_t(c.min_count)); + dset.write_attribute("min_nnz", std::int64_t(c.min_nnz)); + dset.write_attribute("tol", c.tolerance); + + if (c.mode != "cis") { + dset.write_attribute("converged", variance.front() < c.tolerance); + dset.write_attribute("scale", scale.front()); + dset.write_attribute("var", variance.front()); + } else { + std::vector converged{}; + for (const auto& var : variance) { + converged.push_back(var < c.tolerance); + } + dset.write_attribute("converged", converged); + dset.write_attribute("scale", scale); + dset.write_attribute("var", variance); + } +} + +static int balance_singleres_file(File&& f, const BalanceConfig& c) { + std::filesystem::path tmpfile{}; + + if (f.is_cooler()) { + const auto& ff = f.get(); + if (ff.has_weights(c.name) && !c.force) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "{}/bins/weight already exists. 
Pass --force to overwrite currently stored weights."), + ff.uri())); + } + } + + if (!c.in_memory) { + tmpfile = c.tmp_dir / std::filesystem::path{f.path()}.filename(); + for (std::size_t i = 0; i < 1024; ++i) { + if (!std::filesystem::exists(tmpfile)) { + break; + } + + tmpfile.replace_extension(".tmp" + std::to_string(i)); + } + + if (std::filesystem::exists(tmpfile)) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to create temporary file {}"), tmpfile)); + } + } + + const balancing::ICE::Params params{c.tolerance, c.max_iters, c.masked_diags, + c.min_nnz, c.min_count, c.mad_max, + tmpfile, c.chunk_size, c.threads}; + balancing::ICE::Type mode{}; + if (c.mode == "gw") { + mode = balancing::ICE::Type::gw; + } else if (c.mode == "cis") { + mode = balancing::ICE::Type::cis; + } else { + mode = balancing::ICE::Type::trans; + } + + const auto balancer = + std::visit([&](const auto& ff) { return balancing::ICE(ff, mode, params); }, f.get()); + const auto weights = balancer.get_weights(c.rescale_marginals); + + if (c.stdout_) { + std::for_each(weights.begin(), weights.end(), + [&](const auto w) { fmt::print(FMT_COMPILE("{}\n"), w); }); + return 0; + } + + if (f.is_cooler()) { + const auto uri = f.uri(); + f.get().close(); + write_weights_cooler(uri, c, weights, balancer.variance(), balancer.scale()); + return 0; + } + + write_weights_hic(f.get(), c, weights); + // TODO write weights .hic + + return 0; +} + +static int balance_multires(const BalanceConfig& c) { + const auto resolutions = cooler::MultiResFile(c.path_to_input.string()).resolutions(); + + for (const auto& res : resolutions) { + balance_singleres_file( + File(fmt::format(FMT_STRING("{}::/resolutions/{}"), c.path_to_input.string(), res)), c); + } + return 0; +} + +int balance_subcmd(const BalanceConfig& c) { + if (cooler::utils::is_multires_file(c.path_to_input.string())) { + return balance_multires(c); + } + + std::vector resolutions{}; + if (hic::utils::is_hic_file(c.path_to_input)) { + 
resolutions = hic::utils::list_resolutions(c.path_to_input); + } else { + resolutions.push_back(File(c.path_to_input.string()).bin_size()); + } + + for (const auto& res : resolutions) { + balance_singleres_file(File(c.path_to_input, res), c); + } + + return 0; +} +} // namespace hictk::tools diff --git a/src/hictk/cli/cli.cpp b/src/hictk/cli/cli.cpp index 1a5b057b..fa044d93 100644 --- a/src/hictk/cli/cli.cpp +++ b/src/hictk/cli/cli.cpp @@ -25,7 +25,9 @@ auto Cli::parse_arguments() -> Config { _cli.name(_exec_name); _cli.parse(_argc, _argv); - if (_cli.get_subcommand("convert")->parsed()) { + if (_cli.get_subcommand("balance")->parsed()) { + _subcommand = subcommand::balance; + } else if (_cli.get_subcommand("convert")->parsed()) { _subcommand = subcommand::convert; } else if (_cli.get_subcommand("dump")->parsed()) { _subcommand = subcommand::dump; @@ -69,6 +71,8 @@ int Cli::exit(const CLI::ParseError& e) const { return _cli.exit(e); } std::string_view Cli::subcommand_to_str(subcommand s) noexcept { switch (s) { + case balance: + return "balance"; case convert: return "convert"; case dump: @@ -93,6 +97,7 @@ void Cli::make_cli() { _cli.set_version_flag("-V,--version", std::string{hictk::config::version::str_long()}); _cli.require_subcommand(1); + make_balance_subcommand(); make_convert_subcommand(); make_dump_subcommand(); make_load_subcommand(); @@ -103,6 +108,9 @@ void Cli::make_cli() { void Cli::validate_args() const { switch (_subcommand) { + case balance: + validate_balance_subcommand(); + break; case convert: validate_convert_subcommand(); break; @@ -127,6 +135,9 @@ void Cli::validate_args() const { void Cli::transform_args() { switch (_subcommand) { + case balance: + transform_args_balance_subcommand(); + break; case convert: transform_args_convert_subcommand(); break; diff --git a/src/hictk/cli/cli_balance.cpp b/src/hictk/cli/cli_balance.cpp new file mode 100644 index 00000000..6120d23a --- /dev/null +++ b/src/hictk/cli/cli_balance.cpp @@ -0,0 +1,178 @@ +// 
Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include + +#include +#include +#include +#include +#include + +#include "hictk/tools/cli.hpp" +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +void Cli::make_balance_subcommand() { + auto& sc = *_cli.add_subcommand("balance", "Balance HiC matrices using ICE.") + ->fallthrough() + ->preparse_callback([this]([[maybe_unused]] std::size_t i) { + assert(_config.index() == 0); + _config = BalanceConfig{}; + }); + + _config = BalanceConfig{}; + auto& c = std::get(_config); + + // clang-format off + sc.add_option( + "input", + c.path_to_input, + "Path to the .hic, .cool or .mcool file to be balanced.") + ->check(IsValidHiCFile | IsValidCoolerFile | IsValidMultiresCoolerFile) + ->required(); + sc.add_option( + "--mode", + c.mode, + "Balance matrix using:\n" + " - genome-wide interactions (gw)\n" + " - trans-only interactions (trans)\n" + " - cis-only interactions (cis)") + ->check(CLI::IsMember({"gw", "trans", "cis"})) + ->capture_default_str(); + sc.add_option( + "--tmpdir", + c.tmp_dir, + "Path to a folder where to store temporary data.") + ->capture_default_str(); + sc.add_option( + "--ignore-diags", + c.masked_diags, + "Number of diagonals (including the main diagonal) to mask before balancing.") + ->capture_default_str(); + sc.add_option( + "--mad-max", + c.mad_max, + "Mask bins using the MAD-max filter.\n" + "bins whose log marginal sum is less than --mad-max median\n" + "absolute deviations below the median log marginal sum of\n" + "all the bins in the same chromosome.") + ->check(CLI::NonNegativeNumber) + ->capture_default_str(); + sc.add_option( + "--min-nnz", + c.min_nnz, + "Mask rows with fewer than --min-nnz non-zero entries.") + ->capture_default_str(); + sc.add_option( + "--min-count", + c.min_count, + "Mask rows with fewer than --min-count interactions.") + ->capture_default_str(); + sc.add_option( + "--tolerance", + c.tolerance, + "Threshold of the variance 
of marginals used to determine whether\n" + "the algorithm has converged.") + ->check(CLI::NonNegativeNumber) + ->capture_default_str(); + sc.add_option( + "--max-iters", + c.max_iters, + "Maximum number of iterations.") + ->check(CLI::PositiveNumber) + ->capture_default_str(); + sc.add_flag( + "--rescale-weights,!--no-rescale-weights", + c.rescale_marginals, + "Rescale weights such that rows sum approximately to 2.") + ->capture_default_str(); + sc.add_option( + "--name", + c.name, + "Name to use when writing weights to file.") + ->capture_default_str(); + sc.add_flag( + "--in-memory", + c.in_memory, + "Store all interactions in memory (greatly improves performance).") + ->capture_default_str(); + sc.add_flag( + "--stdout", + c.stdout_, + "Write balancing weights to stdout instead of writing them to the input file.") + ->capture_default_str(); + sc.add_option( + "--chunk-size", + c.chunk_size, + "Number of interactions to process at once. Ignored when using --in-memory.") + ->check(CLI::PositiveNumber) + ->capture_default_str(); + sc.add_option( + "-v,--verbosity", + c.verbosity, + "Set verbosity of output to the console.") + ->check(CLI::Range(1, 4)) + ->capture_default_str(); + sc.add_option( + "-t,--threads", + c.threads, + "Maximum number of parallel threads to spawn.") + ->check(CLI::Range(std::uint32_t(1), std::thread::hardware_concurrency())) + ->capture_default_str(); + sc.add_option( + "-l,--compression-level", + c.zstd_compression_lvl, + "Compression level used to compress temporary files using ZSTD.") + ->check(CLI::Range(0, 19)) + ->capture_default_str(); + sc.add_option( + "--juicer-tools-jar", + c.juicer_tools_jar, + "Path to juicer_tools or hic_tools JAR.") + ->check(CLI::ExistingFile); + sc.add_option( + "--juicer-tools-memory", + c.juicer_tools_xmx, + "Max heap size used by juicer_tools.") + ->default_str(fmt::format(FMT_STRING("{:.0f}MB"), double(c.juicer_tools_xmx) / 1.0e6)) + ->check(CLI::PositiveNumber) + ->transform(CLI::AsSizeValue(true)); + 
sc.add_flag( + "-f,--force", + c.force, + "Overwrite existing files and datasets (if any).") + ->capture_default_str(); + // clang-format on +} + +void Cli::validate_balance_subcommand() const { + const auto& c = std::get(_config); + std::vector errors; + + const auto juicer_tools_jar_parsed = + !_cli.get_subcommand("balance")->get_option("--juicer-tools-jar")->empty(); + if (hic::utils::is_hic_file(c.path_to_input) && !c.stdout_ && !juicer_tools_jar_parsed) { + errors.push_back("option --juicer-tools-jar is required when balancing files in .hic format."); + } + + if (!errors.empty()) { + throw std::runtime_error(fmt::format( + FMT_STRING( + "The following error(s) where encountered while validating CLI arguments:\n - {}"), + fmt::join(errors, "\n - "))); + } +} + +void Cli::transform_args_balance_subcommand() { + auto& c = std::get(_config); + + // in spdlog, high numbers correspond to low log levels + assert(c.verbosity > 0 && c.verbosity < 5); + c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; +} + +} // namespace hictk::tools diff --git a/src/hictk/cli/cli_convert.cpp b/src/hictk/cli/cli_convert.cpp index a5374d3f..df7dd01c 100644 --- a/src/hictk/cli/cli_convert.cpp +++ b/src/hictk/cli/cli_convert.cpp @@ -95,7 +95,7 @@ void Cli::make_convert_subcommand() { c.processes, "Maximum number of parallel processes to spawn.\n" "When converting from hic to cool, only two processes will be used.") - ->check(CLI::Range(2, 1024)) + ->check(CLI::Range(std::uint32_t(2), std::thread::hardware_concurrency())) ->capture_default_str(); sc.add_option( "-l,--compression-level", diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index 99534a0f..19d7ec9f 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -16,62 +16,16 @@ #include "hictk/fmt.hpp" #include "hictk/tmpdir.hpp" +#include "hictk/tools/common.hpp" #include "hictk/tools/config.hpp" - -namespace std { -template <> -struct default_delete { 
- void operator()(FILE* file) const { std::fclose(file); } // NOLINT -}; -} // namespace std +#include "hictk/tools/juicer_tools.hpp" namespace hictk::tools { -[[nodiscard]] static std::filesystem::path find_java() { - auto java = boost::process::search_path("java"); - if (java.empty()) { - throw std::runtime_error("unable to find java in your PATH"); - } - return java.string(); -} - [[maybe_unused]] [[nodiscard]] static std::filesystem::path find_pigz() { return boost::process::search_path("pigz").string(); } -[[nodiscard]] static std::vector generate_juicer_tools_pre_args( - const ConvertConfig& c, const std::filesystem::path& path_to_pixels, - const std::filesystem::path& path_to_chrom_sizes, std::size_t processes) { - assert(processes != 0); - return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), - "-jar", - c.juicer_tools_jar.string(), - "pre", - "-j", - fmt::to_string(processes), - "-t", - c.tmp_dir.string(), - "-n", - "-r", - fmt::format(FMT_STRING("{}"), fmt::join(c.resolutions, ",")), - path_to_pixels.string(), - c.path_to_output.string(), - path_to_chrom_sizes.string()}; -} - -[[nodiscard]] static std::vector generate_juicer_tools_add_norm_args( - const ConvertConfig& c, const std::filesystem::path& path_to_weights, std::size_t processes) { - assert(processes != 0); - return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), - "-jar", - c.juicer_tools_jar.string(), - "addNorm", - "-j", - fmt::to_string(processes), - c.path_to_output.string(), - path_to_weights.string()}; -} - static void dump_chrom_sizes(const cooler::File& clr, const std::filesystem::path& dest) { SPDLOG_INFO(FMT_STRING("writing chromosomes to file {}..."), dest); const std::unique_ptr f(std::fopen(dest.string().c_str(), "we")); @@ -296,19 +250,6 @@ static bool dump_weights(const ConvertConfig& c, const std::filesystem::path& we return cooler_has_weights; } -[[nodiscard]] static std::unique_ptr run_juicer_tools_pre( - const ConvertConfig& c, const 
std::filesystem::path& chrom_sizes, - const std::filesystem::path& pixels, std::size_t processes) { - const auto cmd = generate_juicer_tools_pre_args(c, pixels, chrom_sizes, processes); - return std::make_unique(find_java().string(), cmd); -} - -[[nodiscard]] static std::unique_ptr run_juicer_tools_add_norm( - const ConvertConfig& c, const std::filesystem::path& path_to_weights, std::size_t processes) { - const auto cmd = generate_juicer_tools_add_norm_args(c, path_to_weights, processes); - return std::make_unique(find_java().string(), cmd); -} - void cool_to_hic(const ConvertConfig& c) { static const internal::TmpDir tmpdir{}; @@ -365,7 +306,8 @@ void cool_to_hic(const ConvertConfig& c) { if (weight_file_has_data) { t1 = std::chrono::steady_clock::now(); SPDLOG_INFO(FMT_STRING("running juicer_tools addNorm...")); - process = run_juicer_tools_add_norm(c, weights, c.processes); + process = run_juicer_tools_add_norm(c.juicer_tools_jar, weights, c.path_to_output, + c.juicer_tools_xmx); process->wait(); if (process->exit_code() != 0) { throw std::runtime_error(fmt::format( diff --git a/src/hictk/include/hictk/tools/cli.hpp b/src/hictk/include/hictk/tools/cli.hpp index 6170ead7..f0823632 100644 --- a/src/hictk/include/hictk/tools/cli.hpp +++ b/src/hictk/include/hictk/tools/cli.hpp @@ -192,6 +192,7 @@ class Cli { public: enum subcommand { help, + balance, convert, dump, load, @@ -216,6 +217,7 @@ class Cli { CLI::App _cli{}; subcommand _subcommand{subcommand::help}; + void make_balance_subcommand(); void make_convert_subcommand(); void make_dump_subcommand(); void make_load_subcommand(); @@ -224,6 +226,7 @@ class Cli { void make_zoomify_subcommand(); void make_cli(); + void validate_balance_subcommand() const; void validate_convert_subcommand() const; void validate_dump_subcommand() const; void validate_load_subcommand() const; @@ -231,6 +234,7 @@ class Cli { void validate_zoomify_subcommand() const; void validate_args() const; + void transform_args_balance_subcommand(); 
void transform_args_convert_subcommand(); void transform_args_dump_subcommand(); void transform_args_load_subcommand(); diff --git a/src/hictk/include/hictk/tools/common.hpp b/src/hictk/include/hictk/tools/common.hpp new file mode 100644 index 00000000..b161df1f --- /dev/null +++ b/src/hictk/include/hictk/tools/common.hpp @@ -0,0 +1,15 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +namespace std { +template <> +struct default_delete { + void operator()(FILE* file) const { std::fclose(file); } // NOLINT +}; +} // namespace std diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index a02876d9..3fa92dc5 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -14,6 +14,31 @@ namespace hictk::tools { +struct BalanceConfig { + std::filesystem::path path_to_input{}; + std::filesystem::path tmp_dir{std::filesystem::temp_directory_path()}; + std::filesystem::path juicer_tools_jar{}; + + std::string mode{"gw"}; + std::size_t masked_diags{2}; + double mad_max{5.0}; + std::size_t min_nnz{10}; + std::size_t min_count{0}; + double tolerance{1.0e-5}; + std::size_t max_iters{500}; + bool rescale_marginals{true}; + std::string name{"weight"}; + bool in_memory{true}; + bool stdout_{false}; + std::uint8_t zstd_compression_lvl{3}; + std::size_t threads{1}; + std::size_t chunk_size{10'000'000}; + std::size_t juicer_tools_xmx{256'000'000}; + + std::uint8_t verbosity{4}; + bool force{false}; +}; + struct ConvertConfig { std::filesystem::path path_to_input{}; std::filesystem::path path_to_output{}; @@ -104,6 +129,7 @@ struct ZoomifyConfig { // clang-format off using Config = std::variant +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/tools/config.hpp" + +namespace hictk::tools { + +[[nodiscard]] inline 
std::filesystem::path find_java() { + auto java = boost::process::search_path("java"); + if (java.empty()) { + throw std::runtime_error("unable to find java in your PATH"); + } + return java.string(); +} + +[[nodiscard]] inline std::vector generate_juicer_tools_pre_args( + const ConvertConfig& c, const std::filesystem::path& path_to_pixels, + const std::filesystem::path& path_to_chrom_sizes, std::size_t processes) { + assert(processes != 0); + return {fmt::format(FMT_STRING("-Xmx{}M"), c.juicer_tools_xmx / 1'000'000), + "-jar", + c.juicer_tools_jar.string(), + "pre", + "-j", + fmt::to_string(processes), + "-t", + c.tmp_dir.string(), + "-n", + "-r", + fmt::format(FMT_STRING("{}"), fmt::join(c.resolutions, ",")), + path_to_pixels.string(), + c.path_to_output.string(), + path_to_chrom_sizes.string()}; +} + +[[nodiscard]] inline std::vector generate_juicer_tools_add_norm_args( + const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, + const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { + return {fmt::format(FMT_STRING("-Xmx{}M"), juicer_tools_xmx / 1'000'000), + "-jar", + juicer_tools_jar.string(), + "addNorm", + "-j", + "1", + path_to_output.string(), + path_to_weights.string()}; +} + +[[nodiscard]] inline std::unique_ptr run_juicer_tools_pre( + const ConvertConfig& c, const std::filesystem::path& chrom_sizes, + const std::filesystem::path& pixels, std::size_t processes) { + const auto cmd = generate_juicer_tools_pre_args(c, pixels, chrom_sizes, processes); + return std::make_unique(find_java().string(), cmd); +} + +[[nodiscard]] inline std::unique_ptr run_juicer_tools_add_norm( + const std::filesystem::path& juicer_tools_jar, const std::filesystem::path& path_to_weights, + const std::filesystem::path& path_to_output, std::size_t juicer_tools_xmx) { + const auto cmd = generate_juicer_tools_add_norm_args(juicer_tools_jar, path_to_weights, + path_to_output, juicer_tools_xmx); + return 
std::make_unique(find_java().string(), cmd); +} + +} // namespace hictk::tools diff --git a/src/hictk/include/hictk/tools/tools.hpp b/src/hictk/include/hictk/tools/tools.hpp index 062203a3..3dae7308 100644 --- a/src/hictk/include/hictk/tools/tools.hpp +++ b/src/hictk/include/hictk/tools/tools.hpp @@ -8,6 +8,7 @@ namespace hictk::tools { +[[nodiscard]] int balance_subcmd(const BalanceConfig& c); [[nodiscard]] int convert_subcmd(const ConvertConfig& c); [[nodiscard]] int dump_subcmd(const DumpConfig& c); [[nodiscard]] int load_subcmd(const LoadConfig& c); diff --git a/src/hictk/main.cpp b/src/hictk/main.cpp index 19673e20..e27103a9 100644 --- a/src/hictk/main.cpp +++ b/src/hictk/main.cpp @@ -99,6 +99,8 @@ int main(int argc, char** argv) noexcept { } using sc = Cli::subcommand; switch (subcmd) { + case sc::balance: + return balance_subcmd(std::get(config)); case sc::convert: return convert_subcmd(std::get(config)); case sc::dump: diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 91739ae9..ce492da6 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -28,7 +28,7 @@ class ICE { enum Type { cis, trans, gw }; struct Params { - double tol{1.0e-6}; + double tol{1.0e-5}; std::size_t max_iters{200}; std::size_t num_masked_diags{2}; std::size_t min_nnz{10}; @@ -39,8 +39,8 @@ class ICE { std::size_t threads{1}; }; - inline static const Params DefaultParams{1.0e-6, 200, 2, 10, 0, - 5.0, "", 10'000'000, 1}; // NOLINT + // NOLINTNEXTLINE + inline static const Params DefaultParams{1.0e-5, 200, 2, 10, 0, 5.0, "", 10'000'000, 1}; template explicit ICE(const File& f, Type type = Type::gw, const Params& params = DefaultParams); diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index 5b0f7a37..db036cb4 100644 --- 
a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -76,8 +76,7 @@ inline void ICE::balance_chunked(const File& f, Type type, double tol, std::size std::size_t min_count, double mad_max, const std::filesystem::path& tmpfile, std::size_t chunk_size, BS::thread_pool* tpool) { - auto matrix = - construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); + auto matrix = construct_sparse_matrix_chunked(f, type, num_masked_diags, tmpfile, chunk_size); initialize_biases(matrix, _biases, _chrom_offsets, min_nnz, min_count, mad_max, tpool); diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 3ddaa263..0c06ecdf 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -248,6 +248,7 @@ class File { balancing::Weights::Type type, bool rescale = false) const; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; [[nodiscard]] bool has_normalization(const balancing::Method &normalization) const; std::shared_ptr read_weights(const balancing::Method &normalization, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp index d0ca7987..00f1a175 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_accessors_impl.hpp @@ -87,6 +87,16 @@ inline auto File::dataset(std::string_view dataset_name) const -> const Dataset } } +inline bool File::has_normalization(const balancing::Method &normalization) const { + const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), + normalization.to_string()); + if (_weights.contains(dset_path)) { + return true; 
+ } + + return _root_group().exist(dset_path); +} + inline std::vector File::avail_normalizations() const { const phmap::flat_hash_set bin_table_dsets{"chrom", "start", "end"}; diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index 819bb657..07cc0a4b 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -239,16 +239,6 @@ inline std::shared_ptr File::read_weights(std::string_ return read_weights(balancing::Method{normalization}, type, rescale); } -inline bool File::has_normalization(const balancing::Method &normalization) const { - const auto dset_path = fmt::format(FMT_STRING("{}/{}"), _groups.at("bins").group.getPath(), - normalization.to_string()); - if (_weights.contains(dset_path)) { - return true; - } - - return _root_group().exist(dset_path); -} - inline std::shared_ptr File::read_weights( const balancing::Method &normalization, bool rescale) const { if (normalization == "NONE") { diff --git a/src/libhictk/file/include/hictk/file.hpp b/src/libhictk/file/include/hictk/file.hpp index c8e05983..c162cff7 100644 --- a/src/libhictk/file/include/hictk/file.hpp +++ b/src/libhictk/file/include/hictk/file.hpp @@ -132,6 +132,7 @@ class File { std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, const balancing::Method &normalization = balancing::Method::NONE()) const; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; template diff --git a/src/libhictk/file/include/hictk/impl/file_impl.hpp b/src/libhictk/file/include/hictk/impl/file_impl.hpp index e8adf527..cb617891 100644 --- a/src/libhictk/file/include/hictk/impl/file_impl.hpp +++ b/src/libhictk/file/include/hictk/impl/file_impl.hpp @@ -277,6 +277,9 @@ inline PixelSelector File::fetch(std::string_view chrom1_name, std::uint32_t 
sta _fp); } +inline bool File::has_normalization(std::string_view normalization) const { + return std::visit([&](const auto& fp) { return fp.has_normalization(normalization); }, _fp); +} inline std::vector File::avail_normalizations() const { return std::visit([](const auto& fp) { return fp.avail_normalizations(); }, _fp); } diff --git a/src/libhictk/hic/include/hictk/hic.hpp b/src/libhictk/hic/include/hictk/hic.hpp index 42f2689f..62023ab1 100644 --- a/src/libhictk/hic/include/hictk/hic.hpp +++ b/src/libhictk/hic/include/hictk/hic.hpp @@ -56,6 +56,7 @@ class File { [[nodiscard]] std::uint64_t nchroms() const; [[nodiscard]] const std::string &assembly() const noexcept; [[nodiscard]] const std::vector &avail_resolutions() const noexcept; + [[nodiscard]] bool has_normalization(std::string_view normalization) const; [[nodiscard]] std::vector avail_normalizations() const; [[nodiscard]] std::uint32_t resolution() const noexcept; diff --git a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp index ef8128b4..94f072b2 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/hic_file_impl.hpp @@ -87,6 +87,14 @@ inline const std::vector& File::avail_resolutions() const noexcep return _fs->header().resolutions; } +inline bool File::has_normalization(std::string_view normalization) const { + const auto normalizations = avail_normalizations(); + const auto it = std::find_if(normalizations.begin(), normalizations.end(), + [&](const auto& norm) { return norm.to_string() == normalization; }); + + return it != normalizations.end(); +} + inline std::vector File::avail_normalizations() const { return _fs->list_avail_normalizations(_type, _unit, _bins->bin_size()); } From a48e37ac028fe197850ddac88bc04040850fbb6e Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 18:29:44 +0200 Subject: [PATCH 
20/33] Bugfix --- src/hictk/balance/balance.cpp | 18 +++++++----------- src/hictk/cli/cli_balance.cpp | 3 ++- src/hictk/convert/cool_to_hic.cpp | 2 +- .../cooler/include/hictk/cooler/cooler.hpp | 1 - .../hictk/cooler/impl/file_read_impl.hpp | 2 +- test/units/cooler/file_weights_test.cpp | 6 +++--- 6 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp index 197ecefc..e58b57c6 100644 --- a/src/hictk/balance/balance.cpp +++ b/src/hictk/balance/balance.cpp @@ -83,7 +83,7 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, const std::vector& scale) { const auto& [file, grp] = cooler::parse_cooler_uri(uri); const auto path = fmt::format(FMT_STRING("{}/bins/{}"), grp, c.name); - SPDLOG_INFO(FMT_STRING("Writing weights to {}{}..."), uri, path); + SPDLOG_INFO(FMT_STRING("Writing weights to {}::{}..."), uri, path); const HighFive::File clr(file, HighFive::File::ReadWrite); @@ -110,7 +110,7 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, } else { std::vector converged{}; for (const auto& var : variance) { - converged.push_back(var < c.tolerance); + converged.push_back(var < c.tolerance); // NOLINT } dset.write_attribute("converged", converged); dset.write_attribute("scale", scale); @@ -121,14 +121,11 @@ static void write_weights_cooler(std::string_view uri, const BalanceConfig& c, static int balance_singleres_file(File&& f, const BalanceConfig& c) { std::filesystem::path tmpfile{}; - if (f.is_cooler()) { - const auto& ff = f.get(); - if (ff.has_weights(c.name) && !c.force) { - throw std::runtime_error(fmt::format( - FMT_STRING( - "{}/bins/weight already exists. Pass --force to overwrite currently stored weights."), - ff.uri())); - } + if (!c.force && !c.stdout_ && f.has_normalization(c.name)) { + throw std::runtime_error( + fmt::format(FMT_STRING("Normalization weights for \"{}\" already exist in file {}. 
Pass " + "--force to overwrite existing weights."), + c.name, f.path())); } if (!c.in_memory) { @@ -177,7 +174,6 @@ static int balance_singleres_file(File&& f, const BalanceConfig& c) { } write_weights_hic(f.get(), c, weights); - // TODO write weights .hic return 0; } diff --git a/src/hictk/cli/cli_balance.cpp b/src/hictk/cli/cli_balance.cpp index 6120d23a..7015b4f2 100644 --- a/src/hictk/cli/cli_balance.cpp +++ b/src/hictk/cli/cli_balance.cpp @@ -156,7 +156,8 @@ void Cli::validate_balance_subcommand() const { const auto juicer_tools_jar_parsed = !_cli.get_subcommand("balance")->get_option("--juicer-tools-jar")->empty(); if (hic::utils::is_hic_file(c.path_to_input) && !c.stdout_ && !juicer_tools_jar_parsed) { - errors.push_back("option --juicer-tools-jar is required when balancing files in .hic format."); + errors.emplace_back( + "option --juicer-tools-jar is required when balancing files in .hic format."); } if (!errors.empty()) { diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index 19d7ec9f..97b41763 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -204,7 +204,7 @@ static bool dump_weights(std::uint32_t resolution, std::string_view cooler_uri, const cooler::File clr(cooler_uri); assert(clr.bin_size() == resolution); - if (!clr.has_weights("weight")) { + if (!clr.has_normalization("weight")) { SPDLOG_WARN(FMT_STRING("[{}] unable to read weights from \"{}\"..."), resolution, cooler_uri); return false; } diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 0c06ecdf..bea4d22a 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -241,7 +241,6 @@ class File { std::uint64_t first_bin1, std::uint64_t last_bin1, std::uint64_t first_bin2, std::uint64_t last_bin2, std::shared_ptr weights = nullptr) const; - bool has_weights(std::string_view normalization) 
const; std::shared_ptr read_weights(std::string_view normalization, bool rescale = false) const; std::shared_ptr read_weights(std::string_view normalization, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index 07cc0a4b..e27e2d2a 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -226,7 +226,7 @@ inline PixelSelector File::fetch(PixelCoordinates coord1, PixelCoordinates coord // clang-format on } -inline bool File::has_weights(std::string_view normalization) const { +inline bool File::has_normalization(std::string_view normalization) const { return has_normalization(balancing::Method{normalization}); } inline std::shared_ptr File::read_weights(std::string_view normalization, diff --git a/test/units/cooler/file_weights_test.cpp b/test/units/cooler/file_weights_test.cpp index 5218c0a8..c5e5e4c6 100644 --- a/test/units/cooler/file_weights_test.cpp +++ b/test/units/cooler/file_weights_test.cpp @@ -22,8 +22,8 @@ TEST_CASE("Cooler: read weights", "[cooler][short]") { SECTION("wo/ weights") { CHECK(clr1.avail_normalizations().empty()); } SECTION("w/ weights") { CHECK(clr2.avail_normalizations().size() == 6); - CHECK(clr2.has_weights("SCALE")); - CHECK(!clr2.has_weights("FOOBAR")); + CHECK(clr2.has_normalization("SCALE")); + CHECK(!clr2.has_normalization("FOOBAR")); CHECK(clr2.read_weights("SCALE")->type() == hictk::balancing::Weights::Type::DIVISIVE); } @@ -38,7 +38,7 @@ TEST_CASE("Cooler: write weights", "[cooler][short]") { std::filesystem::remove(path2); std::filesystem::remove(path3); std::filesystem::copy(path1, path2); - REQUIRE_FALSE(File(path2.string()).has_weights("weight")); + REQUIRE_FALSE(File(path2.string()).has_normalization("weight")); const auto num_bins = File(path1.string()).bins().size(); From c55ba8100bd4b330ab0a4235e511dec22b9fe266 Mon Sep 17 00:00:00 2001 
From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 18:32:21 +0200 Subject: [PATCH 21/33] Add integration tests for hictk balance --- .github/workflows/codecov.yml | 1 + .github/workflows/macos-ci.yml | 4 + .github/workflows/ubuntu-ci.yml | 4 + test/scripts/hictk_balance.sh | 147 ++++++++++++++++++++++++++++++++ 4 files changed, 156 insertions(+) create mode 100755 test/scripts/hictk_balance.sh diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index e4ef6830..e6ec535e 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -190,6 +190,7 @@ jobs: - name: Run integration tests run: | + test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index baf850eb..5c8a0857 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -362,6 +362,10 @@ jobs: zstd -dcf binaries.tar.zst | tar -xf - tar -xf test/data/hictk_test_data.tar.xz + - name: Test hictk balance + run: | + test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + - name: Test hictk dump chroms run: | test/scripts/hictk_dump_chroms.sh bin/hictk diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index 19bf42eb..17953490 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -415,6 +415,10 @@ jobs: zstd -dcf binaries.tar.zst | tar -xf - tar -xf test/data/hictk_test_data.tar.xz + - name: Test hictk balance + run: | + test/scripts/hictk_balance.sh bin/hictk hic_tools.jar + - name: Test hictk dump chroms run: | test/scripts/hictk_dump_chroms.sh bin/hictk diff --git a/test/scripts/hictk_balance.sh b/test/scripts/hictk_balance.sh new file mode 100755 index 00000000..7a09df60 --- /dev/null +++ 
b/test/scripts/hictk_balance.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +set -e +set -o pipefail +set -u + +echo "##################################" +echo "#### hictk balance ####" + +# readlink -f is not available on macos... +function readlink_py { + set -eu + python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" +} + +function check_files_exist { + set -eu + status=0 + for f in "$@"; do + if [ ! -f "$f" ]; then + 2>&1 echo "Unable to find test file \"$f\"" + status=1 + fi + done + + return "$status" +} + +function dump_interactions { + set -o pipefail + set -eu + + hictk="$1" + resolution="$3" + f="$2" + + if [[ "$f" == *.hic ]]; then + weight=WEIGHT + else + weight=weight + fi + + "$hictk" dump "$f" \ + --balance="$weight" \ + --resolution \ + "$resolution" | + cut -f 3 +} + +function absolute_error { + set -o pipefail + set -eu + + f1="$1" + f2="$2" + + # shellcheck disable=SC2016 + cmd='function abs(v) { + return v < 0 ? -v : v + } + ($2!=$1 && abs($1 - $2) > 1.0e-5) { print $0 } + ' + + # Fail if the absolute error is > 1.0e-5 + if paste "$f1" "$f2" | awk -F '\t' "$cmd" | grep . ; then + return 1 + else + return 0 + fi +} + +function compare_matrices { + set -o pipefail + set -eu + + hictk="$1" + resolution="$4" + f1="$2" + f2="$3" + + 2>&1 echo "Comparing $f1 with $f2..." 
+ if absolute_error \ + <(dump_interactions "$hictk" "$f1" "$resolution") \ + <(dump_interactions "$hictk" "$f2" "$resolution"); then + 2>&1 echo "Files are identical" + return 0 + else + 2>&1 echo "Files differ" + return 1 + fi +} + +export function readlink_py + +status=0 + +if [ $# -ne 2 ]; then + 2>&1 echo "Usage: $0 path_to_hictk juicer_tools.jar" + status=1 +fi + +hictk_bin="$1" +juicer_tools_jar="$2" + +data_dir="$(readlink_py "$(dirname "$0")/../data/")" +script_dir="$(readlink_py "$(dirname "$0")")" + +ref_cool="$data_dir/cooler/ENCFF993FGR.2500000.cool" +ref_hic="$data_dir/hic/ENCFF993FGR.hic" + +export PATH="$PATH:$script_dir" + +if ! check_files_exist "$ref_cool" "$ref_hic" "$juicer_tools_jar"; then + exit 1 +fi + +outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" +trap 'rm -rf -- "$outdir"' EXIT + +cp "$ref_cool" "$ref_hic" "$outdir" + +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc) --chunk-size=100 --mode=cis --force +if ! compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then + status=1 +fi + +"$hictk_bin" balance "$outdir/"*.hic -t $(nproc) --chunk-size=100 --mode=cis --force --juicer-tools-jar "$juicer_tools_jar" +if ! compare_matrices "$hictk_bin" "$outdir/"*.hic "$ref_cool" 2500000; then + status=1 +fi + +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc) --in-memory --mode=cis --force +if ! 
compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then + status=1 +fi + +if [ "$status" -eq 0 ]; then + printf '\n### PASS ###\n' +else + printf '\n### FAIL ###\n' +fi + +exit "$status" From a61d3099e965dba5877047f8d695b12d1cbb8520 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 18:32:31 +0200 Subject: [PATCH 22/33] Update .clang-tidy --- .clang-tidy | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index 9ec34b36..016e969d 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,9 +17,11 @@ Checks: > -cppcoreguidelines-pro-bounds-constant-array-index, -hicpp-no-array-decay, -misc-no-recursion, + -misc-use-anonymous-namespace, -modernize-use-trailing-return-type, -readability-identifier-length, - -readability-magic-numbers + -readability-magic-numbers, + -readability-static-definition-in-anonymous-namespace WarningsAsErrors: '' HeaderFilterRegex: '' FormatStyle: none From 12ab184f9c7bb249dac4418c177ecf4969939423 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 18:47:41 +0200 Subject: [PATCH 23/33] Fix typos --- .github/workflows/codecov.yml | 1 + src/hictk/include/hictk/tools/config.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index e6ec535e..fb62099e 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -191,6 +191,7 @@ jobs: - name: Run integration tests run: | test/scripts/hictk_balance.sh build/src/hictk/hictk hic_tools.jar + test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk test/scripts/hictk_dump_bins.sh build/src/hictk/hictk test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 3fa92dc5..7910e06b 100644 --- 
a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -28,7 +28,7 @@ struct BalanceConfig { std::size_t max_iters{500}; bool rescale_marginals{true}; std::string name{"weight"}; - bool in_memory{true}; + bool in_memory{false}; bool stdout_{false}; std::uint8_t zstd_compression_lvl{3}; std::size_t threads{1}; From 114a7be4858ce52e1b61d3d77c34288d4edbc4ea Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 27 Sep 2023 20:32:28 +0200 Subject: [PATCH 24/33] Fix macos builds --- src/libhictk/balancing/include/hictk/balancing/ice.hpp | 6 +++--- .../balancing/include/hictk/balancing/impl/ice_impl.hpp | 6 +++--- .../include/hictk/balancing/impl/sparse_matrix_impl.hpp | 4 ++-- .../balancing/include/hictk/balancing/sparse_matrix.hpp | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index ce492da6..021745e0 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -139,18 +139,18 @@ class ICE { static void min_count_filtering(nonstd::span biases, std::size_t min_count, nonstd::span marg); - static void mad_max_filtering(nonstd::span chrom_offsets, + static void mad_max_filtering(nonstd::span chrom_offsets, nonstd::span biases, nonstd::span marg, double mad_max); template static void initialize_biases(const MatrixT& matrix, nonstd::span biases, - nonstd::span chrom_bin_offsets, + nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max, BS::thread_pool* tpool); [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( - const BinTable& bins, nonstd::span chrom_bin_offsets); + const BinTable& bins, nonstd::span chrom_bin_offsets); }; } // namespace hictk::balancing diff --git 
a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index db036cb4..408cbead 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -421,7 +421,7 @@ inline void ICE::min_count_filtering(nonstd::span biases, std::size_t mi } } -inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, +inline void ICE::mad_max_filtering(nonstd::span chrom_offsets, nonstd::span biases, nonstd::span marg, double mad_max) { auto median = [](auto v) { @@ -601,7 +601,7 @@ inline double ICE::compute_ssq_nzmarg(nonstd::span marg, double av template inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span biases, - nonstd::span chrom_bin_offsets, + nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max, BS::thread_pool* tpool) { if (min_nnz == 0 && min_count == 0 && mad_max == 0) { @@ -631,7 +631,7 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b } inline std::vector ICE::compute_weights_from_chromosome_sizes( - const BinTable& bins, nonstd::span chrom_bin_offsets) { + const BinTable& bins, nonstd::span chrom_bin_offsets) { std::vector weights(bins.size()); for (std::uint32_t i = 1; i < chrom_bin_offsets.size(); ++i) { const auto& chrom = bins.chromosomes().at(i - 1); diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 412b8440..455ec685 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -69,7 +69,7 @@ inline void MargsVector::resize(std::size_t size_) { inline std::size_t MargsVector::size() const noexcept { return _margs.size(); } inline bool MargsVector::empty() const noexcept { return size() == 
0; } -constexpr std::size_t MargsVector::compute_number_of_mutexes(std::size_t size) noexcept { +inline std::size_t MargsVector::compute_number_of_mutexes(std::size_t size) noexcept { if (size == 0) { return 0; } @@ -80,7 +80,7 @@ constexpr std::size_t MargsVector::compute_number_of_mutexes(std::size_t size) n } template -constexpr I MargsVector::next_pow2(I n) noexcept { +inline I MargsVector::next_pow2(I n) noexcept { using ull = unsigned long long; if constexpr (std::is_signed_v) { assert(n >= 0); diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index 2f150ddc..10123271 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -60,9 +60,9 @@ class MargsVector { [[nodiscard]] bool empty() const noexcept; private: - static constexpr std::size_t compute_number_of_mutexes(std::size_t size) noexcept; + static std::size_t compute_number_of_mutexes(std::size_t size) noexcept; template >> - [[nodiscard]] static constexpr I next_pow2(I n) noexcept; + [[nodiscard]] static I next_pow2(I n) noexcept; [[nodiscard]] std::size_t get_mutex_idx(std::size_t i) const noexcept; }; From c35c9a8b65a85f0ecc9b1daeeca39bf92ad38be4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:59:03 +0200 Subject: [PATCH 25/33] Fix windows builds --- src/hictk/balance/balance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hictk/balance/balance.cpp b/src/hictk/balance/balance.cpp index e58b57c6..4b3fab75 100644 --- a/src/hictk/balance/balance.cpp +++ b/src/hictk/balance/balance.cpp @@ -201,7 +201,7 @@ int balance_subcmd(const BalanceConfig& c) { } for (const auto& res : resolutions) { - balance_singleres_file(File(c.path_to_input, res), c); + balance_singleres_file(File(c.path_to_input.string(), res), c); } return 0; From 
6a9f05018a3af66dcbd1ee4233384b0db2f8e0d0 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 12:32:17 +0200 Subject: [PATCH 26/33] Fix integration tests on macOS --- test/scripts/hictk_balance.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/scripts/hictk_balance.sh b/test/scripts/hictk_balance.sh index 7a09df60..cf253b90 100755 --- a/test/scripts/hictk_balance.sh +++ b/test/scripts/hictk_balance.sh @@ -17,6 +17,11 @@ function readlink_py { python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$1" } +function nproc_py { + set -eu + python3 -c 'import multiprocessing as mp; print(mp.cpu_count())' +} + function check_files_exist { set -eu status=0 @@ -123,17 +128,17 @@ trap 'rm -rf -- "$outdir"' EXIT cp "$ref_cool" "$ref_hic" "$outdir" -"$hictk_bin" balance "$outdir/"*.cool -t $(nproc) --chunk-size=100 --mode=cis --force +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --chunk-size=100 --mode=cis --force if ! compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then status=1 fi -"$hictk_bin" balance "$outdir/"*.hic -t $(nproc) --chunk-size=100 --mode=cis --force --juicer-tools-jar "$juicer_tools_jar" +"$hictk_bin" balance "$outdir/"*.hic -t $(nproc_py) --chunk-size=100 --mode=cis --force --juicer-tools-jar "$juicer_tools_jar" if ! compare_matrices "$hictk_bin" "$outdir/"*.hic "$ref_cool" 2500000; then status=1 fi -"$hictk_bin" balance "$outdir/"*.cool -t $(nproc) --in-memory --mode=cis --force +"$hictk_bin" balance "$outdir/"*.cool -t $(nproc_py) --in-memory --mode=cis --force if ! 
compare_matrices "$hictk_bin" "$outdir/"*.cool "$ref_cool" 2500000; then status=1 fi From 13e01af4df99889fa350bb62afff1e7efd8a4d50 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 12:39:19 +0200 Subject: [PATCH 27/33] Bugfix --- .../balancing/include/hictk/balancing/sparse_matrix.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index 10123271..af8855bb 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -67,8 +67,8 @@ class MargsVector { }; class SparseMatrix { - std::vector _bin1_ids{}; - std::vector _bin2_ids{}; + std::vector _bin1_ids{}; + std::vector _bin2_ids{}; std::vector _counts{}; public: From 35ab0e25b3f1e35b799c3d79633c18abd81bfe29 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 12:40:04 +0200 Subject: [PATCH 28/33] Update test dataset --- cmake/FetchTestDataset.cmake | 4 ++-- test/units/balancing/balancing_test.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 4d30a60d..856a3b9d 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://www.dropbox.com/s/l6rymg9mezixin6/hictk_test_data.tar.xz?dl=1 - EXPECTED_HASH SHA256=a97c3a66d25c7441154ef15c9b747e69ac1b6a5810a478a67139565ee3ea999c + DOWNLOAD https://zenodo.org/record/8386066/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=77e2b9186d9edb90a8436fc41d6c0632727df3af95acb74e9635d1c0f29f2b8d "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/test/units/balancing/balancing_test.cpp 
b/test/units/balancing/balancing_test.cpp index 44b809ef..68e01231 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -158,7 +158,7 @@ TEST_CASE("Balancing: ICE (intra)", "[balancing][short]") { std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; const auto tmpfile = testdir() / "balancing_ice_intra.tmp"; - const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.cis.txt"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.cis.txt"; for (const auto& [label, path] : files) { SECTION(label) { @@ -194,7 +194,7 @@ TEST_CASE("Balancing: ICE (inter)", "[balancing][medium]") { std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; - const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.trans.txt"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.trans.txt"; for (const auto& [label, path] : files) { SECTION(label) { @@ -230,7 +230,7 @@ TEST_CASE("Balancing: ICE (gw)", "[balancing][medium]") { std::make_pair("hic", datadir / "hic/ENCFF993FGR.hic")}; const auto tmpfile = testdir() / "balancing_ice_inter.tmp"; - const auto path_weights = datadir / "cooler/balancing/ENCFF993FGR.2500000.ICE.gw.txt"; + const auto path_weights = datadir / "balancing/ENCFF993FGR.2500000.ICE.gw.txt"; for (const auto& [label, path] : files) { SECTION(label) { From 094582a0f8806dc4fbfeaa0c8fc13cbd932a90b9 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 12:45:32 +0200 Subject: [PATCH 29/33] Fix macos builds --- .../include/hictk/balancing/impl/sparse_matrix_impl.hpp | 4 ++-- .../balancing/include/hictk/balancing/sparse_matrix.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp 
b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 455ec685..0c917dab 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -136,8 +136,8 @@ inline void SparseMatrix::shrink_to_fit() noexcept { inline void SparseMatrix::finalize() { shrink_to_fit(); } -inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } -inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } +inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } +inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index af8855bb..f9e6b669 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -80,8 +80,8 @@ class SparseMatrix { void shrink_to_fit() noexcept; void finalize(); - [[nodiscard]] const std::vector& bin1_ids() const noexcept; - [[nodiscard]] const std::vector& bin2_ids() const noexcept; + [[nodiscard]] const std::vector& bin1_ids() const noexcept; + [[nodiscard]] const std::vector& bin2_ids() const noexcept; [[nodiscard]] const std::vector& counts() const noexcept; void push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, From b7180f3aea75c0a80f0b1ccf9767b0935df4c66b Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 16:17:27 +0200 Subject: [PATCH 30/33] Fix macos tests --- src/libhictk/balancing/CMakeLists.txt | 2 +- 
test/units/balancing/CMakeLists.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/libhictk/balancing/CMakeLists.txt b/src/libhictk/balancing/CMakeLists.txt index fefed0f1..e6325e44 100644 --- a/src/libhictk/balancing/CMakeLists.txt +++ b/src/libhictk/balancing/CMakeLists.txt @@ -28,7 +28,7 @@ target_link_system_libraries( bshoshany-thread-pool::bshoshany-thread-pool nonstd::span-lite phmap - xxHash::xxhash + xxHash::xxhash "zstd::libzstd_$,shared,static>") target_compile_definitions(balancing INTERFACE span_FEATURE_MAKE_SPAN=1) diff --git a/test/units/balancing/CMakeLists.txt b/test/units/balancing/CMakeLists.txt index 22804fdd..4d9f96c7 100644 --- a/test/units/balancing/CMakeLists.txt +++ b/test/units/balancing/CMakeLists.txt @@ -22,7 +22,7 @@ target_link_system_libraries( Catch2::Catch2WithMain std::filesystem) -file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/Testing/) +file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Testing/") # automatically discover tests that are defined in catch based test files you can modify the unittests. 
TEST_PREFIX to # whatever you want, or use different for different binaries @@ -33,9 +33,9 @@ catch_discover_tests( TEST_SUFFIX " - SHORT" WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}" OUTPUT_DIR - ${CMAKE_CURRENT_SOURCE_DIR}/Testing/ + "${CMAKE_CURRENT_BINARY_DIR}/Testing/" EXTRA_ARGS --success --skip-benchmarks) @@ -47,7 +47,7 @@ catch_discover_tests( TEST_SUFFIX " - MEDIUM" WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}" EXTRA_ARGS --success --skip-benchmarks) @@ -59,7 +59,7 @@ catch_discover_tests( TEST_SUFFIX " - LONG" WORKING_DIRECTORY - ${PROJECT_SOURCE_DIR} + "${PROJECT_SOURCE_DIR}" EXTRA_ARGS --success --skip-benchmarks) From 3399bb49e923fd069f4b357a0f448dd5eef8793f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 16:54:11 +0200 Subject: [PATCH 31/33] Fix ubuntu builds --- .../include/hictk/balancing/impl/sparse_matrix_impl.hpp | 8 ++++++-- .../balancing/include/hictk/balancing/sparse_matrix.hpp | 6 ++++-- src/libhictk/common/include/hictk/common.hpp | 2 +- src/libhictk/cooler/include/hictk/cooler/cooler.hpp | 2 +- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 0c917dab..55016ba0 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -136,8 +136,12 @@ inline void SparseMatrix::shrink_to_fit() noexcept { inline void SparseMatrix::finalize() { shrink_to_fit(); } -inline const std::vector& SparseMatrix::bin1_ids() const noexcept { return _bin1_ids; } -inline const std::vector& SparseMatrix::bin2_ids() const noexcept { return _bin2_ids; } +inline const std::vector& SparseMatrix::bin1_ids() const noexcept { + return _bin1_ids; +} +inline const std::vector& 
SparseMatrix::bin2_ids() const noexcept { + return _bin2_ids; +} inline const std::vector& SparseMatrix::counts() const noexcept { return _counts; } inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count, diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index f9e6b669..f7dd0dd0 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -15,6 +15,7 @@ #include #include "hictk/bin_table.hpp" +#include "hictk/common.hpp" namespace std { template <> @@ -119,12 +120,13 @@ class SparseMatrixChunked { int compression_lvl = 3); SparseMatrixChunked(const SparseMatrixChunked& other) = delete; - SparseMatrixChunked(SparseMatrixChunked&& other) noexcept = default; + SparseMatrixChunked(SparseMatrixChunked&& other) noexcept(noexcept_move_ctor()) = default; ~SparseMatrixChunked() noexcept; SparseMatrixChunked& operator=(const SparseMatrixChunked& other) = delete; - SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept = default; + SparseMatrixChunked& operator=(SparseMatrixChunked&& other) noexcept( + noexcept_move_assignment_op()) = default; [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t size() const noexcept; diff --git a/src/libhictk/common/include/hictk/common.hpp b/src/libhictk/common/include/hictk/common.hpp index c33e77b1..b0fff824 100644 --- a/src/libhictk/common/include/hictk/common.hpp +++ b/src/libhictk/common/include/hictk/common.hpp @@ -86,7 +86,7 @@ inline constexpr std::uint8_t SENTINEL_ATTR_VALUE{255}; #endif } -[[nodiscard]] constexpr bool noexcept_move_assigment_op() noexcept { +[[nodiscard]] constexpr bool noexcept_move_assignment_op() noexcept { #if defined(__GNUC__) && defined(__clang__) return __clang_major__ > 8; #elif defined(__GNUC__) diff --git 
a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index bea4d22a..9ff256d9 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -150,7 +150,7 @@ class File { ~File() noexcept; File &operator=(const File &other) = delete; - File &operator=(File &&other) noexcept(noexcept_move_assigment_op()) = default; // NOLINT + File &operator=(File &&other) noexcept(noexcept_move_assignment_op()) = default; // NOLINT [[nodiscard]] explicit operator bool() const noexcept; From a12a1d68b126d58af08b1ca6b3b8687cd2d23153 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 19:41:15 +0200 Subject: [PATCH 32/33] Bugfix --- .../hictk/balancing/impl/sparse_matrix_impl.hpp | 10 +++++----- test/units/balancing/balancing_test.cpp | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 55016ba0..a152f6f6 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -351,7 +351,7 @@ inline SparseMatrixChunked::SparseMatrixChunked(std::filesystem::path tmp_file, _zstd_cctx(ZSTD_createCCtx()), _zstd_dctx(ZSTD_createDCtx()) { _fs.exceptions(std::ios::badbit); - _fs.open(_path, std::ios::out); + _fs.open(_path, std::ios::out | std::ios::binary); } inline SparseMatrixChunked::~SparseMatrixChunked() noexcept { @@ -388,7 +388,7 @@ inline void SparseMatrixChunked::finalize() { if (!_matrix.empty()) { write_chunk(); } - _fs.open(_path, std::ios::in); + _fs.open(_path, std::ios::in | std::ios::binary); } inline void SparseMatrixChunked::marginalize(MargsVector& marg, BS::thread_pool* tpool, @@ -397,7 +397,7 @@ inline void 
SparseMatrixChunked::marginalize(MargsVector& marg, BS::thread_pool* std::unique_ptr zstd_dctx(ZSTD_createDCtx()); std::fstream fs{}; fs.exceptions(_fs.exceptions()); - fs.open(_path, std::ios::in); + fs.open(_path, std::ios::in | std::ios::binary); auto matrix = _matrix; MargsVector marg_local(marg.size()); for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { @@ -440,7 +440,7 @@ inline void SparseMatrixChunked::marginalize_nnz(MargsVector& marg, BS::thread_p std::unique_ptr zstd_dctx(ZSTD_createDCtx()); std::fstream fs{}; fs.exceptions(_fs.exceptions()); - fs.open(_path, std::ios::in); + fs.open(_path, std::ios::in | std::ios::binary); auto matrix = _matrix; MargsVector marg_local(marg.size()); for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { @@ -484,7 +484,7 @@ inline void SparseMatrixChunked::times_outer_product_marg(MargsVector& marg, std::unique_ptr zstd_dctx(ZSTD_createDCtx()); std::fstream fs{}; fs.exceptions(_fs.exceptions()); - fs.open(_path, std::ios::in); + fs.open(_path, std::ios::in | std::ios::binary); auto matrix = _matrix; MargsVector marg_local(marg.size()); for (const auto offset : nonstd::span(_index).subspan(istart, iend - istart)) { diff --git a/test/units/balancing/balancing_test.cpp b/test/units/balancing/balancing_test.cpp index 68e01231..7c994305 100644 --- a/test/units/balancing/balancing_test.cpp +++ b/test/units/balancing/balancing_test.cpp @@ -55,15 +55,15 @@ static void compare_vectors(const std::vector& v1, const std::vector& v2) } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Balancing: SparseMatrix") { +TEST_CASE("Balancing: SparseMatrix", "[balancing][short]") { using SparseMatrix = hictk::balancing::SparseMatrix; const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, 50}; // clang-format off const std::vector> pixels{ - {1, 0, 1}, {1, 1, 2}, {2, 1, 3}, // chr1 - 
{3, 0, 4}, {3, 1, 5}}; // chr2 + {1, 1, 1}, {1, 2, 2}, {2, 2, 3}, // chr1 + {3, 3, 4}, {3, 4, 5}}; // chr2 // clang-format on SECTION("accessors") { CHECK(SparseMatrix{}.empty()); } @@ -126,15 +126,15 @@ TEST_CASE("Balancing: SparseMatrix") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Balancing: SparseMatrixChunked") { +TEST_CASE("Balancing: SparseMatrixChunked", "[balancing][short]") { using SparseMatrixChunked = hictk::balancing::SparseMatrixChunked; const BinTable bins{Reference{Chromosome{0, "chr0", 50}, Chromosome{1, "chr1", 100}, Chromosome{2, "chr2", 50}, Chromosome{3, "chr3", 50}}, 50}; // clang-format off const std::vector> pixels{ - {1, 0, 1}, {1, 1, 2}, {2, 1, 3}, // chr1 - {3, 0, 4}, {3, 1, 5}}; // chr2 + {1, 1, 1}, {1, 2, 2}, {2, 2, 3}, // chr1 + {3, 3, 4}, {3, 4, 5}}; // chr2 // clang-format on const auto tmpfile = testdir() / "sparse_matrix_chunked.tmp"; From 3b694cbb54bad1bf6bd6398d8edf995c73f75f11 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 28 Sep 2023 20:22:57 +0200 Subject: [PATCH 33/33] Fix ubuntu builds --- .../balancing/include/hictk/balancing/sparse_matrix.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp index f7dd0dd0..fde72faa 100644 --- a/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/sparse_matrix.hpp @@ -120,7 +120,11 @@ class SparseMatrixChunked { int compression_lvl = 3); SparseMatrixChunked(const SparseMatrixChunked& other) = delete; - SparseMatrixChunked(SparseMatrixChunked&& other) noexcept(noexcept_move_ctor()) = default; +#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ > 9 + SparseMatrixChunked(SparseMatrixChunked&& other) noexcept = default; +#else + SparseMatrixChunked(SparseMatrixChunked&& other) = 
default; +#endif ~SparseMatrixChunked() noexcept;