diff --git a/src/libhictk/balancing/include/hictk/balancing/ice.hpp b/src/libhictk/balancing/include/hictk/balancing/ice.hpp index 44005f6f..ce08b580 100644 --- a/src/libhictk/balancing/include/hictk/balancing/ice.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/ice.hpp @@ -125,8 +125,6 @@ class ICE { nonstd::span chrom_bin_offsets, std::size_t min_nnz, std::size_t min_count, double mad_max); - [[nodiscard]] static std::vector read_chrom_bin_offsets(const BinTable& bins); - [[nodiscard]] static std::vector compute_weights_from_chromosome_sizes( const BinTable& bins, nonstd::span chrom_bin_offsets); }; diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp index f1be680b..ff806751 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/ice_impl.hpp @@ -22,7 +22,7 @@ namespace hictk::balancing { template inline ICE::ICE(const File& f, Type type, const Params& params) - : _chrom_offsets(read_chrom_bin_offsets(f.bins())), _biases(f.bins().size(), 1.0) { + : _chrom_offsets(f.bins().num_bin_prefix_sum()), _biases(f.bins().size(), 1.0) { if (params.tmpfile.empty()) { balance_in_memory(f, type, params.tol, params.max_iters, params.num_masked_diags, params.min_nnz, params.min_count, params.mad_max); @@ -111,27 +111,29 @@ inline void ICE::balance_trans(const MatrixT& matrix, const BinTable& bins, std: } template -inline void ICE::balance_cis(const MatrixT& matrix, [[maybe_unused]] const BinTable& bins, - std::size_t max_iters, double tol) { - _variance.resize(_chrom_offsets.size() - 1, 0); - _scale.resize(_chrom_offsets.size() - 1, std::numeric_limits::quiet_NaN()); +inline void ICE::balance_cis(const MatrixT& matrix, const BinTable& bins, std::size_t max_iters, + double tol) { + _variance.resize(bins.chromosomes().size(), 0); + _scale.resize(bins.chromosomes().size(), std::numeric_limits::quiet_NaN()); std::vector margs(_biases.size()); - for (std::uint32_t chrom_id = 0; chrom_id < _chrom_offsets.size() - 1; ++chrom_id) { - const auto cis_matrix = matrix.subset(chrom_id); + for (const auto& chrom : bins.chromosomes()) { + if (chrom.is_all()) { + continue; + } + const auto cis_matrix = matrix.subset(chrom.id()); - const auto j0 = _chrom_offsets[chrom_id]; - const auto j1 = _chrom_offsets[chrom_id + 1]; + const auto j0 = _chrom_offsets[chrom.id()]; + const auto j1 = _chrom_offsets[chrom.id() + 1]; auto biases_ = nonstd::span(_biases).subspan(j0, j1 - j0); for (std::size_t k = 0; k < max_iters; ++k) { const auto res = inner_loop(cis_matrix, biases_); - SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), bins.chromosomes().at(chrom_id).name(), - k + 1, res.variance); - _variance[chrom_id] = res.variance; - _scale[chrom_id] = res.scale; + SPDLOG_INFO(FMT_STRING("[{}] iteration {}: {}"), chrom.name(), k + 1, res.variance); + _variance[chrom.id()] = res.variance; + _scale[chrom.id()] = res.scale; if (res.variance < tol) { break; @@ -177,6 +179,9 @@ template SparseMatrix m(f.bins()); for (const Chromosome& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } const auto sel = f.fetch(chrom.name()); std::for_each(sel.template begin(), sel.template end(), [&](const ThinPixel& p) { @@ -282,6 +287,9 @@ inline auto ICE::construct_sparse_matrix_chunked_cis(const File& f, std::size_t SparseMatrixChunked m(f.bins(), tmpfile, chunk_size); for (const Chromosome& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } const auto sel = f.fetch(chrom.name()); std::for_each(sel.template begin(), sel.template end(), [&](const ThinPixel& p) { @@ -496,16 +504,6 @@ inline void ICE::initialize_biases(const MatrixT& matrix, nonstd::span b } } -inline std::vector ICE::read_chrom_bin_offsets(const BinTable& bins) { - std::vector buff{0}; - for (const Chromosome& chrom : bins.chromosomes()) { - const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - buff.push_back(buff.back() + nbins); - } - - return buff; -} - inline std::vector ICE::compute_weights_from_chromosome_sizes( const BinTable& bins, nonstd::span chrom_bin_offsets) { std::vector weights(bins.size()); @@ -528,12 +526,12 @@ inline std::vector ICE::compute_weights_from_chromosome_sizes( return weights; } -inline std::vector ICE::get_weights([[maybe_unused]] bool rescale) const { - std::vector biases(_biases.size()); +inline std::vector ICE::get_weights(bool rescale) const { if (!rescale) { - return biases; + return _biases; } + std::vector biases(_biases.size()); if (_scale.size() == 1) { const auto scale = std::sqrt(_scale[0]); std::transform(_biases.begin(), _biases.end(), biases.begin(), [&](const auto n) { diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp index 382c8314..1a663ccc 100644 --- a/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp +++ b/src/libhictk/balancing/include/hictk/balancing/impl/sparse_matrix_impl.hpp @@ -20,16 +20,11 @@ namespace hictk::balancing { -inline SparseMatrix::SparseMatrix(const hictk::BinTable& bins, std::uint32_t chrom_id) +inline SparseMatrix::SparseMatrix(const BinTable& bins, std::uint32_t chrom_id) : _chrom_id(chrom_id == _gw_id ? 0 : chrom_id), - _marg(chrom_id == _gw_id ? bins.size() : bins.subset(chrom_id).size()) { - _chrom_offsets.push_back(0); - for (const Chromosome& chrom : bins.chromosomes()) { - const auto nbins = (chrom.size() + bins.bin_size() - 1) / bins.bin_size(); - _chrom_offsets.push_back(_chrom_offsets.back() + nbins); - } - _bin1_offsets.resize(_chrom_offsets.size(), 0); -} + _chrom_offsets(bins.num_bin_prefix_sum()), + _bin1_offsets(_chrom_offsets.size(), 0), + _marg(chrom_id == _gw_id ? bins.size() : bins.subset(chrom_id).size()) {} inline bool SparseMatrix::empty() const noexcept { return size() == 0; } inline std::size_t SparseMatrix::size() const noexcept { return _counts.size(); } @@ -65,6 +60,12 @@ inline const std::vector& SparseMatrix::chrom_offsets() const noexc } inline void SparseMatrix::push_back(std::uint64_t bin1_id, std::uint64_t bin2_id, double count) { + if (empty()) { + _chrom_id = static_cast( + std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), bin1_id) - + _chrom_offsets.begin()); + } + if (!empty() && bin1_id >= _chrom_offsets[_chrom_id + 1]) { _chrom_id = static_cast( std::lower_bound(_chrom_offsets.begin(), _chrom_offsets.end(), _bin1_ids.back()) - diff --git a/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp b/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp deleted file mode 100644 index 88b08e8a..00000000 --- a/src/libhictk/balancing/include/hictk/balancing/impl/vc_impl.hpp +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include - -#include "hictk/pixel.hpp" -#include "hictk/type_traits.hpp" - -namespace hictk::balancing { - -template -template -inline VC::VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, - std::size_t bin_id_offset) { - if constexpr (std::is_floating_point_v) { - _rowsum = std::vector(num_rows, 0); - _sum = 0.0; - } else { - _rowsum = std::vector(num_rows, 0); - _sum = std::int64_t(0); - } - - // Compute rowsum and matrix sum - std::visit( - [&](auto& sum) { - using T = remove_cvref_t; - auto& rowsum = std::get>(_rowsum); - std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { - if constexpr (std::is_floating_point_v) { - if (std::isnan(p.count)) { - return; - } - } - const auto bin1_id = p.bin1_id - bin_id_offset; - const auto bin2_id = p.bin2_id - bin_id_offset; - const auto count = conditional_static_cast(p.count); - - rowsum[bin1_id] += count; - - if (bin1_id != bin2_id) { - rowsum[bin2_id] += count; - } - }); - }, - _sum); - - std::visit( - [&](auto& sum) { - using T = remove_cvref_t; - const auto& rowsum = std::get>(_rowsum); - std::for_each(first_pixel, last_pixel, [&](const ThinPixel& p) { - if constexpr (std::is_floating_point_v) { - if (std::isnan(p.count)) { - return; - } - } - const auto bin1_id = p.bin1_id - bin_id_offset; - const auto bin2_id = p.bin2_id - bin_id_offset; - - const auto rs1 = conditional_static_cast(rowsum[bin1_id]); - const auto rs2 = conditional_static_cast(rowsum[bin2_id]); - if (rs1 == 0 || rs2 == 0) { - return; - } - - const auto count = conditional_static_cast(bin1_id == bin2_id ? p.count : 2 * p.count); - sum += count; - _norm_sum += conditional_static_cast(count) / (rs1 * rs2); - }); - }, - _sum); -} - -template -inline std::vector VC::get_weights() const { - std::vector weights; - - const auto scaling_factor = std::visit( - [&](const auto& sum) { return std::sqrt(_norm_sum / conditional_static_cast(sum)); }, - _sum); - - std::visit( - [&](const auto& rowsum) { - weights.reserve(rowsum.size()); - - for (const auto rs : rowsum) { - weights.push_back(conditional_static_cast(rs) * scaling_factor); - } - }, - _rowsum); - - return weights; -} - -} // namespace hictk::balancing diff --git a/src/libhictk/balancing/include/hictk/balancing/vc.hpp b/src/libhictk/balancing/include/hictk/balancing/vc.hpp deleted file mode 100644 index 5a6affc1..00000000 --- a/src/libhictk/balancing/include/hictk/balancing/vc.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include - -namespace hictk::balancing { - -template -class VC { - std::variant, std::vector> _rowsum{}; - std::variant _sum{}; - double _norm_sum{}; - - public: - template - VC(PixelIt first_pixel, PixelIt last_pixel, std::size_t num_rows, std::size_t binid_offset = 0); - - [[nodiscard]] std::vector get_weights() const; -}; -} // namespace hictk::balancing - -#include "./impl/vc_impl.hpp"