From e43ecfded3b02c8ec002624345b75b82eac3ca28 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 11:34:42 +0100 Subject: [PATCH 01/29] Refactor BinTable and initial implementation of BinTableVariable - Rename BinTable to BinTableFixed - Implement BinTableVariable to support operating on tables with variable bin size - BinTable is now a wrapper around BinTableFixed and BinTableVariable - Update Reference to stor the prefix sum of the chromosome sizes - Fix minor bug in Reference ctor when constructing reference from iterators of chromosomes --- src/libhictk/bin_table/include/hictk/bin.hpp | 63 ++ .../bin_table/include/hictk/bin_table.hpp | 120 ++-- .../include/hictk/bin_table_fixed.hpp | 146 +++++ .../include/hictk/bin_table_variable.hpp | 132 +++++ .../bin_table/include/hictk/impl/bin_impl.hpp | 78 +++ .../hictk/impl/bin_table_fixed_impl.hpp | 453 +++++++++++++++ .../include/hictk/impl/bin_table_impl.hpp | 537 ++++++------------ .../hictk/impl/bin_table_variable_impl.hpp | 473 +++++++++++++++ src/libhictk/formatting/include/hictk/fmt.hpp | 2 +- .../hictk/fmt/{bin_table.hpp => bin.hpp} | 2 +- .../formatting/include/hictk/fmt/pixel.hpp | 2 +- .../include/hictk/impl/reference_impl.hpp | 34 +- .../reference/include/hictk/reference.hpp | 10 + test/units/bin_table/CMakeLists.txt | 7 +- test/units/bin_table/bin_table_test.cpp | 283 +++++---- test/units/bin_table/bin_test.cpp | 103 ++++ test/units/reference/reference_test.cpp | 7 + 17 files changed, 1892 insertions(+), 560 deletions(-) create mode 100644 src/libhictk/bin_table/include/hictk/bin.hpp create mode 100644 src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp create mode 100644 src/libhictk/bin_table/include/hictk/bin_table_variable.hpp create mode 100644 src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp create mode 100644 src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp create mode 100644 src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp rename src/libhictk/formatting/include/hictk/fmt/{bin_table.hpp => bin.hpp} (98%) create mode 100644 test/units/bin_table/bin_test.cpp diff --git a/src/libhictk/bin_table/include/hictk/bin.hpp b/src/libhictk/bin_table/include/hictk/bin.hpp new file mode 100644 index 00000000..6431d221 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/genomic_interval.hpp" + +namespace hictk { + +class Bin { + public: + static constexpr std::uint64_t null_id{(std::numeric_limits::max)()}; + static constexpr std::uint32_t rel_null_id{(std::numeric_limits::max)()}; + + private: + std::uint64_t _id{null_id}; + std::uint32_t _rel_id{rel_null_id}; + GenomicInterval _interval{}; + + public: + constexpr Bin() = default; + Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; + Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_, + std::uint32_t end_) noexcept; + explicit Bin(GenomicInterval interval) noexcept; + Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; + + [[nodiscard]] explicit operator bool() const noexcept; + + [[nodiscard]] bool operator==(const Bin &other) const noexcept; + [[nodiscard]] bool operator!=(const Bin &other) const noexcept; + + [[nodiscard]] bool operator<(const Bin &other) const noexcept; + [[nodiscard]] bool operator<=(const Bin &other) const noexcept; + + [[nodiscard]] bool operator>(const Bin &other) const noexcept; + [[nodiscard]] bool operator>=(const Bin &other) const noexcept; + + [[nodiscard]] constexpr std::uint64_t id() const noexcept; + [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; + [[nodiscard]] const GenomicInterval &interval() const noexcept; + [[nodiscard]] const Chromosome &chrom() const noexcept; + [[nodiscard]] constexpr std::uint32_t start() const noexcept; + [[nodiscard]] constexpr std::uint32_t end() const noexcept; + + [[nodiscard]] constexpr bool has_null_id() const noexcept; +}; + +} // namespace hictk + +namespace std { +template <> +struct hash { + size_t operator()(const hictk::Bin &b) const; +}; +} // namespace std + +#include "./impl/bin_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table.hpp b/src/libhictk/bin_table/include/hictk/bin_table.hpp index 9c22414d..ce5b5791 100644 --- a/src/libhictk/bin_table/include/hictk/bin_table.hpp +++ b/src/libhictk/bin_table/include/hictk/bin_table.hpp @@ -8,69 +8,27 @@ #include #include #include +#include #include +#include "hictk/bin_table_fixed.hpp" +#include "hictk/bin_table_variable.hpp" #include "hictk/common.hpp" #include "hictk/genomic_interval.hpp" #include "hictk/reference.hpp" - namespace hictk { -class Bin { - public: - static constexpr std::uint64_t null_id{(std::numeric_limits::max)()}; - static constexpr std::uint32_t rel_null_id{(std::numeric_limits::max)()}; - - private: - std::uint64_t _id{null_id}; - std::uint32_t _rel_id{rel_null_id}; - GenomicInterval _interval{}; - - public: - constexpr Bin() = default; - Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; - Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_, - std::uint32_t end_) noexcept; - explicit Bin(GenomicInterval interval) noexcept; - Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; - - [[nodiscard]] explicit operator bool() const noexcept; - - [[nodiscard]] bool operator==(const Bin &other) const noexcept; - [[nodiscard]] bool operator!=(const Bin &other) const noexcept; - - [[nodiscard]] bool operator<(const Bin &other) const noexcept; - [[nodiscard]] bool operator<=(const Bin &other) const noexcept; - - [[nodiscard]] bool operator>(const Bin &other) const noexcept; - [[nodiscard]] bool operator>=(const Bin &other) const noexcept; - - [[nodiscard]] constexpr std::uint64_t id() const noexcept; - [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; - [[nodiscard]] const GenomicInterval &interval() const noexcept; - [[nodiscard]] const Chromosome &chrom() const noexcept; - [[nodiscard]] constexpr std::uint32_t start() const noexcept; - [[nodiscard]] constexpr std::uint32_t end() const noexcept; - - [[nodiscard]] constexpr bool has_null_id() const noexcept; -}; - -struct BinTableConcrete { - std::vector chroms{}; - std::vector bin_starts{}; - std::vector bin_ends{}; -}; - class BinTable { - Reference _chroms{}; - std::vector _num_bins_prefix_sum{}; - std::uint32_t _bin_size{std::numeric_limits::max()}; + std::variant> _table{BinTableFixed{}}; public: + using BinTableVar = decltype(_table); class iterator; friend iterator; BinTable() = default; + template + BinTable(BinTableT table); BinTable(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset = 0); template BinTable(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, @@ -78,12 +36,16 @@ class BinTable { template BinTable(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset = 0); + template + BinTable(Reference chroms, const std::vector &start_pos, const std::vector &end_pos, + I bin_offset = 0); [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t num_chromosomes() const; [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + [[nodiscard]] constexpr bool has_fixed_bin_size() const noexcept; [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; @@ -123,30 +85,26 @@ class BinTable { [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; - [[nodiscard]] BinTableConcrete concretize() const; - [[nodiscard]] bool operator==(const BinTable &other) const; [[nodiscard]] bool operator!=(const BinTable &other) const; - private: - [[nodiscard]] static std::vector compute_num_bins_prefix_sum( - const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset); + template + [[nodiscard]] constexpr const BinTableT &get() const; + template + [[nodiscard]] constexpr BinTableT &get(); + [[nodiscard]] constexpr auto get() const noexcept -> const BinTableVar &; + [[nodiscard]] constexpr auto get() noexcept -> BinTableVar &; - public: class iterator { friend BinTable; - const BinTable *_bin_table{}; - std::size_t _chrom_bin_id{}; - std::uint32_t _rel_bin_id{}; - std::uint32_t _chrom_id{}; - - static constexpr auto null_rel_bin_id = (std::numeric_limits::max)(); - static constexpr auto null_bin_id = (std::numeric_limits::max)(); - static constexpr auto nchrom = (std::numeric_limits::max)(); + std::variant::iterator> _it{ + BinTableFixed::iterator{}}; - explicit iterator(const BinTable &bin_table) noexcept; + template + explicit iterator(It it) noexcept; public: + using IteratorVar = decltype(_it); using difference_type = std::ptrdiff_t; using value_type = Bin; using pointer = value_type *; @@ -155,12 +113,12 @@ class BinTable { constexpr iterator() noexcept = default; - constexpr bool operator==(const iterator &other) const noexcept; - constexpr bool operator!=(const iterator &other) const noexcept; - constexpr bool operator<(const iterator &other) const noexcept; - constexpr bool operator<=(const iterator &other) const noexcept; - constexpr bool operator>(const iterator &other) const noexcept; - constexpr bool operator>=(const iterator &other) const noexcept; + constexpr bool operator==(const iterator &other) const; + constexpr bool operator!=(const iterator &other) const; + constexpr bool operator<(const iterator &other) const; + constexpr bool operator<=(const iterator &other) const; + constexpr bool operator>(const iterator &other) const; + constexpr bool operator>=(const iterator &other) const; auto operator*() const -> value_type; auto operator[](std::size_t i) const -> iterator; @@ -176,25 +134,15 @@ class BinTable { auto operator-(std::size_t i) const -> iterator; auto operator-(const iterator &other) const -> difference_type; - private: - [[nodiscard]] static auto make_end_iterator(const BinTable &table) noexcept -> iterator; - [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; - [[nodiscard]] const Chromosome &chromosome() const; - [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; - [[nodiscard]] constexpr std::size_t bin_id() const noexcept; - [[nodiscard]] std::uint32_t compute_num_chrom_bins() const noexcept; - [[nodiscard]] std::size_t compute_bin_offset() const noexcept; - [[nodiscard]] std::size_t num_chromosomes() const noexcept; + template + [[nodiscard]] constexpr const IteratorT &get() const; + template + [[nodiscard]] constexpr IteratorT &get(); + [[nodiscard]] constexpr auto get() const noexcept -> const IteratorVar &; + [[nodiscard]] constexpr auto get() noexcept -> IteratorVar &; }; }; } // namespace hictk -namespace std { -template <> -struct hash { - size_t operator()(const hictk::Bin &b) const; -}; -} // namespace std - #include "./impl/bin_table_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp b/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp new file mode 100644 index 00000000..be4cb917 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp @@ -0,0 +1,146 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/bin.hpp" +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/reference.hpp" + +namespace hictk { + +class BinTableFixed { + Reference _chroms{}; + std::vector _num_bins_prefix_sum{}; + std::uint32_t _bin_size{std::numeric_limits::max()}; + + public: + class iterator; + friend iterator; + + BinTableFixed() = default; + BinTableFixed(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset = 0); + template + BinTableFixed(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, + std::size_t bin_offset = 0); + template + BinTableFixed(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, + ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset = 0); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const; + [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; + [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + + [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; + + [[nodiscard]] auto begin() const -> iterator; + [[nodiscard]] auto end() const -> iterator; + [[nodiscard]] auto cbegin() const -> iterator; + [[nodiscard]] auto cend() const -> iterator; + + [[nodiscard]] BinTableFixed subset(const Chromosome &chrom) const; + [[nodiscard]] BinTableFixed subset(std::string_view chrom_name) const; + [[nodiscard]] BinTableFixed subset(std::uint32_t chrom_id) const; + + [[nodiscard]] auto find_overlap(const GenomicInterval &query) const + -> std::pair; + [[nodiscard]] auto find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + + // Map bin_id to Bin + [[nodiscard]] Bin at(std::uint64_t bin_id) const; + [[nodiscard]] std::pair at(const GenomicInterval &gi) const; + [[nodiscard]] Bin at(const Chromosome &chrom, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::string_view chrom_name, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::uint32_t chrom_id, std::uint32_t pos) const; + [[nodiscard]] Bin at_hint(std::uint64_t bin_id, const Chromosome &chrom) const; + + // Map genomic coords to bin_id + [[nodiscard]] std::pair map_to_bin_ids( + const GenomicInterval &gi) const; + [[nodiscard]] std::uint64_t map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; + + [[nodiscard]] bool operator==(const BinTableFixed &other) const; + [[nodiscard]] bool operator!=(const BinTableFixed &other) const; + + private: + [[nodiscard]] static std::vector compute_num_bins_prefix_sum( + const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset); + + public: + class iterator { + friend BinTableFixed; + const BinTableFixed *_bin_table{}; + std::size_t _chrom_bin_id{}; + std::uint32_t _rel_bin_id{}; + std::uint32_t _chrom_id{}; + + static constexpr auto null_rel_bin_id = (std::numeric_limits::max)(); + static constexpr auto null_bin_id = (std::numeric_limits::max)(); + + explicit iterator(const BinTableFixed &bin_table) noexcept; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Bin; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::random_access_iterator_tag; + + constexpr iterator() noexcept = default; + + constexpr bool operator==(const iterator &other) const noexcept; + constexpr bool operator!=(const iterator &other) const noexcept; + constexpr bool operator<(const iterator &other) const noexcept; + constexpr bool operator<=(const iterator &other) const noexcept; + constexpr bool operator>(const iterator &other) const noexcept; + constexpr bool operator>=(const iterator &other) const noexcept; + + auto operator*() const -> value_type; + auto operator[](std::size_t i) const -> iterator; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + auto operator+=(std::size_t i) -> iterator &; + auto operator+(std::size_t i) const -> iterator; + + auto operator--() -> iterator &; + auto operator--(int) -> iterator; + auto operator-=(std::size_t i) -> iterator &; + auto operator-(std::size_t i) const -> iterator; + auto operator-(const iterator &other) const -> difference_type; + + private: + [[nodiscard]] static auto make_end_iterator(const BinTableFixed &table) noexcept -> iterator; + [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; + [[nodiscard]] const Chromosome &chromosome() const; + [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; + [[nodiscard]] constexpr std::size_t bin_id() const noexcept; + [[nodiscard]] std::uint32_t compute_num_chrom_bins() const noexcept; + [[nodiscard]] std::size_t compute_bin_offset() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const noexcept; + }; +}; + +} // namespace hictk + +#include "./impl/bin_table_fixed_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp new file mode 100644 index 00000000..3589ba65 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp @@ -0,0 +1,132 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/bin.hpp" +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/reference.hpp" + +namespace hictk { + +template +class BinTableVariable { + Reference _chroms{}; + std::vector _bin_end_prefix_sum{0}; + std::vector _num_bins_prefix_sum{0}; + + public: + class iterator; + friend iterator; + + BinTableVariable() = default; + BinTableVariable(Reference chroms, const std::vector &start_pos, const std::vector &end_pos, + I bin_offset = 0); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const; + [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + + [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; + + [[nodiscard]] auto begin() const -> iterator; + [[nodiscard]] auto end() const -> iterator; + [[nodiscard]] auto cbegin() const -> iterator; + [[nodiscard]] auto cend() const -> iterator; + + [[nodiscard]] BinTableVariable subset(const Chromosome &chrom) const; + [[nodiscard]] BinTableVariable subset(std::string_view chrom_name) const; + [[nodiscard]] BinTableVariable subset(std::uint32_t chrom_id) const; + + [[nodiscard]] auto find_overlap(const GenomicInterval &query) const + -> std::pair; + [[nodiscard]] auto find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + + // Map bin_id to Bin + [[nodiscard]] Bin at(std::uint64_t bin_id) const; + [[nodiscard]] std::pair at(const GenomicInterval &gi) const; + [[nodiscard]] Bin at(const Chromosome &chrom, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::string_view chrom_name, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::uint32_t chrom_id, std::uint32_t pos) const; + [[nodiscard]] Bin at_hint(std::uint64_t bin_id, const Chromosome &chrom) const; + + // Map genomic coords to bin_id + [[nodiscard]] std::pair map_to_bin_ids( + const GenomicInterval &gi) const; + [[nodiscard]] std::uint64_t map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; + + [[nodiscard]] bool operator==(const BinTableVariable &other) const; + [[nodiscard]] bool operator!=(const BinTableVariable &other) const; + + class iterator { + friend BinTableVariable; + Bin _value{}; + const BinTableVariable *_bin_table{}; + std::uint32_t _chrom_id{}; + std::uint64_t _bin_id{}; + + static constexpr auto null_bin_id = (std::numeric_limits::max)(); + static constexpr auto nchrom = (std::numeric_limits::max)(); + + explicit iterator(const BinTableVariable &bin_table) noexcept; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Bin; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::random_access_iterator_tag; + + constexpr iterator() noexcept = default; + + constexpr bool operator==(const iterator &other) const noexcept; + constexpr bool operator!=(const iterator &other) const noexcept; + constexpr bool operator<(const iterator &other) const noexcept; + constexpr bool operator<=(const iterator &other) const noexcept; + constexpr bool operator>(const iterator &other) const noexcept; + constexpr bool operator>=(const iterator &other) const noexcept; + + auto operator*() const noexcept -> value_type; + auto operator[](std::size_t i) const -> iterator; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + auto operator+=(std::size_t i) -> iterator &; + auto operator+(std::size_t i) const -> iterator; + + auto operator--() -> iterator &; + auto operator--(int) -> iterator; + auto operator-=(std::size_t i) -> iterator &; + auto operator-(std::size_t i) const -> iterator; + auto operator-(const iterator &other) const -> difference_type; + + private: + [[nodiscard]] static auto make_end_iterator(const BinTableVariable &table) noexcept -> iterator; + [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; + [[nodiscard]] const Chromosome &chromosome() const; + [[nodiscard]] Bin get_bin() const; + }; +}; + +} // namespace hictk + +#include "./impl/bin_table_variable_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp new file mode 100644 index 00000000..c130933d --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "hictk/chromosome.hpp" +#include "hictk/genomic_interval.hpp" + +namespace hictk { + +inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept + : Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {} + +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, + std::uint32_t start_, std::uint32_t end_) noexcept + : _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {} + +inline Bin::Bin(GenomicInterval interval) noexcept + : Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {} + +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept + : _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {} + +inline Bin::operator bool() const noexcept { return !!chrom(); } + +inline bool Bin::operator==(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() == other.id(); + } + return _interval == other._interval; +} +inline bool Bin::operator!=(const Bin &other) const noexcept { return !(*this == other); } + +inline bool Bin::operator<(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() < other.id(); + } + return _interval < other._interval; +} + +inline bool Bin::operator<=(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() <= other.id(); + } + return _interval <= other._interval; +} + +inline bool Bin::operator>(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() > other.id(); + } + return _interval > other._interval; +} + +inline bool Bin::operator>=(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() >= other.id(); + } + return _interval >= other._interval; +} + +constexpr std::uint64_t Bin::id() const noexcept { return _id; } +constexpr std::uint32_t Bin::rel_id() const noexcept { return _rel_id; } +inline const GenomicInterval &Bin::interval() const noexcept { return _interval; } +inline const Chromosome &Bin::chrom() const noexcept { return interval().chrom(); } +constexpr std::uint32_t Bin::start() const noexcept { return _interval.start(); } +constexpr std::uint32_t Bin::end() const noexcept { return _interval.end(); } + +constexpr bool Bin::has_null_id() const noexcept { return id() == Bin::null_id; } + +}; // namespace hictk + +inline std::size_t std::hash::operator()(const hictk::Bin &b) const { + return hictk::internal::hash_combine(0, b.id(), b.interval()); +} diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp new file mode 100644 index 00000000..7022ab7c --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp @@ -0,0 +1,453 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/suppress_warnings.hpp" + +namespace hictk { // NOLINT + +inline BinTableFixed::BinTableFixed(Reference chroms, std::uint32_t bin_size, + std::size_t bin_offset) + : _chroms(std::move(chroms)), + _num_bins_prefix_sum(compute_num_bins_prefix_sum(_chroms, bin_size, bin_offset)), + _bin_size(bin_size) { + assert(!_chroms.empty()); + assert(bin_size != 0); +} + +template +inline BinTableFixed::BinTableFixed(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, + std::size_t bin_offset) + : BinTableFixed(Reference(first_chrom, last_chrom), bin_size, bin_offset) {} + +template +inline BinTableFixed::BinTableFixed(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, + ChromSizeIt first_chrom_size, std::uint32_t bin_size, + std::size_t bin_offset) + : BinTableFixed(Reference(first_chrom_name, last_chrom_name, first_chrom_size), bin_size, + bin_offset) {} + +inline std::size_t BinTableFixed::size() const noexcept { + if (_num_bins_prefix_sum.empty()) { + return 0; + } + return conditional_static_cast(_num_bins_prefix_sum.back() - + _num_bins_prefix_sum.front()); +} + +inline bool BinTableFixed::empty() const noexcept { return size() == 0; } + +inline std::size_t BinTableFixed::num_chromosomes() const { return _chroms.size(); } + +constexpr std::uint32_t BinTableFixed::bin_size() const noexcept { return _bin_size; } + +constexpr const Reference &BinTableFixed::chromosomes() const noexcept { return _chroms; } + +constexpr const std::vector &BinTableFixed::num_bin_prefix_sum() const noexcept { + return _num_bins_prefix_sum; +} + +inline auto BinTableFixed::begin() const -> iterator { return iterator(*this); } +inline auto BinTableFixed::end() const -> iterator { return iterator::make_end_iterator(*this); } +inline auto BinTableFixed::cbegin() const -> iterator { return begin(); } +inline auto BinTableFixed::cend() const -> iterator { return end(); } + +inline bool BinTableFixed::operator==(const BinTableFixed &other) const { + return _bin_size == other._bin_size && _chroms == other._chroms; +} +inline bool BinTableFixed::operator!=(const BinTableFixed &other) const { + return !(*this == other); +} + +inline BinTableFixed BinTableFixed::subset(const Chromosome &chrom) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + const auto offset = at(chrom, 0).id(); + return {Reference{chrom}, _bin_size, offset}; +} +inline BinTableFixed BinTableFixed::subset(std::string_view chrom_name) const { + return subset(_chroms.at(chrom_name)); +} +inline BinTableFixed BinTableFixed::subset(std::uint32_t chrom_id) const { + return subset(_chroms.at(chrom_id)); +} + +inline auto BinTableFixed::find_overlap(const GenomicInterval &query) const + -> std::pair { + return find_overlap(query.chrom(), query.start(), query.end()); +} + +inline auto BinTableFixed::find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + assert(start < end); + + const auto bin1_id = at(chrom, start).id(); + const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); + + return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); +} +inline auto BinTableFixed::find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + return find_overlap(_chroms.at(chrom_name), start, end); +} +inline auto BinTableFixed::find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + return find_overlap(_chroms.at(chrom_id), start, end); +} + +inline Bin BinTableFixed::at(std::uint64_t bin_id) const { + // I tried benchmarking linear search as well as std::set (including third-party implementations). + // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes + // (e.g. 5-100) and have fairly similar performance. + // Linear search is however better in practice because chromosomes are usually sorted by (approx.) + // size, with unplaced scaffolds etc. ending up last. + auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + [&](const auto n) { return n > bin_id; }); + + if (match == _num_bins_prefix_sum.end()) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + assert(match != _num_bins_prefix_sum.begin()); + + const auto chrom_id = + static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); + return at_hint(bin_id, _chroms[chrom_id]); +} + +inline Bin BinTableFixed::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + const auto offset = _num_bins_prefix_sum[chrom.id()]; + const auto relative_bin_id = bin_id - offset; + const auto start = static_cast(relative_bin_id * bin_size()); + assert(start < chrom.size()); + const auto end = (std::min)(start + bin_size(), chrom.size()); + + return {bin_id, static_cast(relative_bin_id), chrom, start, end}; +} + +inline std::pair BinTableFixed::at(const GenomicInterval &gi) const { + const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); + return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); +} +inline Bin BinTableFixed::at(const Chromosome &chrom, std::uint32_t pos) const { + return at_hint(map_to_bin_id(chrom, pos), chrom); +} +inline Bin BinTableFixed::at(std::string_view chrom_name, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_name, pos)); +} +inline Bin BinTableFixed::at(std::uint32_t chrom_id, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_id, pos)); +} + +inline std::pair BinTableFixed::map_to_bin_ids( + const GenomicInterval &gi) const { + return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), + map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(const Chromosome &chrom, + std::uint32_t pos) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + + if (pos > chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); + } + + const auto bin_offset = _num_bins_prefix_sum[chrom.id()] - _num_bins_prefix_sum.front(); + + return bin_offset + static_cast(pos / bin_size()); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(std::string_view chrom_name, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_name), pos); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_id), pos); +} + +inline std::vector BinTableFixed::compute_num_bins_prefix_sum( + const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset) { + assert(bin_size != 0); + + DISABLE_WARNING_PUSH + DISABLE_WARNING_NULL_DEREF + std::vector prefix_sum(chroms.size() + 1); + prefix_sum.front() = bin_offset; + DISABLE_WARNING_POP + + // I am using transform instead of inclusive_scan because the latter is not always available + std::transform(chroms.begin(), chroms.end(), prefix_sum.begin() + 1, + [&, sum = bin_offset](const Chromosome &chrom) mutable { + if (chrom.is_all()) { + return sum; + } + const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; + return sum += static_cast(num_bins); + }); + + return prefix_sum; +} + +inline BinTableFixed::iterator::iterator(const BinTableFixed &bin_table) noexcept + : _bin_table{&bin_table}, _chrom_bin_id(_bin_table->_num_bins_prefix_sum.front()) { + if (_bin_table->chromosomes().at(_chrom_id).is_all()) { + _chrom_id++; + } +} + +constexpr bool BinTableFixed::iterator::operator==(const iterator &other) const noexcept { + // clang-format off + return _bin_table == other._bin_table && + _chrom_id == other._chrom_id && + _rel_bin_id == other._rel_bin_id; + // clang-format on +} + +constexpr bool BinTableFixed::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +constexpr bool BinTableFixed::iterator::operator<(const iterator &other) const noexcept { + return bin_id() < other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator<=(const iterator &other) const noexcept { + return bin_id() <= other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator>(const iterator &other) const noexcept { + return bin_id() > other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator>=(const iterator &other) const noexcept { + return bin_id() >= other.bin_id(); +} + +inline auto BinTableFixed::iterator::make_end_iterator(const BinTableFixed &table) noexcept + -> iterator { + iterator it(table); + + it._chrom_id = static_cast(table.chromosomes().size()); + it._rel_bin_id = null_rel_bin_id; + return it; +} + +inline auto BinTableFixed::iterator::operator*() const -> value_type { + assert(_bin_table); + + const auto &chrom = chromosome(); + const auto bin_size = this->bin_size(); + + const auto start = std::min(_rel_bin_id * bin_size, chrom.size()); + const auto end = std::min(start + bin_size, chrom.size()); + + return value_type{bin_id(), _rel_bin_id, chrom, start, end}; +} + +inline auto BinTableFixed::iterator::operator++() -> iterator & { + assert(_bin_table); + if (_chrom_id >= _bin_table->chromosomes().size()) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + + if (++_rel_bin_id >= compute_num_chrom_bins()) { + if (_chrom_id + 1 >= num_chromosomes()) { + return *this = make_end_iterator(*_bin_table); + } + ++_chrom_id; + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = 0; + } + + return *this; +} + +inline auto BinTableFixed::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + +inline auto BinTableFixed::iterator::operator+=(std::size_t i) -> iterator & { + assert(_bin_table); + if (i == 0) { + return *this; + } + + if (bin_id() + i >= _bin_table->size()) { + throw std::out_of_range( + "BinTableFixed::iterator: caught attempt to increment iterator past end()"); + } + + const auto ii = static_cast(i); + const auto num_bins = compute_num_chrom_bins(); + if (_rel_bin_id + ii < num_bins) { + _rel_bin_id += ii; + return *this; + } + + _chrom_id++; + _chrom_bin_id = compute_bin_offset(); + i -= (num_bins - _rel_bin_id); + _rel_bin_id = 0; + return *this += i; +} + +inline auto BinTableFixed::iterator::operator+(std::size_t i) const -> iterator { + auto it = *this; + return it += i; +} + +inline auto BinTableFixed::iterator::operator--() -> iterator & { + assert(_bin_table); + if (bin_id() == 0) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + + if (_rel_bin_id == null_rel_bin_id) { + assert(*this == make_end_iterator(*_bin_table)); + _chrom_id = static_cast(num_chromosomes() - 1); + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins() - 1; + return *this; + } + + if (_rel_bin_id-- == 0) { + _chrom_id--; + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins() - 1; + } + + return *this; +} + +inline auto BinTableFixed::iterator::operator--(int) -> iterator { + auto it = *this; + std::ignore = --(*this); + return it; +} + +inline auto BinTableFixed::iterator::operator-=(std::size_t i) -> iterator & { + assert(_bin_table); + + if (i == 0) { + return *this; + } + + if (bin_id() < i) { + throw std::out_of_range( + "BinTableFixed::iterator: caught attempt to decrement iterator past begin()"); + } + + if (_rel_bin_id == null_rel_bin_id) { + assert(*this == make_end_iterator(*_bin_table)); + _chrom_id = static_cast(num_chromosomes() - 1); + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins(); + return *this -= i; + } + + if (i <= _rel_bin_id) { + _rel_bin_id -= static_cast(i); + return *this; + } + + _chrom_id--; + _chrom_bin_id = compute_bin_offset(); + i -= _rel_bin_id; + _rel_bin_id = compute_num_chrom_bins(); + return *this -= i; +} + +inline auto BinTableFixed::iterator::operator-(std::size_t i) const -> iterator { + auto it = *this; + return it -= i; +} + +inline auto BinTableFixed::iterator::operator-(const iterator &other) const -> difference_type { + assert(_bin_table); + assert(other._bin_table); + + const auto offset1 = _chrom_id == _bin_table->num_chromosomes() ? _bin_table->size() : bin_id(); + const auto offset2 = other._chrom_id == other._bin_table->num_chromosomes() + ? other._bin_table->size() + : other.bin_id(); + + return static_cast(offset1) - static_cast(offset2); +} + +inline auto BinTableFixed::iterator::operator[](std::size_t i) const -> iterator { + return (*this + i); +} + +inline const Chromosome &BinTableFixed::iterator::chromosome() const { + return chromosome(_chrom_id); +} + +constexpr std::uint32_t BinTableFixed::iterator::bin_size() const noexcept { + assert(_bin_table); + return _bin_table->bin_size(); +} + +constexpr std::size_t BinTableFixed::iterator::bin_id() const noexcept { + return _chrom_bin_id + _rel_bin_id; +} + +inline const Chromosome &BinTableFixed::iterator::chromosome(std::uint32_t chrom_id) const { + return _bin_table->chromosomes().at(chrom_id); +} + +inline std::uint32_t BinTableFixed::iterator::compute_num_chrom_bins() const noexcept { + assert(_bin_table); + + const auto chrom_size = chromosome().size(); + const auto bin_size = this->bin_size(); + + return (chrom_size + bin_size - 1) / bin_size; +} + +inline std::size_t BinTableFixed::iterator::compute_bin_offset() const noexcept { + return _bin_table->at(_chrom_id, 0).id(); +} + +inline std::size_t BinTableFixed::iterator::num_chromosomes() const noexcept { + assert(_bin_table); + + return _bin_table->num_chromosomes(); +} + +} // namespace hictk diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index c8b04a1c..4b8a5e19 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -1,517 +1,320 @@ -// Copyright (C) 2022 Roberto Rossini +// Copyright (C) 2023 Roberto Rossini // // SPDX-License-Identifier: MIT #pragma once -#include - -#include -#include #include #include -#include +#include #include -#include -#include #include +#include "hictk/bin.hpp" #include "hictk/common.hpp" #include "hictk/genomic_interval.hpp" -#include "hictk/suppress_warnings.hpp" - -namespace hictk { // NOLINT - -inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept - : Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {} - -inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, - std::uint32_t start_, std::uint32_t end_) noexcept - : _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {} - -inline Bin::Bin(GenomicInterval interval) noexcept - : Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {} - -inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept - : _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {} - -inline Bin::operator bool() const noexcept { return !!chrom(); } - -inline bool Bin::operator==(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() == other.id(); - } - return _interval == other._interval; -} -inline bool Bin::operator!=(const Bin &other) const noexcept { return !(*this == other); } - -inline bool Bin::operator<(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() < other.id(); - } - return _interval < other._interval; -} - -inline bool Bin::operator<=(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() <= other.id(); - } - return _interval <= other._interval; -} +#include "hictk/reference.hpp" +#include "hictk/type_traits.hpp" -inline bool Bin::operator>(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() > other.id(); - } - return _interval > other._interval; -} - -inline bool Bin::operator>=(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() >= other.id(); - } - return _interval >= other._interval; -} - -constexpr std::uint64_t Bin::id() const noexcept { return _id; } -constexpr std::uint32_t Bin::rel_id() const noexcept { return _rel_id; } -inline const GenomicInterval &Bin::interval() const noexcept { return _interval; } -inline const Chromosome &Bin::chrom() const noexcept { return interval().chrom(); } -constexpr std::uint32_t Bin::start() const noexcept { return _interval.start(); } -constexpr std::uint32_t Bin::end() const noexcept { return _interval.end(); } - -constexpr bool Bin::has_null_id() const noexcept { return id() == Bin::null_id; } +namespace hictk { +template +inline BinTable::BinTable(BinTableT table) : _table(std::move(table)) {} inline BinTable::BinTable(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset) - : _chroms(std::move(chroms)), - _num_bins_prefix_sum(compute_num_bins_prefix_sum(_chroms, bin_size, bin_offset)), - _bin_size(bin_size) { - assert(bin_size != 0); -} + : BinTable(BinTableFixed(std::move(chroms), bin_size, bin_offset)) {} template inline BinTable::BinTable(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, std::size_t bin_offset) - : BinTable(Reference(first_chrom, last_chrom), bin_size, bin_offset) {} + : BinTable(BinTableFixed(first_chrom, last_chrom, bin_size, bin_offset)) {} template inline BinTable::BinTable(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset) - : BinTable(Reference(first_chrom_name, last_chrom_name, first_chrom_size), bin_size, - bin_offset) {} + : BinTable(BinTableFixed(first_chrom_name, last_chrom_name, first_chrom_size, bin_size, + bin_offset)) {} + +template +inline BinTable::BinTable(Reference chroms, const std::vector &start_pos, + const std::vector &end_pos, I bin_offset) + : BinTable(BinTableVariable(std::move(chroms), start_pos, end_pos, bin_offset)) {} inline std::size_t BinTable::size() const noexcept { - if (_num_bins_prefix_sum.empty()) { - return 0; - } - return conditional_static_cast(_num_bins_prefix_sum.back() - - _num_bins_prefix_sum.front()); + return std::visit([&](const auto &t) { return t.size(); }, _table); } inline bool BinTable::empty() const noexcept { return size() == 0; } -inline std::size_t BinTable::num_chromosomes() const { return _chroms.size(); } - -constexpr std::uint32_t BinTable::bin_size() const noexcept { return _bin_size; } - -constexpr const Reference &BinTable::chromosomes() const noexcept { return _chroms; } - -constexpr const std::vector &BinTable::num_bin_prefix_sum() const noexcept { - return _num_bins_prefix_sum; +inline std::size_t BinTable::num_chromosomes() const { + return std::visit([&](const auto &t) { return t.num_chromosomes(); }, _table); } -inline auto BinTable::begin() const -> iterator { return iterator(*this); } -inline auto BinTable::end() const -> iterator { return iterator::make_end_iterator(*this); } -inline auto BinTable::cbegin() const -> iterator { return begin(); } -inline auto BinTable::cend() const -> iterator { return end(); } - -constexpr std::uint32_t BinTable::iterator::bin_size() const noexcept { - assert(_bin_table); - return _bin_table->bin_size(); +constexpr std::uint32_t BinTable::bin_size() const noexcept { + if (std::holds_alternative(_table)) { + return std::get(_table).bin_size(); + } + return 0; } -constexpr std::size_t BinTable::iterator::bin_id() const noexcept { - return _chrom_bin_id + _rel_bin_id; +constexpr const Reference &BinTable::chromosomes() const noexcept { + return std::visit([&](const auto &t) -> const Reference & { return t.chromosomes(); }, _table); } -inline BinTableConcrete BinTable::concretize() const { - std::vector chroms(size()); - std::vector starts(size()); - std::vector ends(size()); +constexpr bool BinTable::has_fixed_bin_size() const noexcept { + return std::holds_alternative(_table); +} - std::size_t i = 0; - for (const auto &bin : *this) { - chroms[i] = bin.chrom(); - starts[i] = bin.start(); - ends[i++] = bin.end(); - } - assert(i == chroms.size()); +constexpr const std::vector &BinTable::num_bin_prefix_sum() const noexcept { + return std::visit( + [&](const auto &t) -> const std::vector & { return t.num_bin_prefix_sum(); }, + _table); +} - return BinTableConcrete{chroms, starts, ends}; +inline auto BinTable::begin() const -> iterator { + return std::visit([&](const auto &t) { return iterator{t.begin()}; }, _table); } -inline bool BinTable::operator==(const BinTable &other) const { - return _bin_size == other._bin_size && _chroms == other._chroms; +inline auto BinTable::end() const -> iterator { + return std::visit([&](const auto &t) { return iterator{t.end()}; }, _table); } -inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); } + +inline auto BinTable::cbegin() const -> iterator { return begin(); } + +inline auto BinTable::cend() const -> iterator { return end(); } inline BinTable BinTable::subset(const Chromosome &chrom) const { - // GCC8 fails to compile when using if constexpr instead #ifndef - // See: https://github.com/fmtlib/fmt/issues/1455 -#ifndef NDEBUG - if (!_chroms.contains(chrom)) { - throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); - } -#endif - const auto offset = at(chrom, 0).id(); - return {Reference{chrom}, _bin_size, offset}; + return std::visit([&](const auto &t) -> BinTable { return {t.subset(chrom)}; }, _table); } + inline BinTable BinTable::subset(std::string_view chrom_name) const { - return subset(_chroms.at(chrom_name)); + return subset(chromosomes().at(chrom_name)); } + inline BinTable BinTable::subset(std::uint32_t chrom_id) const { - return subset(_chroms.at(chrom_id)); + return subset(chromosomes().at(chrom_id)); } inline auto BinTable::find_overlap(const GenomicInterval &query) const -> std::pair { - return find_overlap(query.chrom(), query.start(), query.end()); + return std::visit( + [&](const auto &t) { + auto its = t.find_overlap(query); + return std::make_pair(iterator{its.first}, iterator{its.second}); + }, + _table); } inline auto BinTable::find_overlap(const Chromosome &chrom, std::uint32_t start, std::uint32_t end) const -> std::pair { - assert(start < end); - - const auto bin1_id = at(chrom, start).id(); - const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); - - return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); + return find_overlap(GenomicInterval{chrom, start, end}); } + inline auto BinTable::find_overlap(std::string_view chrom_name, std::uint32_t start, std::uint32_t end) const -> std::pair { - return find_overlap(_chroms.at(chrom_name), start, end); + return find_overlap(chromosomes().at(chrom_name), start, end); } + inline auto BinTable::find_overlap(std::uint32_t chrom_id, std::uint32_t start, std::uint32_t end) const -> std::pair { - return find_overlap(_chroms.at(chrom_id), start, end); + return find_overlap(chromosomes().at(chrom_id), start, end); } +// Map bin_id to Bin inline Bin BinTable::at(std::uint64_t bin_id) const { - // I tried benchmarking linear search as well as std::set (including third-party implementations). - // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes - // (e.g. 5-100) and have fairly similar performanc. - // Linear search is however better in practice because chromosomes are usually sorted by (approx.) - // size, with unplaced scaffolds etc. ending up last. - auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), - [&](const auto n) { return n > bin_id; }); - - if (match == _num_bins_prefix_sum.end()) { - throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); - } - assert(match != _num_bins_prefix_sum.begin()); - - const auto chrom_id = - static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); - return at_hint(bin_id, _chroms[chrom_id]); -} - -inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { - const auto offset = _num_bins_prefix_sum[chrom.id()]; - const auto relative_bin_id = bin_id - offset; - const auto start = static_cast(relative_bin_id * bin_size()); - assert(start < chrom.size()); - const auto end = (std::min)(start + bin_size(), chrom.size()); - - return {bin_id, static_cast(relative_bin_id), chrom, start, end}; + return std::visit([&](const auto &t) { return t.at(bin_id); }, _table); } inline std::pair BinTable::at(const GenomicInterval &gi) const { - const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); - return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); + return std::visit([&](const auto &t) { return t.at(gi); }, _table); } + inline Bin BinTable::at(const Chromosome &chrom, std::uint32_t pos) const { - return at_hint(map_to_bin_id(chrom, pos), chrom); + return std::visit([&](const auto &t) { return t.at(chrom, pos); }, _table); } + inline Bin BinTable::at(std::string_view chrom_name, std::uint32_t pos) const { - return at(map_to_bin_id(chrom_name, pos)); + return std::visit([&](const auto &t) { return t.at(chrom_name, pos); }, _table); } + inline Bin BinTable::at(std::uint32_t chrom_id, std::uint32_t pos) const { - return at(map_to_bin_id(chrom_id, pos)); + return std::visit([&](const auto &t) { return t.at(chrom_id, pos); }, _table); } +inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + return std::visit([&](const auto &t) { return t.at_hint(bin_id, chrom); }, _table); +} + +// Map genomic coords to bin_id inline std::pair BinTable::map_to_bin_ids( const GenomicInterval &gi) const { - return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), - map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); + return std::visit([&](const auto &t) { return t.map_to_bin_ids(gi); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const { - // GCC8 fails to compile when using if constexpr instead #ifndef - // See: https://github.com/fmtlib/fmt/issues/1455 -#ifndef NDEBUG - if (!_chroms.contains(chrom)) { - throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); - } -#endif - - if (pos > chrom.size()) { - throw std::out_of_range(fmt::format( - FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); - } - - const auto bin_offset = _num_bins_prefix_sum[chrom.id()] - _num_bins_prefix_sum.front(); - - return bin_offset + static_cast(pos / bin_size()); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom, pos); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const { - return map_to_bin_id(_chroms.at(chrom_name), pos); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom_name, pos); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const { - return map_to_bin_id(_chroms.at(chrom_id), pos); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom_id, pos); }, _table); } -inline std::vector BinTable::compute_num_bins_prefix_sum(const Reference &chroms, - std::uint32_t bin_size, - std::size_t bin_offset) { - assert(bin_size != 0); - - DISABLE_WARNING_PUSH - DISABLE_WARNING_NULL_DEREF - std::vector prefix_sum(chroms.size() + 1); - prefix_sum.front() = bin_offset; - DISABLE_WARNING_POP +inline bool BinTable::operator==(const BinTable &other) const { + return std::visit( + [&](const auto &t1) { + const auto &t2 = std::get>(other._table); + return t1 == t2; + }, + _table); +} - // I am using transform instead of inclusive_scan because the latter is not always available - std::transform(chroms.begin(), chroms.end(), prefix_sum.begin() + 1, - [&, sum = bin_offset](const Chromosome &chrom) mutable { - if (chrom.is_all()) { - return sum; - } - const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; - return sum += static_cast(num_bins); - }); +inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); }; - return prefix_sum; +template +constexpr const BinTableT &BinTable::get() const { + return std::get(get()); } -inline BinTable::iterator::iterator(const BinTable &bin_table) noexcept - : _bin_table{&bin_table}, _chrom_bin_id(_bin_table->_num_bins_prefix_sum.front()) { - if (_bin_table->chromosomes().at(_chrom_id).is_all()) { - _chrom_id++; - } +template +constexpr BinTableT &BinTable::get() { + return std::get(get()); } -constexpr bool BinTable::iterator::operator==(const iterator &other) const noexcept { - // clang-format off - return _bin_table == other._bin_table && - _chrom_id == other._chrom_id && - _rel_bin_id == other._rel_bin_id; - // clang-format on -} +constexpr auto BinTable::get() const noexcept -> const BinTableVar & { return _table; } -constexpr bool BinTable::iterator::operator!=(const iterator &other) const noexcept { - return !(*this == other); -} +constexpr auto BinTable::get() noexcept -> BinTableVar & { return _table; } + +template +inline BinTable::iterator::iterator(It it) noexcept : _it(it) {} -constexpr bool BinTable::iterator::operator<(const iterator &other) const noexcept { - return bin_id() < other.bin_id(); +constexpr bool BinTable::iterator::operator==(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 == it2; + }, + _it); } -constexpr bool BinTable::iterator::operator<=(const iterator &other) const noexcept { - return bin_id() <= other.bin_id(); +constexpr bool BinTable::iterator::operator!=(const iterator &other) const { + return !(*this == other); } -constexpr bool BinTable::iterator::operator>(const iterator &other) const noexcept { - return bin_id() > other.bin_id(); +constexpr bool BinTable::iterator::operator<(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 < it2; + }, + _it); } -constexpr bool BinTable::iterator::operator>=(const iterator &other) const noexcept { - return bin_id() >= other.bin_id(); +constexpr bool BinTable::iterator::operator<=(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 <= it2; + }, + _it); } -inline auto BinTable::iterator::make_end_iterator(const BinTable &table) noexcept -> iterator { - iterator it(table); +constexpr bool BinTable::iterator::operator>(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 > it2; + }, + _it); +} - it._chrom_id = nchrom; - it._rel_bin_id = null_rel_bin_id; - return it; +constexpr bool BinTable::iterator::operator>=(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 >= it2; + }, + _it); } inline auto BinTable::iterator::operator*() const -> value_type { - assert(_bin_table); - - const auto &chrom = chromosome(); - const auto bin_size = this->bin_size(); - - const auto start = std::min(_rel_bin_id * bin_size, chrom.size()); - const auto end = std::min(start + bin_size, chrom.size()); - - return value_type{bin_id(), _rel_bin_id, chrom, start, end}; + return std::visit([&](const auto &it) { return *it; }, _it); +} +inline auto BinTable::iterator::operator[](std::size_t i) const -> iterator { + std::visit([&](const auto &it) { std::ignore = it + i; }, _it); + return *this; } inline auto BinTable::iterator::operator++() -> iterator & { - assert(_bin_table); - if (_chrom_id == nchrom) { - return *this; - } - - if (++_rel_bin_id >= compute_num_chrom_bins()) { - if (_chrom_id + 1 >= num_chromosomes()) { - return *this = make_end_iterator(*_bin_table); - } - ++_chrom_id; - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = 0; - } - + std::visit([&](auto &it) { std::ignore = ++it; }, _it); return *this; } inline auto BinTable::iterator::operator++(int) -> iterator { - auto it = *this; - std::ignore = ++(*this); - return it; + auto old_it = *this; + std::visit([&](auto &it) { std::ignore = ++it; }, _it); + return old_it; } inline auto BinTable::iterator::operator+=(std::size_t i) -> iterator & { - assert(_bin_table); - if (_chrom_id == nchrom) { - if (i == 0) { - return *this; - } - throw std::out_of_range("BinTable::iterator: caught attempt to increment iterator past end()"); - } - - const auto ii = static_cast(i); - const auto num_bins = compute_num_chrom_bins(); - if (_rel_bin_id + ii < num_bins) { - _rel_bin_id += ii; - return *this; - } - - _chrom_id++; - _chrom_bin_id = compute_bin_offset(); - i -= (num_bins - _rel_bin_id); - _rel_bin_id = 0; - return *this += i; + std::visit([&](auto &it) { std::ignore = it += i; }, _it); + return *this; } inline auto BinTable::iterator::operator+(std::size_t i) const -> iterator { auto it = *this; - return it += i; + it += i; + return it; } inline auto BinTable::iterator::operator--() -> iterator & { - assert(_bin_table); - if (bin_id() == 0) { - return *this; - } - - if (_rel_bin_id == null_rel_bin_id) { - assert(*this == make_end_iterator(*_bin_table)); - _chrom_id = static_cast(num_chromosomes() - 1); - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins() - 1; - return *this; - } - - if (_rel_bin_id-- == 0) { - _chrom_id--; - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins() - 1; - } - + std::visit([&](auto &it) { std::ignore = --it; }, _it); return *this; } inline auto BinTable::iterator::operator--(int) -> iterator { - auto it = *this; - std::ignore = --(*this); - return it; + auto old_it = *this; + std::visit([&](auto &it) { std::ignore = --it; }, _it); + return old_it; } inline auto BinTable::iterator::operator-=(std::size_t i) -> iterator & { - assert(_bin_table); - - if (_chrom_id == 0 && _rel_bin_id == 0 && i == 0) { - return *this; - } - - if (_chrom_id == 0 && _rel_bin_id < i) { - throw std::out_of_range( - "BinTable::iterator: caught attempt to decrement iterator past begin()"); - } - - if (_rel_bin_id == null_rel_bin_id) { - assert(*this == make_end_iterator(*_bin_table)); - _chrom_id = static_cast(num_chromosomes() - 1); - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins(); - return *this -= i; - } - - if (i <= _rel_bin_id) { - _rel_bin_id -= static_cast(i); - return *this; - } - - _chrom_id--; - _chrom_bin_id = compute_bin_offset(); - i -= _rel_bin_id; - _rel_bin_id = compute_num_chrom_bins(); - return *this -= i; + std::visit([&](auto &it) { std::ignore = it -= i; }, _it); + return *this; } inline auto BinTable::iterator::operator-(std::size_t i) const -> iterator { auto it = *this; - return it -= i; + it -= i; + return it; } inline auto BinTable::iterator::operator-(const iterator &other) const -> difference_type { - assert(_bin_table); - assert(other._bin_table); - - const auto offset1 = _chrom_id == nchrom ? _bin_table->size() : bin_id(); - const auto offset2 = other._chrom_id == nchrom ? other._bin_table->size() : other.bin_id(); - - return static_cast(offset1) - static_cast(offset2); + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 - it2; + }, + _it); } -inline auto BinTable::iterator::operator[](std::size_t i) const -> iterator { return (*this + i); } - -inline const Chromosome &BinTable::iterator::chromosome() const { return chromosome(_chrom_id); } - -inline const Chromosome &BinTable::iterator::chromosome(std::uint32_t chrom_id) const { - return _bin_table->chromosomes().at(chrom_id); +template +constexpr const IteratorT &BinTable::iterator::get() const { + return std::get(get()); } -inline std::uint32_t BinTable::iterator::compute_num_chrom_bins() const noexcept { - assert(_bin_table); - - const auto chrom_size = chromosome().size(); - const auto bin_size = this->bin_size(); - - return (chrom_size + bin_size - 1) / bin_size; -} - -inline std::size_t BinTable::iterator::compute_bin_offset() const noexcept { - return _bin_table->at(_chrom_id, 0).id(); +template +constexpr IteratorT &BinTable::iterator::get() { + return std::get(get()); } -inline std::size_t BinTable::iterator::num_chromosomes() const noexcept { - assert(_bin_table); - - return _bin_table->num_chromosomes(); -} +constexpr auto BinTable::iterator::get() const noexcept -> const IteratorVar & { return _it; } +constexpr auto BinTable::iterator::get() noexcept -> IteratorVar & { return _it; } } // namespace hictk - -inline std::size_t std::hash::operator()(const hictk::Bin &b) const { - return hictk::internal::hash_combine(0, b.id(), b.interval()); -} diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp new file mode 100644 index 00000000..d423a5f3 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp @@ -0,0 +1,473 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/suppress_warnings.hpp" + +namespace hictk { // NOLINT + +template +inline BinTableVariable::BinTableVariable(Reference chroms, const std::vector &start_pos, + const std::vector &end_pos, I bin_offset) + : _chroms(std::move(chroms)) { + assert(start_pos.size() == end_pos.size()); + _bin_end_prefix_sum[0] = bin_offset; + if (start_pos.empty()) { + return; + } + + _bin_end_prefix_sum.reserve(start_pos.size() + 1); + _bin_end_prefix_sum.push_back(end_pos.front()); + + for (std::size_t i = 1; i < start_pos.size(); ++i) { + if (start_pos[i] <= start_pos[i - 1]) { + _num_bins_prefix_sum.push_back(_bin_end_prefix_sum.size() - 1); + } + _bin_end_prefix_sum.push_back(_bin_end_prefix_sum.back() + end_pos[i] - start_pos[i]); + } + _num_bins_prefix_sum.push_back(_bin_end_prefix_sum.size() - 1); +} + +template +inline std::size_t BinTableVariable::size() const noexcept { + return _bin_end_prefix_sum.size() - 1; +} + +template +inline bool BinTableVariable::empty() const noexcept { + return size() == 0; +} + +template +inline std::size_t BinTableVariable::num_chromosomes() const { + return chromosomes().size(); +} + +template +constexpr const Reference &BinTableVariable::chromosomes() const noexcept { + return _chroms; +} + +template +constexpr const std::vector &BinTableVariable::num_bin_prefix_sum() + const noexcept { + return _num_bins_prefix_sum; +} + +template +inline auto BinTableVariable::begin() const -> iterator { + volatile auto chrom = *chromosomes().begin(); + return iterator(*this); +} + +template +inline auto BinTableVariable::end() const -> iterator { + return iterator::make_end_iterator(*this); +} + +template +inline auto BinTableVariable::cbegin() const -> iterator { + return begin(); +} + +template +inline auto BinTableVariable::cend() const -> iterator { + return end(); +} + +template +inline bool BinTableVariable::operator==(const BinTableVariable &other) const { + return _chroms == other._chroms && + std::equal(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + other._num_bins_prefix_sum.begin()); +} + +template +inline bool BinTableVariable::operator!=(const BinTableVariable &other) const { + return !(*this == other); +} + +template +inline BinTableVariable BinTableVariable::subset(const Chromosome &chrom) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + std::vector start_pos{}; + std::vector end_pos{}; + + const auto [first_bin, last_bin] = find_overlap(chrom, 0, chrom.size()); + std::for_each(first_bin, last_bin, [&](const Bin &b) { + start_pos.push_back(b.start()); + end_pos.push_back(b.end()); + }); + + return {Reference{chrom}, start_pos, end_pos, start_pos.front()}; +} + +template +inline BinTableVariable BinTableVariable::subset(std::string_view chrom_name) const { + return subset(_chroms.at(chrom_name)); +} + +template +inline BinTableVariable BinTableVariable::subset(std::uint32_t chrom_id) const { + return subset(_chroms.at(chrom_id)); +} + +template +inline auto BinTableVariable::find_overlap(const GenomicInterval &query) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(query.chrom(), query.start(), query.end()); +} + +template +inline auto BinTableVariable::find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + assert(start < end); + + const auto bin1_id = at(chrom, start).id(); + const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); + + return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); +} + +template +inline auto BinTableVariable::find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(_chroms.at(chrom_name), start, end); +} + +template +inline auto BinTableVariable::find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(_chroms.at(chrom_id), start, end); +} + +template +Bin BinTableVariable::at(std::uint64_t bin_id) const { + // I tried benchmarking linear search as well as std::set (including third-party implementations). + // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes + // (e.g. 5-100) and have fairly similar performance. + // Linear search is however better in practice because chromosomes are usually sorted by (approx.) + // size, with unplaced scaffolds etc. ending up last. + auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + [&](const auto n) { return n > bin_id; }); + + if (match == _num_bins_prefix_sum.end()) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + assert(match != _num_bins_prefix_sum.begin()); + + const auto chrom_id = + static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); + return at_hint(bin_id, _chroms[chrom_id]); +} + +template +inline Bin BinTableVariable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + if (_bin_end_prefix_sum.size() <= bin_id) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + + const auto bin_id_offset = _num_bins_prefix_sum[chrom.id()]; + const auto chrom_size_offset = _chroms.chrom_size_prefix_sum()[chrom.id()]; + const auto relative_bin_id = bin_id - bin_id_offset; + + const auto start = + conditional_static_cast(_bin_end_prefix_sum[bin_id] - chrom_size_offset); + const auto end = + conditional_static_cast(_bin_end_prefix_sum[bin_id + 1] - chrom_size_offset); + + if (_bin_end_prefix_sum[bin_id] < chrom_size_offset || end > chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("bin id {} not found using {} as hint: out of range"), bin_id, chrom.name())); + } + + return {bin_id, static_cast(relative_bin_id), chrom, start, end}; +} + +template +inline std::pair BinTableVariable::at(const GenomicInterval &gi) const { + const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); + return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); +} + +template +inline Bin BinTableVariable::at(const Chromosome &chrom, std::uint32_t pos) const { + return at_hint(map_to_bin_id(chrom, pos), chrom); +} + +template +inline Bin BinTableVariable::at(std::string_view chrom_name, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_name, pos)); +} + +template +inline Bin BinTableVariable::at(std::uint32_t chrom_id, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_id, pos)); +} + +template +inline std::pair BinTableVariable::map_to_bin_ids( + const GenomicInterval &gi) const { + return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), + map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(const Chromosome &chrom, + std::uint32_t pos) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + + if (pos > chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); + } + + const auto pos_offset = _chroms.chrom_size_prefix_sum()[chrom.id()]; + auto match = + std::upper_bound(_bin_end_prefix_sum.begin(), _bin_end_prefix_sum.end(), pos + pos_offset); + + return static_cast(std::distance(_bin_end_prefix_sum.begin(), --match)); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(std::string_view chrom_name, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_name), pos); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(std::uint32_t chrom_id, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_id), pos); +} + +template +inline BinTableVariable::iterator::iterator(const BinTableVariable &bin_table) noexcept + : _bin_table{&bin_table} { + if (_bin_table->chromosomes().at(_chrom_id).is_all()) { + _chrom_id++; + } + _value = get_bin(); +} + +template +constexpr bool BinTableVariable::iterator::operator==(const iterator &other) const noexcept { + // clang-format off + return _bin_table == other._bin_table && + _chrom_id == other._chrom_id && + _bin_id == other._bin_id; + // clang-format on +} + +template +constexpr bool BinTableVariable::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +constexpr bool BinTableVariable::iterator::operator<(const iterator &other) const noexcept { + return _bin_id < other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator<=(const iterator &other) const noexcept { + return _bin_id <= other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator>(const iterator &other) const noexcept { + return _bin_id > other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator>=(const iterator &other) const noexcept { + return _bin_id >= other._bin_id; +} + +template +inline auto BinTableVariable::iterator::make_end_iterator(const BinTableVariable &table) noexcept + -> iterator { + iterator it(table); + + it._chrom_id = nchrom; + it._bin_id = table.size(); + return it; +} + +template +inline auto BinTableVariable::iterator::operator*() const noexcept -> value_type { + return _value; +} + +template +inline auto BinTableVariable::iterator::operator++() -> iterator & { + assert(_bin_table); + if (++_bin_id == _bin_table->size()) { + *this = make_end_iterator(*_bin_table); + return *this; + } + try { + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + + return *this; +} + +template +inline auto BinTableVariable::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + +template +inline auto BinTableVariable::iterator::operator+=(std::size_t i) -> iterator & { + if (i == 0) { + return *this; + } + assert(_bin_table); + try { + if (_bin_id > _bin_table->size() - i) { + throw std::out_of_range(""); + } + + _bin_id += i; + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator+(std::size_t i) const -> iterator { + auto it = *this; + return it += i; +} + +template +inline auto BinTableVariable::iterator::operator--() -> iterator & { + assert(_bin_table); + --_bin_id; + + try { + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator--(int) -> iterator { + auto it = *this; + std::ignore = --(*this); + return it; +} + +template +inline auto BinTableVariable::iterator::operator-=(std::size_t i) -> iterator & { + if (i == 0) { + return *this; + } + assert(_bin_table); + try { + if (_bin_id < _bin_table->size() - i) { + throw std::out_of_range(""); + } + _bin_id -= i; + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator-(std::size_t i) const -> iterator { + auto it = *this; + return it -= i; +} + +template +inline auto BinTableVariable::iterator::operator-(const iterator &other) const + -> difference_type { + assert(_bin_table); + assert(other._bin_table); + + const auto offset1 = _chrom_id == nchrom ? _bin_table->size() : _bin_id; + const auto offset2 = other._chrom_id == nchrom ? other._bin_table->size() : other._bin_id; + + return static_cast(offset1) - static_cast(offset2); +} + +template +inline auto BinTableVariable::iterator::operator[](std::size_t i) const -> iterator { + return (*this + i); +} + +template +inline const Chromosome &BinTableVariable::iterator::chromosome() const { + return chromosome(_chrom_id); +} + +template +inline const Chromosome &BinTableVariable::iterator::chromosome(std::uint32_t chrom_id) const { + return _bin_table->chromosomes().at(chrom_id); +} + +template +inline Bin BinTableVariable::iterator::get_bin() const { + if (_bin_id == _bin_table->size()) { + return {}; + } + try { + const auto &chrom = chromosome(); + return _bin_table->at_hint(_bin_id, chrom); + } catch (const std::out_of_range &) { + return _bin_table->at(_bin_id); + } +} + +} // namespace hictk diff --git a/src/libhictk/formatting/include/hictk/fmt.hpp b/src/libhictk/formatting/include/hictk/fmt.hpp index 070d8284..70cc8986 100644 --- a/src/libhictk/formatting/include/hictk/fmt.hpp +++ b/src/libhictk/formatting/include/hictk/fmt.hpp @@ -4,7 +4,7 @@ #pragma once -#include "hictk/fmt/bin_table.hpp" +#include "hictk/fmt/bin.hpp" #include "hictk/fmt/chromosome.hpp" #include "hictk/fmt/genomic_interval.hpp" #include "hictk/fmt/pixel.hpp" diff --git a/src/libhictk/formatting/include/hictk/fmt/bin_table.hpp b/src/libhictk/formatting/include/hictk/fmt/bin.hpp similarity index 98% rename from src/libhictk/formatting/include/hictk/fmt/bin_table.hpp rename to src/libhictk/formatting/include/hictk/fmt/bin.hpp index 2e060601..adc90ced 100644 --- a/src/libhictk/formatting/include/hictk/fmt/bin_table.hpp +++ b/src/libhictk/formatting/include/hictk/fmt/bin.hpp @@ -6,7 +6,7 @@ #include -#include "hictk/bin_table.hpp" +#include "hictk/bin.hpp" #include "hictk/fmt/common.hpp" #include "hictk/fmt/genomic_interval.hpp" diff --git a/src/libhictk/formatting/include/hictk/fmt/pixel.hpp b/src/libhictk/formatting/include/hictk/fmt/pixel.hpp index 2f8a377f..dce5a95a 100644 --- a/src/libhictk/formatting/include/hictk/fmt/pixel.hpp +++ b/src/libhictk/formatting/include/hictk/fmt/pixel.hpp @@ -6,7 +6,7 @@ #include -#include "hictk/fmt/bin_table.hpp" +#include "hictk/fmt/bin.hpp" #include "hictk/fmt/common.hpp" #include "hictk/pixel.hpp" diff --git a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp index e71ce366..8b5d9aec 100644 --- a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp +++ b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp @@ -27,8 +27,9 @@ namespace hictk { template inline Reference::Reference(ChromosomeIt first_chrom, ChromosomeIt last_chrom) - : _buff(first_chrom, last_chrom), + : _buff(construct_chrom_buffer(first_chrom, last_chrom)), _map(construct_chrom_map(_buff)), + _size_prefix_sum(compute_size_prefix_sum(_buff)), _longest_chrom(find_longest_chromosome(_buff)), _chrom_with_longest_name(find_chromosome_with_longest_name(_buff)) { validate(); @@ -39,6 +40,7 @@ inline Reference::Reference(ChromosomeNameIt first_chrom_name, ChromosomeNameIt ChromosomeSizeIt first_chrom_size) : _buff(construct_chrom_buffer(first_chrom_name, last_chrom_name, first_chrom_size)), _map(construct_chrom_map(_buff)), + _size_prefix_sum(compute_size_prefix_sum(_buff)), _longest_chrom(find_longest_chromosome(_buff)), _chrom_with_longest_name(find_chromosome_with_longest_name(_buff)) { validate(); @@ -156,6 +158,10 @@ inline bool Reference::operator==(const Reference& other) const { inline bool Reference::operator!=(const Reference& other) const { return !(*this == other); } +constexpr const std::vector& Reference::chrom_size_prefix_sum() const noexcept { + return _size_prefix_sum; +} + inline const Chromosome& Reference::longest_chromosome() const { if (empty()) { throw std::runtime_error("longest_chromosome() was called on an empty Reference"); @@ -195,6 +201,20 @@ inline auto Reference::construct_chrom_buffer(ChromosomeNameIt first_chrom_name, return buff; } +template +inline auto Reference::construct_chrom_buffer(ChromosomeIt first_chrom, ChromosomeIt last_chrom) + -> ChromBuff { + std::vector chrom_names{}; + std::vector chrom_sizes{}; + + std::for_each(first_chrom, last_chrom, [&](const Chromosome& chrom) { + chrom_names.emplace_back(std::string{chrom.name()}); + chrom_sizes.emplace_back(chrom.size()); + }); + + return construct_chrom_buffer(chrom_names.begin(), chrom_names.end(), chrom_sizes.begin()); +} + inline auto Reference::construct_chrom_map(const ChromBuff& chroms) -> ChromMap { ChromMap buff(chroms.size()); std::transform(chroms.begin(), chroms.end(), std::inserter(buff, buff.begin()), @@ -233,6 +253,18 @@ inline std::size_t Reference::find_chromosome_with_longest_name(const ChromBuff& return static_cast(std::distance(chroms.begin(), match)); } +inline std::vector Reference::compute_size_prefix_sum( + const ChromBuff& chroms) noexcept { + std::vector buff(chroms.size() + 2, 0); + for (std::size_t i = 1; i < chroms.size() + 1; ++i) { + buff[i] = buff[i - 1] + chroms[i - 1].size(); + } + + buff.back() = buff[chroms.size()] + 1; + + return buff; +} + inline void Reference::validate() const { if (empty()) { return; diff --git a/src/libhictk/reference/include/hictk/reference.hpp b/src/libhictk/reference/include/hictk/reference.hpp index 63261ed1..e9eebc54 100644 --- a/src/libhictk/reference/include/hictk/reference.hpp +++ b/src/libhictk/reference/include/hictk/reference.hpp @@ -25,6 +25,7 @@ class Reference { ChromBuff _buff{}; ChromMap _map{}; + std::vector _size_prefix_sum{}; std::size_t _longest_chrom{Chromosome{}.id()}; std::size_t _chrom_with_longest_name{Chromosome{}.id()}; @@ -87,6 +88,8 @@ class Reference { [[nodiscard]] bool operator==(const Reference& other) const; [[nodiscard]] bool operator!=(const Reference& other) const; + [[nodiscard]] constexpr const std::vector& chrom_size_prefix_sum() const noexcept; + // In case of ties, the first match is returned [[nodiscard]] const Chromosome& longest_chromosome() const; [[nodiscard]] const Chromosome& chromosome_with_longest_name() const; @@ -99,12 +102,19 @@ class Reference { ChromosomeNameIt last_chrom_name, ChromosomeSizeIt first_chrom_size) -> ChromBuff; + template + [[nodiscard]] static auto construct_chrom_buffer(ChromosomeIt first_chrom, + ChromosomeIt last_chrom) -> ChromBuff; + [[nodiscard]] static auto construct_chrom_map(const ChromBuff& chroms) -> ChromMap; [[nodiscard]] static std::size_t find_longest_chromosome(const ChromBuff& chroms) noexcept; [[nodiscard]] static std::size_t find_chromosome_with_longest_name( const ChromBuff& chroms) noexcept; + [[nodiscard]] static std::vector compute_size_prefix_sum( + const ChromBuff& chroms) noexcept; + void validate() const; }; } // namespace hictk diff --git a/test/units/bin_table/CMakeLists.txt b/test/units/bin_table/CMakeLists.txt index afc87653..71223665 100644 --- a/test/units/bin_table/CMakeLists.txt +++ b/test/units/bin_table/CMakeLists.txt @@ -3,13 +3,17 @@ # SPDX-License-Identifier: MIT find_package(Filesystem REQUIRED) +find_package(fmt REQUIRED) include(CTest) include(Catch) add_executable(bin_table_tests) -target_sources(bin_table_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/bin_table_test.cpp") +target_sources( + bin_table_tests + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/bin_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/bin_table_test.cpp") target_link_libraries( bin_table_tests @@ -20,6 +24,7 @@ target_link_system_libraries( bin_table_tests PUBLIC Catch2::Catch2WithMain + fmt::fmt-header-only std::filesystem) file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Testing/") diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index 4d92e35d..12f27576 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -12,102 +12,10 @@ #include #include -#include "hictk/fmt/bin_table.hpp" - namespace hictk::test::bin_table { // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Bin", "[bin][short]") { - const Chromosome chrom1{0, "chr1", 50}; - const Chromosome chrom2{1, "chr2", 10}; - SECTION("Ctors") { - CHECK(Bin{chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, 0, chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, 0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); - } - - SECTION("Accessors") { - const Bin bin1{chrom1, 1, 2}; - const Bin bin2{10, 5, chrom1, 1, 2}; - - CHECK(bin1.id() == Bin::null_id); - CHECK(bin2.id() == 10); - CHECK(bin2.rel_id() == 5); - - CHECK(bin1.interval() == GenomicInterval{chrom1, 1, 2}); - - CHECK(bin2.chrom() == chrom1); - CHECK(bin2.start() == 1); - CHECK(bin2.end() == 2); - } - - SECTION("operators (wo/ id)") { - const Bin bin0{}; - const Bin bin1{chrom1, 1, 2}; - const Bin bin2{chrom1, 2, 3}; - - const Bin bin3{chrom2, 1, 2}; - - CHECK(!bin0); - CHECK(!!bin1); - - CHECK(bin1 != bin2); - CHECK(bin1 != bin3); - - CHECK(bin1 < bin2); - CHECK(bin1 < bin3); - - CHECK(bin1 <= bin2); - CHECK(bin1 <= bin3); - - CHECK(bin2 > bin1); - CHECK(bin3 > bin1); - - CHECK(bin2 >= bin1); - CHECK(bin3 >= bin1); - } - - SECTION("operators (w/ id)") { - const Bin bin1{0, 0, chrom1, 1, 2}; - const Bin bin2{1, 1, chrom1, 2, 3}; - - const Bin bin3{10, 10, chrom2, 1, 2}; - const Bin bin4{10, 10, chrom2, 10, 20}; - - CHECK(bin1 != bin2); - CHECK(bin1 != bin3); - - // This is true because they have the same ID. - // However, comparing bins with same ID and different interval is UB - CHECK(bin3 == bin4); - - CHECK(bin1 < bin2); - CHECK(bin1 < bin3); - - CHECK(bin1 <= bin2); - CHECK(bin1 <= bin3); - - CHECK(bin2 > bin1); - CHECK(bin3 > bin1); - - CHECK(bin2 >= bin1); - CHECK(bin3 >= bin1); - } - - SECTION("fmt") { - const Bin bin1{chrom1, 0, 100}; - const Bin bin2{123, 123, chrom1, 0, 100}; - - CHECK(fmt::format(FMT_STRING("{}"), bin1) == std::to_string(Bin::null_id)); - CHECK(fmt::format(FMT_STRING("{}"), bin2) == "123"); - CHECK(fmt::format(FMT_STRING("{:bed}"), bin1) == "chr1\t0\t100"); - CHECK(fmt::format(FMT_STRING("{:ucsc}"), bin1) == "chr1:0-100"); - CHECK(fmt::format(FMT_STRING("{:raw}"), bin2) == "123"); - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("BinTable", "[bin-table][short]") { +TEST_CASE("BinTable (fixed bins)", "[bin-table][short]") { constexpr std::uint32_t bin_size = 5000; // clang-format off const BinTable table({ @@ -150,7 +58,7 @@ TEST_CASE("BinTable", "[bin-table][short]") { } SECTION("subset") { - const BinTable expected{{Chromosome{1, "chr2", 25017}}, bin_size}; + const BinTableFixed expected{{Chromosome{1, "chr2", 25017}}, bin_size}; CHECK(table.subset(Chromosome{1, "chr2", 25017}) == expected); CHECK(table.subset("chr2") == expected); @@ -181,6 +89,12 @@ TEST_CASE("BinTable", "[bin-table][short]") { CHECK(std::distance(its.first, its.second) == std::distance(table1.begin(), table1.end())); } + SECTION("accessors") { + CHECK(table.has_fixed_bin_size()); + CHECK_NOTHROW(table.get()); + CHECK_THROWS(table.get>()); + } + SECTION("iterators") { const auto& chr1 = table.chromosomes().at("chr1"); const auto& chr2 = table.chromosomes().at("chr2"); @@ -222,6 +136,9 @@ TEST_CASE("BinTable", "[bin-table][short]") { } CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(first_bin++, std::out_of_range); + CHECK_THROWS_AS(last_bin++, std::out_of_range); } SECTION("backward") { @@ -234,6 +151,9 @@ TEST_CASE("BinTable", "[bin-table][short]") { } CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(--first_bin, std::out_of_range); + CHECK_THROWS_AS(--last_bin, std::out_of_range); } SECTION("operator+") { @@ -244,6 +164,8 @@ TEST_CASE("BinTable", "[bin-table][short]") { for (std::size_t i = 0; i < expected.size() - 5; ++i) { CHECK(*(it + i) == expected[i + 5]); // NOLINT } + + CHECK_THROWS_AS(it + 100, std::out_of_range); } SECTION("operator-") { @@ -255,20 +177,177 @@ TEST_CASE("BinTable", "[bin-table][short]") { for (std::size_t i = 1; i < expected.size(); ++i) { CHECK(*(it1 - i) == *(it2 - i)); // NOLINT } + + CHECK_THROWS_AS(it1 - 100, std::out_of_range); + } + + SECTION("accessors") { + CHECK_NOTHROW(table.begin().get()); + CHECK_THROWS(table.begin().get::iterator>()); } } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("BinTable (variable bins)", "[bin-table][short]") { + const Chromosome chrom1{0, "chr1", 32}; + const Chromosome chrom2{1, "chr2", 32}; + + const std::vector start_pos{0, 8, 15, 23, 0, 5, 10, 26}; + const std::vector end_pos{8, 15, 23, 32, 5, 10, 26, 32}; - SECTION("concretize") { - const auto concrete_table = table.concretize(); + const BinTable table({chrom1, chrom2}, start_pos, end_pos); - REQUIRE(concrete_table.chroms.size() == table.size()); + SECTION("stats") { + CHECK(BinTable{}.empty()); + CHECK(table.size() == start_pos.size()); + CHECK(table.num_chromosomes() == 2); + CHECK(table.bin_size() == 0); + } - std::size_t i = 0; - for (const auto& bin : table) { - CHECK(concrete_table.chroms[i] == bin.chrom()); - CHECK(concrete_table.bin_starts[i] == bin.start()); - CHECK(concrete_table.bin_ends[i++] == bin.end()); + SECTION("at") { + const auto& chr1 = table.chromosomes().at("chr1"); + const auto& chr2 = table.chromosomes().at("chr2"); + + CHECK(table.at(0) == Bin{chr1, 0, 8}); + CHECK(table.at(3) == Bin{chr1, 23, 32}); + + CHECK(table.at(4) == Bin{chr2, 0, 5}); + + CHECK_THROWS_AS(table.at(table.size()), std::out_of_range); + } + + SECTION("coord to bin id") { + const auto& chr2 = table.chromosomes().at("chr2"); + + CHECK(table.map_to_bin_id(0, 8) == 1); + CHECK(table.map_to_bin_id("chr1", 25) == 3); + CHECK(table.map_to_bin_id(chr2, 9) == 5); + + CHECK_THROWS_AS(table.map_to_bin_id("a", 0), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id("chr1", 33), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id(chr2, 50), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id(1, 50), std::out_of_range); + } + + SECTION("subset") { + const std::vector start_pos_{0, 5, 10, 26}; + const std::vector end_pos_{5, 10, 26, 32}; + const BinTableVariable expected{{Chromosome{1, "chr2", 32}}, start_pos_, end_pos_}; + + CHECK(table.subset(Chromosome{1, "chr2", 32}) == expected); + CHECK(table.subset("chr2") == expected); + CHECK(table.subset(1) == expected); + CHECK(table.subset("chr1") != expected); + + if constexpr (ndebug_not_defined()) { + CHECK_THROWS_AS(table.subset(Chromosome{4, "chr5", 1}), std::out_of_range); + } + CHECK_THROWS_AS(table.subset("a"), std::out_of_range); + CHECK_THROWS_AS(table.subset(10), std::out_of_range); + } + + SECTION("find overlap") { + const auto& chrom = *table.chromosomes().begin(); + + auto its = table.find_overlap({chrom, 8, 9}); + CHECK(std::distance(its.first, its.second) == 1); + + its = table.find_overlap({chrom, 8, 15 - 1}); + CHECK(std::distance(its.first, its.second) == 1); + + its = table.find_overlap({chrom, 14, 23}); + CHECK(std::distance(its.first, its.second) == 2); + + its = table.find_overlap({chrom, 0, chrom.size()}); + CHECK(std::distance(its.first, its.second) == 4); + } + + SECTION("accessors") { + CHECK_FALSE(table.has_fixed_bin_size()); + CHECK_NOTHROW(table.get>()); + CHECK_THROWS(table.get()); + } + + SECTION("iterators") { + const auto& chr1 = table.chromosomes().at("chr1"); + const auto& chr2 = table.chromosomes().at("chr2"); + + // clang-format off + const std::array expected{ + Bin{0, 0, chr1, 0, 8}, + Bin{1, 1, chr1, 8, 15}, + Bin{2, 2, chr1, 15, 23}, + Bin{3, 3, chr1, 23, 32}, + Bin{4, 0, chr2, 0, 5}, + Bin{5, 1, chr2, 5, 10}, + Bin{6, 2, chr2, 10, 26}, + Bin{7, 3, chr2, 26, 32} + }; + // clang-format on + + REQUIRE(table.size() == expected.size()); + + SECTION("forward") { + auto first_bin = table.begin(); + auto last_bin = table.end(); + + // NOLINTNEXTLINE + for (std::size_t i = 0; i < expected.size(); ++i) { + CHECK(*first_bin++ == expected[i]); + } + + CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(first_bin++, std::out_of_range); + CHECK_THROWS_AS(last_bin++, std::out_of_range); + } + + SECTION("backward") { + auto first_bin = table.begin(); + auto last_bin = table.end(); + + // NOLINTNEXTLINE + for (std::size_t i = expected.size(); i != 0; --i) { + CHECK(*(--last_bin) == expected[i - 1]); + } + + CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(--first_bin, std::out_of_range); + CHECK_THROWS_AS(--last_bin, std::out_of_range); + } + + SECTION("operator+") { + CHECK(table.begin() + 0 == table.begin()); + CHECK(*(table.begin() + 5) == expected[5]); + + auto it = table.begin() + 5; + for (std::size_t i = 0; i < expected.size() - 5; ++i) { + CHECK(*(it + i) == expected[i + 5]); // NOLINT + } + + CHECK_THROWS_AS(it + 100, std::out_of_range); + } + + SECTION("operator-") { + CHECK(table.begin() - 0 == table.begin()); + CHECK(*(table.end() - 5) == *(expected.end() - 5)); + + auto it1 = table.end(); + auto it2 = expected.end(); // NOLINT + for (std::size_t i = 1; i < expected.size(); ++i) { + CHECK(*(it1 - i) == *(it2 - i)); // NOLINT + } + + CHECK_THROWS_AS(it1 - 100, std::out_of_range); + } + + SECTION("accessors") { + CHECK_NOTHROW(table.begin().get::iterator>()); + CHECK_THROWS(table.begin().get()); } } } + } // namespace hictk::test::bin_table diff --git a/test/units/bin_table/bin_test.cpp b/test/units/bin_table/bin_test.cpp new file mode 100644 index 00000000..814e1c2f --- /dev/null +++ b/test/units/bin_table/bin_test.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictk/bin.hpp" +#include +#include +#include + +#include "hictk/fmt/bin.hpp" + +namespace hictk::test::bin_table { + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Bin", "[bin][short]") { + const Chromosome chrom1{0, "chr1", 50}; + const Chromosome chrom2{1, "chr2", 10}; + SECTION("Ctors") { + CHECK(Bin{chrom1, 1, 2}.has_null_id()); + CHECK_FALSE(Bin{0, 0, chrom1, 1, 2}.has_null_id()); + CHECK_FALSE(Bin{0, 0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); + } + + SECTION("Accessors") { + const Bin bin1{chrom1, 1, 2}; + const Bin bin2{10, 5, chrom1, 1, 2}; + + CHECK(bin1.id() == Bin::null_id); + CHECK(bin2.id() == 10); + CHECK(bin2.rel_id() == 5); + + CHECK(bin1.interval() == GenomicInterval{chrom1, 1, 2}); + + CHECK(bin2.chrom() == chrom1); + CHECK(bin2.start() == 1); + CHECK(bin2.end() == 2); + } + + SECTION("operators (wo/ id)") { + const Bin bin0{}; + const Bin bin1{chrom1, 1, 2}; + const Bin bin2{chrom1, 2, 3}; + + const Bin bin3{chrom2, 1, 2}; + + CHECK(!bin0); + CHECK(!!bin1); + + CHECK(bin1 != bin2); + CHECK(bin1 != bin3); + + CHECK(bin1 < bin2); + CHECK(bin1 < bin3); + + CHECK(bin1 <= bin2); + CHECK(bin1 <= bin3); + + CHECK(bin2 > bin1); + CHECK(bin3 > bin1); + + CHECK(bin2 >= bin1); + CHECK(bin3 >= bin1); + } + + SECTION("operators (w/ id)") { + const Bin bin1{0, 0, chrom1, 1, 2}; + const Bin bin2{1, 1, chrom1, 2, 3}; + + const Bin bin3{10, 10, chrom2, 1, 2}; + const Bin bin4{10, 10, chrom2, 10, 20}; + + CHECK(bin1 != bin2); + CHECK(bin1 != bin3); + + // This is true because they have the same ID. + // However, comparing bins with same ID and different interval is UB + CHECK(bin3 == bin4); + + CHECK(bin1 < bin2); + CHECK(bin1 < bin3); + + CHECK(bin1 <= bin2); + CHECK(bin1 <= bin3); + + CHECK(bin2 > bin1); + CHECK(bin3 > bin1); + + CHECK(bin2 >= bin1); + CHECK(bin3 >= bin1); + } + + SECTION("fmt") { + const Bin bin1{chrom1, 0, 100}; + const Bin bin2{123, 123, chrom1, 0, 100}; + + CHECK(fmt::format(FMT_STRING("{}"), bin1) == fmt::to_string(Bin::null_id)); + CHECK(fmt::format(FMT_STRING("{}"), bin2) == "123"); + CHECK(fmt::format(FMT_STRING("{:bed}"), bin1) == "chr1\t0\t100"); + CHECK(fmt::format(FMT_STRING("{:ucsc}"), bin1) == "chr1:0-100"); + CHECK(fmt::format(FMT_STRING("{:raw}"), bin2) == "123"); + } +} +} // namespace hictk::test::bin_table diff --git a/test/units/reference/reference_test.cpp b/test/units/reference/reference_test.cpp index 5dc740fb..e13bbb14 100644 --- a/test/units/reference/reference_test.cpp +++ b/test/units/reference/reference_test.cpp @@ -115,6 +115,13 @@ TEST_CASE("Reference", "[reference][short]") { CHECK(chroms2.chromosome_with_longest_name().name() == "chr123"); CHECK(chroms2.longest_chromosome().name() == "chr1"); + + const auto& prefix_sum = chroms2.chrom_size_prefix_sum(); + REQUIRE(prefix_sum.size() == 4); + CHECK(prefix_sum[0] == 0); + CHECK(prefix_sum[1] == 1000); + CHECK(prefix_sum[2] == 1005); + CHECK(prefix_sum[3] == 1006); } } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) } // namespace hictk::test::reference From 8eb85077b5f1edf1da6e091b994c82d87efb5054 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:30:13 +0100 Subject: [PATCH 02/29] Update cooler::File to support reading files with variable bin size --- .../cooler/include/hictk/cooler/cooler.hpp | 3 +++ .../include/hictk/cooler/impl/file_impl.hpp | 3 +-- .../hictk/cooler/impl/file_read_impl.hpp | 18 +++++++++++++++++- .../hictk/cooler/impl/file_validation_impl.hpp | 8 ++++---- .../include/hictk/cooler/impl/index_impl.hpp | 14 ++++++-------- .../hictk/cooler/impl/validation_impl.hpp | 12 ++++++------ .../cooler/include/hictk/cooler/index.hpp | 6 +++--- test/units/bin_table/bin_test.cpp | 2 ++ test/units/cooler/file_ctors_test.cpp | 14 +++++++++++++- 9 files changed, 55 insertions(+), 25 deletions(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 78d7f3d7..c8b9bd38 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -304,6 +304,9 @@ class File { [[nodiscard]] static auto import_chroms(const Dataset &chrom_names, const Dataset &chrom_sizes, bool missing_ok) -> Reference; + [[nodiscard]] static BinTable init_bin_table(const DatasetMap &dsets, std::string_view bin_type, + std::uint32_t bin_size); + [[nodiscard]] static Index init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, std::shared_ptr bin_table, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index 8e647b79..f8cb102f 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -41,8 +41,7 @@ inline File::File(RootGroup entrypoint, unsigned int mode, std::size_t cache_siz _attrs(read_standard_attributes(_root_group)), _pixel_variant(detect_pixel_type(_root_group)), _bins(std::make_shared( - import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), - bin_size())), + init_bin_table(_datasets, _attrs.bin_type.value(), _attrs.bin_size))), _index(std::make_shared(init_index(_datasets.at("indexes/chrom_offset"), _datasets.at("indexes/bin1_offset"), _bins, _datasets.at("pixels/count").size(), false))) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index e27e2d2a..f8b75d65 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -467,12 +467,14 @@ inline auto File::read_standard_attributes(const RootGroup &root_grp, bool initi // Read mandatory attributes // We read format-version first because some attributes are mandatory only for cooler v3 read_or_throw("format-version", attrs.format_version); - read_or_throw("bin-size", attrs.bin_size); read_or_throw("format", attrs.format); // Read mandatory attributes for Cooler v3 auto missing_ok = attrs.format_version < 3; internal::read_optional(root_grp, "bin-type", attrs.bin_type, missing_ok); + if (attrs.bin_type.value() == "fixed") { + read_or_throw("bin-size", attrs.bin_size); + } internal::read_optional(root_grp, "storage-mode", attrs.storage_mode, missing_ok); // Try to read reserved attributes @@ -526,6 +528,20 @@ inline auto File::import_chroms(const Dataset &chrom_names, const Dataset &chrom } } +inline BinTable File::init_bin_table(const DatasetMap &dsets, std::string_view bin_type, + std::uint32_t bin_size) { + auto chroms = import_chroms(dsets.at("chroms/name"), dsets.at("chroms/length"), false); + if (bin_type == "fixed") { + return {BinTableFixed{std::move(chroms), bin_size}}; + } + assert(bin_type == "variable"); + assert(bin_size == 0); + + return {BinTableVariable{std::move(chroms), + dsets.at("bins/start").read_all>(), + dsets.at("bins/end").read_all>()}}; +} + inline Index File::init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, std::shared_ptr bin_table, std::uint64_t expected_nnz, bool missing_ok) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp index da286e9e..fefce8b7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp @@ -25,7 +25,7 @@ namespace hictk::cooler { inline void File::validate_bins(bool full) const { try { - assert(_attrs.bin_type == "fixed"); + assert(_attrs.bin_type == "fixed" || _attrs.bin_type == "variable"); auto nchroms = dataset("bins/chrom").size(); auto nstarts = dataset("bins/start").size(); auto nends = dataset("bins/end").size(); @@ -63,7 +63,7 @@ inline void File::validate_bins(bool full) const { if (chromosomes().at(*chrom_it).name() != bin.chrom().name() || *start_it != bin.start() || *end_it != bin.end()) { throw std::runtime_error( - fmt::format(FMT_STRING("GenomicInterval #{}: expected {}:{}-{}, found {:ucsc}"), i, + fmt::format(FMT_STRING("Bin #{}: expected {}:{}-{}, found {:ucsc}"), i, chromosomes().at(*chrom_it).name(), *start_it, *end_it, bin)); } ++chrom_it; @@ -75,8 +75,8 @@ inline void File::validate_bins(bool full) const { } catch (const HighFive::Exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("GenomicInterval table at URI {}/{} is invalid or corrupted: {}"), - uri(), group("bins")().getPath(), e.what())); + fmt::format(FMT_STRING("Bin table at URI {}/{} is invalid or corrupted: {}"), uri(), + group("bins")().getPath(), e.what())); } } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp index 1df72d1c..4bd4848b 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp @@ -24,10 +24,8 @@ inline Index::Index(std::shared_ptr bins, const std::vector &chrom_offsets, std::uint64_t nnz, bool allocate) : _bins(std::move(bins)), - _idx(Index::init(_bins->chromosomes(), chrom_offsets, _bins->bin_size(), allocate)), - _nnz(nnz) { - assert(bin_size() != 0); -} + _idx(Index::init(_bins->chromosomes(), *_bins, chrom_offsets, allocate)), + _nnz(nnz) {} inline const Reference &Index::chromosomes() const noexcept { assert(_bins); @@ -201,15 +199,15 @@ inline void Index::compute_chrom_offsets(std::vector &buff) const }); } -inline auto Index::init(const Reference &chroms, const std::vector &chrom_offsets, - std::uint32_t bin_size, bool allocate) -> MapT { +inline auto Index::init(const Reference &chroms, const BinTable &bins, + const std::vector &chrom_offsets, bool allocate) -> MapT { assert(!chroms.empty()); - assert(bin_size != 0); assert(chrom_offsets.empty() || chroms.size() + 1 == chrom_offsets.size()); MapT idx{}; for (std::uint32_t i = 0; i < chroms.size(); ++i) { const auto &chrom = chroms.at(i); - const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; + const auto [first_bin, last_bin] = bins.find_overlap(chrom, 0, chrom.size()); + const auto num_bins = static_cast(std::distance(first_bin, last_bin)); auto node = idx.emplace(chrom, OffsetVect(allocate ? num_bins : 1, Index::offset_not_set_value)); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp index d0a313b8..f0233ec6 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp @@ -121,10 +121,10 @@ inline ValidationStatusCooler is_cooler(const HighFive::Group &root_group) { status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support if (Attribute::exists(root_group, "bin-type")) { const auto bin_type = Attribute::read(root_group, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } // Check file has the mandatory groups @@ -177,12 +177,12 @@ inline ValidationStatusMultiresCooler is_multires_file(const HighFive::File &fp, status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support // NOTE: .mcool files are not required to advertise the bin type they are using at the root level status.missing_or_invalid_bin_type_attr = false; if (Attribute::exists(fp, "bin-type")) { const auto bin_type = Attribute::read(fp, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } // Try to read resolutions from the Cooler's root @@ -261,12 +261,12 @@ inline ValidationStatusScool is_scool_file(const HighFive::File &fp, bool valida status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support // NOTE: .scool files are not required to advertise the bin type they are using at the root level status.missing_or_invalid_bin_type_attr = false; if (Attribute::exists(fp, "bin-type")) { const auto bin_type = Attribute::read(fp, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } constexpr std::array scool_root_groups{"chroms", "bins", "cells"}; diff --git a/src/libhictk/cooler/include/hictk/cooler/index.hpp b/src/libhictk/cooler/include/hictk/cooler/index.hpp index 5ea32299..4aae8ee1 100644 --- a/src/libhictk/cooler/include/hictk/cooler/index.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/index.hpp @@ -109,9 +109,9 @@ class Index { [[nodiscard]] auto at(std::string_view chrom_name) -> mapped_type&; [[nodiscard]] auto at(std::uint32_t chrom_id) -> mapped_type&; - [[nodiscard]] static auto init(const Reference& chroms, - const std::vector& chrom_offsets, - std::uint32_t bin_size, bool allocate) -> MapT; + [[nodiscard]] static auto init(const Reference& chroms, const BinTable& bins, + const std::vector& chrom_offsets, bool allocate) + -> MapT; public: class iterator { diff --git a/test/units/bin_table/bin_test.cpp b/test/units/bin_table/bin_test.cpp index 814e1c2f..7d0889e8 100644 --- a/test/units/bin_table/bin_test.cpp +++ b/test/units/bin_table/bin_test.cpp @@ -3,7 +3,9 @@ // SPDX-License-Identifier: MIT #include "hictk/bin.hpp" + #include + #include #include diff --git a/test/units/cooler/file_ctors_test.cpp b/test/units/cooler/file_ctors_test.cpp index 470aab67..560ad53c 100644 --- a/test/units/cooler/file_ctors_test.cpp +++ b/test/units/cooler/file_ctors_test.cpp @@ -62,7 +62,7 @@ TEST_CASE("Cooler: file ctors", "[cooler][short]") { f.append_pixels(pixels.begin(), pixels.end(), true); } } - SECTION("open .cool") { + SECTION("open .cool (fixed bin size)") { const auto path = datadir / "cooler_test_file.cool"; const File f(path.string()); @@ -74,6 +74,18 @@ TEST_CASE("Cooler: file ctors", "[cooler][short]") { CHECK(f.has_pixel_of_type()); } + SECTION("open .cool (variable bin size)") { + const auto path = datadir / "cooler_variable_bins_test_file.cool"; + const File f(path.string()); + + CHECK(f.path() == path); + CHECK(f.uri() == path); + CHECK(f.bin_size() == 0); + CHECK(f.chromosomes().size() == 2); + CHECK(f.bins().size() == 8); + CHECK(f.has_pixel_of_type()); + } + SECTION("open .scool") { const auto path = datadir / "single_cell_cooler_test_file.scool"; From 77763762570fb72382a80845ae3a43506054697d Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 13:22:07 +0100 Subject: [PATCH 03/29] Initial support for creating coolers with variable bin size (C++ API) --- cmake/FetchTestDataset.cmake | 4 +- .../hictk/impl/bin_table_fixed_impl.hpp | 6 ++- .../cooler/include/hictk/cooler/cooler.hpp | 16 +++++++- .../include/hictk/cooler/impl/file_impl.hpp | 39 ++++++++++++------- .../hictk/cooler/impl/file_write_impl.hpp | 9 ++++- test/units/cooler/file_ctors_test.cpp | 30 +++++++++++--- 6 files changed, 79 insertions(+), 25 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index e01c0c39..6785573a 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/record/8392109/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=43d05077082603cf03dc5ce2bfc0f81b2422c7249ee987e09512d2e3a56afff1 + DOWNLOAD https://zenodo.org/record/10276826/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=7e698deefc4bf090b5bcf3966f22b81770d7b48976c884e1d69c45bf83f17fa7 "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp index 7022ab7c..21978d98 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp @@ -307,10 +307,14 @@ inline auto BinTableFixed::iterator::operator+=(std::size_t i) -> iterator & { return *this; } - if (bin_id() + i >= _bin_table->size()) { + if (bin_id() + i > _bin_table->size()) { throw std::out_of_range( "BinTableFixed::iterator: caught attempt to increment iterator past end()"); } + if (bin_id() + i == _bin_table->size()) { + *this = make_end_iterator(*_bin_table); + return *this; + } const auto ii = static_cast(i); const auto num_bins = compute_num_chrom_bins(); diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index c8b9bd38..258d0d3c 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -98,13 +98,14 @@ class File { mutable std::shared_ptr _index{}; bool _finalize{false}; - // Constructors are private. Cooler files are opened using factory methods + // Private ctors File(RootGroup entrypoint, unsigned int mode, std::size_t cache_size_bytes, double w0, bool validate); template - File(RootGroup entrypoint, Reference chroms, PixelT pixel, Attributes attributes, + File(RootGroup entrypoint, BinTable bins, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0); + // Ctor for SingleCellCooler template File(RootGroup entrypoint, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, @@ -135,6 +136,12 @@ class File { Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + template + [[nodiscard]] static File create(std::string_view uri, BinTable bins, + bool overwrite_if_exists = false, + Attributes attributes = Attributes::init(0), + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + [[nodiscard]] static File open_random_access( RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); @@ -147,6 +154,11 @@ class File { Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + template + [[nodiscard]] static File create(RootGroup entrypoint, BinTable bins, + Attributes attributes = Attributes::init(0), + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + ~File() noexcept; File &operator=(const File &other) = delete; diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index f8cb102f..bb5f1d62 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -52,18 +52,17 @@ inline File::File(RootGroup entrypoint, unsigned int mode, std::size_t cache_siz } template -inline File::File(RootGroup entrypoint, Reference chroms, [[maybe_unused]] PixelT pixel, +inline File::File(RootGroup entrypoint, BinTable bins, [[maybe_unused]] PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0) : _mode(HighFive::File::ReadWrite), _root_group(std::move(entrypoint)), _groups(create_groups(_root_group)), - _datasets(create_datasets(_root_group, chroms, cache_size_bytes, w0)), + _datasets(create_datasets(_root_group, bins.chromosomes(), cache_size_bytes, w0)), _attrs(std::move(attributes)), _pixel_variant(PixelT(0)), - _bins(std::make_shared(std::move(chroms), bin_size())), + _bins(std::make_shared(std::move(bins))), _index(std::make_shared(_bins)), _finalize(true) { - assert(bin_size() != 0); assert(!_bins->empty()); assert(!chromosomes().empty()); assert(!_index->empty()); @@ -116,11 +115,17 @@ inline File File::open_read_once(std::string_view uri, std::size_t cache_size_by return File(open_or_create_root_group(open_file(uri, HighFive::File::ReadOnly, validate), uri), HighFive::File::ReadOnly, cache_size_bytes, 1.0, validate); } - template inline File File::create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists, Attributes attributes, std::size_t cache_size_bytes) { + return File::create(uri, BinTable(chroms, bin_size), overwrite_if_exists, attributes, + cache_size_bytes); +} + +template +inline File File::create(std::string_view uri, BinTable bins, bool overwrite_if_exists, + Attributes attributes, std::size_t cache_size_bytes) { try { const auto [file_path, root_path] = parse_cooler_uri(uri); const auto uri_is_file_path = root_path.empty() || root_path == "/"; @@ -161,8 +166,8 @@ inline File File::create(std::string_view uri, const Reference &chroms, std::uin } return create( - open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, false), uri), chroms, - bin_size, attributes, cache_size_bytes); + open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, false), uri), bins, + attributes, cache_size_bytes); } catch (const std::exception &e) { throw std::runtime_error( fmt::format(FMT_STRING("Cannot create cooler at the following URI: \"{}\". Reason: {}"), @@ -183,18 +188,26 @@ inline File File::open_read_once(RootGroup entrypoint, std::size_t cache_size_by template inline File File::create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes, std::size_t cache_size_bytes) { + return File::create(entrypoint, BinTable(chroms, bin_size), attributes, cache_size_bytes); +} + +template +inline File File::create(RootGroup entrypoint, BinTable bins, Attributes attributes, + std::size_t cache_size_bytes) { static_assert(std::is_arithmetic_v); - if (bin_size == 0) { - throw std::logic_error("bin_size cannot be zero."); + if (std::holds_alternative>(bins.get())) { + attributes.bin_type = "variable"; + attributes.bin_size = 0; + } else { + attributes.bin_type = "fixed"; + attributes.bin_size = bins.bin_size(); } - attributes.bin_size = bin_size; + try { if (utils::is_cooler(entrypoint())) { throw std::runtime_error("URI points to an already existing cooler."); } - // At this point the parent file is guaranteed to exist, so we can always open it in ReadWrite - // mode - return File(entrypoint, chroms, PixelT(0), attributes, cache_size_bytes, true); + return File(entrypoint, bins, PixelT(0), attributes, cache_size_bytes, true); } catch (const std::exception &e) { throw std::runtime_error( diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp index b43765ff..b8645683 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp @@ -262,12 +262,17 @@ inline auto File::create_datasets(RootGroup &root_grp, const Reference &chroms, inline void File::write_standard_attributes(RootGroup &root_grp, const Attributes &attributes, bool skip_sentinel_attr) { - assert(attributes.bin_size != 0); [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT if (attributes.assembly) { Attribute::write(root_grp(), "assembly", *attributes.assembly); } - Attribute::write(root_grp(), "bin-size", attributes.bin_size); + if (attributes.bin_size == 0) { + assert(attributes.bin_type.value() == "variable"); + Attribute::write(root_grp(), "bin-size", "null"); + } else { + assert(attributes.bin_type.value() == "fixed"); + Attribute::write(root_grp(), "bin-size", attributes.bin_size); + } Attribute::write(root_grp(), "bin-type", *attributes.bin_type); // NOLINT Attribute::write(root_grp(), "creation-date", *attributes.creation_date); // NOLINT Attribute::write(root_grp(), "format", std::string{COOL_MAGIC}); diff --git a/test/units/cooler/file_ctors_test.cpp b/test/units/cooler/file_ctors_test.cpp index 560ad53c..e8febbb8 100644 --- a/test/units/cooler/file_ctors_test.cpp +++ b/test/units/cooler/file_ctors_test.cpp @@ -18,11 +18,31 @@ namespace hictk::cooler::test::cooler_file { TEST_CASE("Cooler: init files", "[cooler][short]") { const Reference chroms{Chromosome{0, "chr1", 10000}, Chromosome{1, "chr2", 5000}}; - const auto path = testdir() / "test_init.cool"; - constexpr std::uint32_t bin_size = 1000; - std::ignore = File::create(path.string(), chroms, bin_size, true); - CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE - CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + SECTION("fixed bins") { + const auto path = testdir() / "test_init_fixed_bins.cool"; + constexpr std::uint32_t bin_size = 1000; + std::ignore = File::create(path.string(), chroms, bin_size, true); + CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE + CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + CHECK(File(path.string()).attributes().bin_type.value() == "fixed"); + } + + SECTION("variable bins") { + const auto path = testdir() / "test_init_variable_bins.cool"; + + const Chromosome chrom1{0, "chr1", 32}; + const Chromosome chrom2{1, "chr2", 32}; + + const std::vector start_pos{0, 8, 15, 23, 0, 5, 10, 26}; + const std::vector end_pos{8, 15, 23, 32, 5, 10, 26, 32}; + + const BinTable table({chrom1, chrom2}, start_pos, end_pos); + + std::ignore = File::create(path.string(), table, true); + CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE + CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + CHECK(File(path.string()).attributes().bin_type.value() == "variable"); + } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) From 98e671bdca9884f89f2352fe8742b25bdad1e5f8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 13:44:05 +0100 Subject: [PATCH 04/29] Update test dataset --- cmake/FetchTestDataset.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 6785573a..c47596be 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,7 +4,7 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/record/10276826/files/hictk_test_data.tar.xz?download=1 + DOWNLOAD https://zenodo.org/records/10276826/files/hictk_test_data.tar.xz?download=1 EXPECTED_HASH SHA256=7e698deefc4bf090b5bcf3966f22b81770d7b48976c884e1d69c45bf83f17fa7 "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on From ad8766b17a3861786d1a036c457d0b7645efc315 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:03:29 +0100 Subject: [PATCH 05/29] Update hictk load to support variable bin sizes --- src/hictk/cli/cli_load.cpp | 27 ++++++-- src/hictk/include/hictk/tools/config.hpp | 2 + src/hictk/load/load.cpp | 63 ++++++++++++----- .../include/hictk/cooler/impl/file_impl.hpp | 4 +- .../hictk/cooler/impl/file_read_impl.hpp | 7 +- .../include/hictk/cooler/impl/index_impl.hpp | 13 ++-- .../cooler/impl/singlecell_cooler_impl.hpp | 38 ++++++++--- .../hictk/cooler/impl/utils_merge_impl.hpp | 13 ++-- .../cooler/include/hictk/cooler/index.hpp | 1 + .../hictk/cooler/singlecell_cooler.hpp | 4 +- .../cooler/include/hictk/cooler/utils.hpp | 5 +- .../include/hictk/genomic_interval.hpp | 4 +- .../hictk/impl/genomic_interval_impl.hpp | 68 +++++++++---------- 13 files changed, 155 insertions(+), 94 deletions(-) diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index 0c1eb4b6..faaf1529 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -35,19 +35,25 @@ void Cli::make_load_subcommand() { ->check(CLI::ExistingFile) ->required(); - sc.add_option( - "bin-size", - c.bin_size, - "Bin size (bp).") - ->check(CLI::PositiveNumber) - ->required(); - sc.add_option( "output-uri", c.uri, "Path to output Cooler (URI syntax supported).") ->required(); + sc.add_option( + "-b,--bin-size", + c.bin_size, + "Bin size (bp).\n" + "Required when --bin-table is not used.") + ->check(CLI::PositiveNumber); + + sc.add_option( + "-t,--bin-table", + c.path_to_bin_table, + "Path to a BED3+ file with the bin table.") + ->check(CLI::ExistingFile); + sc.add_option( "-f,--format", c.format, @@ -93,6 +99,7 @@ void Cli::make_load_subcommand() { ->capture_default_str(); // clang-format on + sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table")); _config = std::monostate{}; } @@ -107,6 +114,12 @@ void Cli::validate_load_subcommand() const { FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri)); } + if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) { + assert(c.bin_size == 0); + errors.emplace_back( + "--bin-size is required when --bin-table is not specified."); + } + if (!errors.empty()) { throw std::runtime_error( fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..bb775f48 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -103,7 +103,9 @@ struct LoadConfig { std::string uri{}; std::filesystem::path path_to_chrom_sizes{}; + std::filesystem::path path_to_bin_table{}; std::uint32_t bin_size{}; + std::string format{}; std::string assembly{"unknown"}; bool count_as_float{false}; diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 1770f38f..1575dbf8 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -37,7 +37,35 @@ namespace hictk::tools { -void ingest_pixels_sorted(const LoadConfig& c) { +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + std::uint32_t bin_size) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + return {chroms, bin_size}; +} + +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + const std::filesystem::path& path_to_bin_table) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + + std::ifstream ifs{}; + ifs.exceptions(std::ios::badbit); + ifs.open(path_to_bin_table); + + std::vector start_pos{}; + std::vector end_pos{}; + + std::string line{}; + GenomicInterval record{}; + while (std::getline(ifs, line)) { + record = GenomicInterval::parse_bed(chroms, line); + start_pos.push_back(record.start()); + end_pos.push_back(record.end()); + } + + return {chroms, start_pos, end_pos}; +} + +static void ingest_pixels_sorted(const LoadConfig& c) { assert(c.assume_sorted); auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); const auto format = format_from_string(c.format); @@ -50,9 +78,11 @@ void ingest_pixels_sorted(const LoadConfig& c) { format, c.batch_size, c.validate_pixels); } -void ingest_pixels_unsorted(const LoadConfig& c) { +static void ingest_pixels_unsorted(const LoadConfig& c) { assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -70,8 +100,7 @@ void ingest_pixels_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.front().count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); @@ -92,22 +121,25 @@ void ingest_pixels_unsorted(const LoadConfig& c) { std::filesystem::remove(tmp_cooler_path); } -void ingest_pairs_sorted(const LoadConfig& c) { +static void ingest_pairs_sorted(const LoadConfig& c) { assert(c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); - c.count_as_float ? ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.batch_size, c.validate_pixels) - : ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.batch_size, c.validate_pixels); + c.count_as_float + ? ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), format, + c.batch_size, c.validate_pixels) + : ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), + format, c.batch_size, c.validate_pixels); } static void ingest_pairs_unsorted(const LoadConfig& c) { assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -125,8 +157,7 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.begin()->count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index bb5f1d62..5f51b4f3 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -84,12 +84,10 @@ inline File::File(RootGroup entrypoint, [[maybe_unused]] PixelT pixel, Attribute _groups = open_groups(_root_group); _datasets = open_datasets(_root_group, cache_size_bytes, w0); - _bins = std::make_shared( - import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), bin_size()); + _bins = std::make_shared(init_bin_table(_datasets, *_attrs.bin_type, _attrs.bin_size)); _index = std::make_shared(_bins); assert(std::holds_alternative(_pixel_variant)); - assert(bin_size() != 0); assert(!_bins->empty()); assert(!chromosomes().empty()); assert(!_index->empty()); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index f8b75d65..9c837e41 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -532,14 +532,13 @@ inline BinTable File::init_bin_table(const DatasetMap &dsets, std::string_view b std::uint32_t bin_size) { auto chroms = import_chroms(dsets.at("chroms/name"), dsets.at("chroms/length"), false); if (bin_type == "fixed") { - return {BinTableFixed{std::move(chroms), bin_size}}; + return {std::move(chroms), bin_size}; } assert(bin_type == "variable"); assert(bin_size == 0); - return {BinTableVariable{std::move(chroms), - dsets.at("bins/start").read_all>(), - dsets.at("bins/end").read_all>()}}; + return {std::move(chroms), dsets.at("bins/start").read_all>(), + dsets.at("bins/end").read_all>()}; } inline Index File::init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp index 4bd4848b..66dff5d7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp @@ -118,7 +118,8 @@ inline std::uint64_t Index::get_offset_by_row_idx(std::uint32_t chrom_id, } inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { - const auto expected_size = (chrom.size() + bin_size() - 1) / bin_size(); + const auto [fist_bin, last_bin] = _bins->find_overlap(chrom, 0, chrom.size()); + const auto expected_size = static_cast(std::distance(fist_bin, last_bin)); if (offsets.size() != expected_size) { throw std::runtime_error( fmt::format(FMT_STRING("expected index for {} to have size {}, found {}"), chrom, @@ -127,9 +128,12 @@ inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { _idx.at(chrom) = std::move(offsets); } +inline void Index::set_offset_by_bin(const Bin &bin, std::uint64_t offset) { + set_offset_by_row_idx(bin.chrom().id(), bin.rel_id(), offset); +} + inline void Index::set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset) { - const auto &bin = _bins->at(bin_id); - set_offset_by_pos(bin.chrom(), bin.start(), offset); + set_offset_by_bin(_bins->at(bin_id), offset); } inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, @@ -139,8 +143,7 @@ inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, inline void Index::set_offset_by_pos(std::uint32_t chrom_id, std::uint32_t pos, std::uint64_t offset) { - const auto row_idx = pos / bin_size(); - set_offset_by_row_idx(chrom_id, row_idx, offset); + set_offset_by_bin(_bins->at(chrom_id, pos), offset); } inline void Index::set_offset_by_row_idx(std::uint32_t chrom_id, std::size_t row_idx, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 7d116960..5d6483f4 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -25,6 +25,11 @@ namespace hictk::cooler { inline SingleCellAttributes SingleCellAttributes::init(std::uint32_t bin_size_) { SingleCellAttributes attrs{}; attrs.bin_size = bin_size_; + if (bin_size_ == 0) { + attrs.bin_type = "variable"; + } else { + attrs.bin_type = "fixed"; + } return attrs; } @@ -66,13 +71,19 @@ inline SingleCellFile::SingleCellFile(HighFive::File fp, BinTable bins, SingleCe _bins(std::make_shared(std::move(bins))) {} inline SingleCellFile::SingleCellFile(const std::filesystem::path& path, unsigned int mode) - : SingleCellFile(HighFive::File(path.string(), mode), read_bins(HighFive::File(path.string())), + : SingleCellFile(HighFive::File(path.string(), mode), + init_bin_table(HighFive::File(path.string())), read_standard_attributes(HighFive::File(path.string()), mode != HighFive::File::ReadOnly)) {} inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, bool force_overwrite) { + return SingleCellFile::create(path, {BinTableFixed{chroms, bin_size}}, force_overwrite); +} + +inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, BinTable bins, + bool force_overwrite) { if (!force_overwrite && std::filesystem::exists(path)) { throw std::runtime_error( fmt::format(FMT_STRING("unable to initialize file \"{}\": file already exists"), path)); @@ -82,12 +93,10 @@ inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, std::filesystem::remove(path); } - const BinTable bins(chroms, bin_size); - HighFive::File fp(path.string(), HighFive::File::Create); RootGroup root_grp{fp.getGroup("/")}; - auto attrs = SingleCellAttributes::init(bin_size); + auto attrs = SingleCellAttributes::init(bins.bin_size()); create_groups(root_grp); create_datasets(root_grp, bins); @@ -188,8 +197,7 @@ inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_ex tails.emplace_back(std::move(last)); } }); - utils::merge(heads, tails, chromosomes(), bins().bin_size(), uri, overwrite_if_exists, chunk_size, - update_frequency); + utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency); return File(uri); } @@ -240,14 +248,22 @@ SingleCellFile::read_standard_attributes(const HighFive::File& f, bool initializ } DISABLE_WARNING_POP -inline BinTable SingleCellFile::read_bins(const HighFive::File& f) { +inline BinTable SingleCellFile::init_bin_table(const HighFive::File& f) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT const RootGroup root_grp{f.getGroup("/")}; - const auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), - Dataset(root_grp, f.getDataSet("/chroms/length")), false); - const auto bin_size = Attribute::read(root_grp(), "bin-size"); + auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), + Dataset(root_grp, f.getDataSet("/chroms/length")), false); + const auto bin_type = Attribute::read(root_grp(), "bin-type"); + if (bin_type == "fixed") { + const auto bin_size = Attribute::read(root_grp(), "bin-size"); + + return {std::move(chroms), bin_size}; + } - return {chroms, bin_size}; + assert(bin_type == "variable"); + return {std::move(chroms), + Dataset(root_grp, f.getDataSet("bins/start")).read_all>(), + Dataset(root_grp, f.getDataSet("bins/end")).read_all>()}; } inline phmap::btree_set SingleCellFile::read_cells(const HighFive::File& f) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp index 489e81af..c9627243 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp @@ -96,11 +96,8 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o } } - const cooler::File clr(clrs.front().uri); - const auto chroms = clr.chromosomes(); - const auto bin_size = clr.bin_size(); - merge(heads, tails, chroms, bin_size, dest_uri, overwrite_if_exists, chunk_size, - update_frequency); + merge(heads, tails, cooler::File(clrs.front().uri).bins(), dest_uri, overwrite_if_exists, + chunk_size, update_frequency); } catch (const std::exception& e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), std::distance(first_uri, last_uri), e.what())); @@ -109,15 +106,15 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o template inline void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists, std::size_t chunk_size, std::size_t update_frequency) { + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists, + std::size_t chunk_size, std::size_t update_frequency) { using N = remove_cvref_tcount)>; hictk::transformers::PixelMerger merger{heads, tails}; std::vector> buffer(chunk_size); buffer.clear(); - auto dest = File::create(dest_uri, chromosomes, bin_size, overwrite_if_exists); + auto dest = File::create(dest_uri, bins, overwrite_if_exists); std::size_t pixels_processed{}; auto t0 = std::chrono::steady_clock::now(); diff --git a/src/libhictk/cooler/include/hictk/cooler/index.hpp b/src/libhictk/cooler/include/hictk/cooler/index.hpp index 4aae8ee1..c35b14e9 100644 --- a/src/libhictk/cooler/include/hictk/cooler/index.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/index.hpp @@ -84,6 +84,7 @@ class Index { std::size_t row_idx) const; void set(const Chromosome& chrom, OffsetVect offsets); + void set_offset_by_bin(const Bin& bin, std::uint64_t offset); void set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset); void set_offset_by_pos(const Chromosome& chrom, std::uint32_t pos, std::uint64_t offset); diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 86228e5b..14a840ce 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -62,6 +62,8 @@ class SingleCellFile { unsigned int mode = HighFive::File::ReadOnly); [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, + bool force_overwrite); + [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, BinTable bins, bool force_overwrite = false); [[nodiscard]] constexpr const phmap::btree_set& cells() const noexcept; @@ -83,7 +85,7 @@ class SingleCellFile { private: [[nodiscard]] static SingleCellAttributes read_standard_attributes(const HighFive::File& f, bool initialize_missing); - [[nodiscard]] static BinTable read_bins(const HighFive::File& f); + [[nodiscard]] static BinTable init_bin_table(const HighFive::File& f); [[nodiscard]] static phmap::btree_set read_cells(const HighFive::File& f); static void create_groups(RootGroup& root_grp); diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index fe2d6075..af26b6e7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -23,9 +23,8 @@ void merge(Str first_file, Str last_file, std::string_view dest_uri, template void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, - std::size_t update_frequency = 10'000'000); + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists = false, + std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000); [[nodiscard]] bool equal(std::string_view uri1, std::string_view uri2, bool ignore_attributes = true); diff --git a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp index 05f49829..c5805937 100644 --- a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp +++ b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp @@ -25,8 +25,8 @@ class GenomicInterval { GenomicInterval(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; [[nodiscard]] static GenomicInterval parse(const Reference &chroms, std::string query, Type type = Type::UCSC); - [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string query); - [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view query, + [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string buffer); + [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view buffer, char sep = '\t'); [[nodiscard]] explicit operator bool() const noexcept; diff --git a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp index d5126183..f27762c5 100644 --- a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp +++ b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp @@ -104,69 +104,69 @@ inline GenomicInterval GenomicInterval::parse(const Reference &chroms, std::stri return GenomicInterval::parse_bed(chroms, query); } -inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string query) { - if (query.empty()) { +inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string buffer) { + if (buffer.empty()) { throw std::runtime_error("query is empty"); } - if (const auto match = chroms.find(query); match != chroms.end()) { + if (const auto match = chroms.find(buffer); match != chroms.end()) { return GenomicInterval{*match}; } - const auto p1 = query.find_last_of(':'); - auto p2 = query.find_last_of('-'); + const auto p1 = buffer.find_last_of(':'); + auto p2 = buffer.find_last_of('-'); if (p1 == std::string::npos && p2 == std::string::npos) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), query)); + fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), buffer)); } if (p1 == std::string::npos || p2 == std::string::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), buffer)); } - if (query.find(',', p1) != std::string::npos) { - query.erase(std::remove(query.begin() + std::ptrdiff_t(p1), query.end(), ','), query.end()); - p2 = query.find_last_of('-'); + if (buffer.find(',', p1) != std::string::npos) { + buffer.erase(std::remove(buffer.begin() + std::ptrdiff_t(p1), buffer.end(), ','), buffer.end()); + p2 = buffer.find_last_of('-'); } - query[p1] = '\t'; - query[p2] = '\t'; + buffer[p1] = '\t'; + buffer[p2] = '\t'; - return GenomicInterval::parse_bed(chroms, query); + return GenomicInterval::parse_bed(chroms, buffer); } -inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view query, +inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view buffer, char sep) { - if (query.empty()) { - throw std::runtime_error("query is empty"); + if (buffer.empty()) { + throw std::runtime_error("interval is empty"); } - const auto p1 = query.find(sep); - const auto p2 = query.find(sep, p1 + 1); + const auto p1 = buffer.find(sep); + const auto p2 = buffer.find(sep, p1 + 1); if (p1 == std::string_view::npos || p2 == std::string_view::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("interval \"{}\" is malformed"), buffer)); } - const auto chrom_name = query.substr(0, p1); - const auto start_pos_str = query.substr(p1 + 1, p2 - (p1 + 1)); - const auto end_pos_str = query.substr(p2 + 1); + const auto chrom_name = buffer.substr(0, p1); + const auto start_pos_str = buffer.substr(p1 + 1, p2 - (p1 + 1)); + const auto end_pos_str = buffer.substr(p2 + 1); const auto match = chroms.find(chrom_name); if (match == chroms.end()) { - throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{}\" in query \"{}\""), chrom_name, query)); + throw std::runtime_error(fmt::format(FMT_STRING("invalid chromosome \"{}\" in interval \"{}\""), + chrom_name, buffer)); } if (start_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing start position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing start position"), buffer)); } if (end_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing end position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing end position"), buffer)); } GenomicInterval gi{*match}; @@ -175,29 +175,29 @@ inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std:: internal::parse_numeric_or_throw(start_pos_str, gi._start); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid start position \"{}\" in query \"{}\": {}"), start_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid start position \"{}\" in interval \"{}\": {}"), + start_pos_str, buffer, e.what())); } try { internal::parse_numeric_or_throw(end_pos_str, gi._end); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{}\" in query \"{}\": {}"), end_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid end position \"{}\" in interval \"{}\": {}"), end_pos_str, + buffer, e.what())); } if (gi._end > gi.chrom().size()) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{0}\" in query \"{1}\": end position is " + fmt::format(FMT_STRING("invalid end position \"{0}\" in interval \"{1}\": end position is " "greater than the chromosome size ({0} > {2})"), - gi._end, query, gi.chrom().size())); + gi._end, buffer, gi.chrom().size())); } if (gi._start >= gi._end) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid query \"{}\": query end position should be " + fmt::format(FMT_STRING("invalid interval \"{}\": query end position should be " "greater than the start position ({} >= {})"), - query, gi._start, gi._end)); + buffer, gi._start, gi._end)); } return gi; From 3c4da66c101843bb357e03ad33e1a80cd25eb15f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:42:31 +0100 Subject: [PATCH 06/29] Bugfix --- src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp index 66dff5d7..bbbecc25 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp @@ -129,7 +129,8 @@ inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { } inline void Index::set_offset_by_bin(const Bin &bin, std::uint64_t offset) { - set_offset_by_row_idx(bin.chrom().id(), bin.rel_id(), offset); + set_offset_by_row_idx(bin.chrom().id(), conditional_static_cast(bin.rel_id()), + offset); } inline void Index::set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset) { From 3c537d6af5d952b1b3caa0690d949010c7a7d6bb Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:04:26 +0100 Subject: [PATCH 07/29] Bugfix --- src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 14a840ce..86fcf6f4 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -12,6 +12,7 @@ #include #include +#include "hictk/bin.hpp" #include "hictk/chromosome.hpp" #include "hictk/cooler/cooler.hpp" From 8e20097931c840d37df527a5897c27611497ee61 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Wed, 6 Dec 2023 18:03:15 +0100 Subject: [PATCH 08/29] Bugfix --- src/libhictk/cooler/include/hictk/cooler/index.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libhictk/cooler/include/hictk/cooler/index.hpp b/src/libhictk/cooler/include/hictk/cooler/index.hpp index c35b14e9..2a0e7597 100644 --- a/src/libhictk/cooler/include/hictk/cooler/index.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/index.hpp @@ -14,6 +14,7 @@ #include #include +#include "hictk/bin.hpp" #include "hictk/chromosome.hpp" namespace hictk { From 01778b85d7213980ac7fef56fd0c8dda13ddfd58 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:35:02 +0100 Subject: [PATCH 09/29] Bugfix --- .../bin_table/include/hictk/impl/bin_table_fixed_impl.hpp | 4 ++-- .../bin_table/include/hictk/impl/bin_table_variable_impl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp index 21978d98..86d9bd66 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp @@ -178,9 +178,9 @@ inline std::uint64_t BinTableFixed::map_to_bin_id(const Chromosome &chrom, } #endif - if (pos > chrom.size()) { + if (pos >= chrom.size()) { throw std::out_of_range(fmt::format( - FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); + FMT_STRING("position is greater than chromosome size: {} >= {}"), pos, chrom.size())); } const auto bin_offset = _num_bins_prefix_sum[chrom.id()] - _num_bins_prefix_sum.front(); diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp index d423a5f3..9f1e14c7 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp @@ -248,9 +248,9 @@ inline std::uint64_t BinTableVariable::map_to_bin_id(const Chromosome &chrom, } #endif - if (pos > chrom.size()) { + if (pos >= chrom.size()) { throw std::out_of_range(fmt::format( - FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); + FMT_STRING("position is greater than chromosome size: {} >= {}"), pos, chrom.size())); } const auto pos_offset = _chroms.chrom_size_prefix_sum()[chrom.id()]; From 3c6895912ad51168811dcb21d6538c751591c44d Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:37:40 +0100 Subject: [PATCH 10/29] Add more tests for BinTable --- test/units/bin_table/bin_table_test.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index 12f27576..d346600c 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -38,10 +38,15 @@ TEST_CASE("BinTable (fixed bins)", "[bin-table][short]") { CHECK(table.at(0) == Bin{chr1, 0, bin_size}); CHECK(table.at(10) == Bin{chr1, 50000, 50001}); - CHECK(table.at(11) == Bin{chr2, 0, bin_size}); + CHECK(table.at(chr1, bin_size - 1).id() == 0); + CHECK(table.at(chr1, 50000).id() == 10); + CHECK(table.at(chr2, 1).id() == 11); + CHECK_THROWS_AS(table.at(table.size()), std::out_of_range); + CHECK_THROWS_AS(table.at(chr1, 50001), std::out_of_range); + CHECK_THROWS_AS(table.at(chr2, 26000), std::out_of_range); } SECTION("coord to bin id") { @@ -211,10 +216,18 @@ TEST_CASE("BinTable (variable bins)", "[bin-table][short]") { CHECK(table.at(0) == Bin{chr1, 0, 8}); CHECK(table.at(3) == Bin{chr1, 23, 32}); - CHECK(table.at(4) == Bin{chr2, 0, 5}); + CHECK(table.at(chr1, 0).id() == 0); + CHECK(table.at(chr1, 7).id() == 0); + CHECK(table.at(chr1, 8).id() == 1); + + CHECK(table.at(chr1, 23).id() == 3); + CHECK(table.at(chr2, 4).id() == 4); + CHECK_THROWS_AS(table.at(table.size()), std::out_of_range); + CHECK_THROWS_AS(table.at(chr1, 32), std::out_of_range); + CHECK_THROWS_AS(table.at(chr2, 32), std::out_of_range); } SECTION("coord to bin id") { From 0f1f8f9ec4bd80586019d5f625dc5f5a56fbbb1a Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:37:59 +0100 Subject: [PATCH 11/29] Update pixel parsers --- .../pixel/include/hictk/impl/pixel_impl.hpp | 56 ++++++++++++------- src/libhictk/pixel/include/hictk/pixel.hpp | 17 ++++-- test/units/pixel/pixel_test.cpp | 33 ++++++++--- 3 files changed, 72 insertions(+), 34 deletions(-) diff --git a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp index e2785692..cafd7831 100644 --- a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp +++ b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp @@ -119,12 +119,15 @@ inline bool ThinPixel::operator>=(const ThinPixel &other) const noexcept { } template -inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line) -> ThinPixel { +inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line, std::int64_t offset) + -> ThinPixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); if (bin1_id > bins.size()) { @@ -140,13 +143,16 @@ inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line) fmt::format(FMT_STRING("line \"{}\" is not in coo format: {}"), line, e.what())); } } + template -inline auto ThinPixel::from_coo(std::string_view line) -> ThinPixel { +inline auto ThinPixel::from_coo(std::string_view line, std::int64_t offset) -> ThinPixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); return {bin1_id, bin2_id, count}; @@ -285,12 +291,15 @@ inline ThinPixel Pixel::to_thin() const noexcept { } template -inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view line) -> Pixel { +inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); return {bins.at(bin1_id), bins.at(bin2_id), count}; @@ -301,15 +310,18 @@ inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view lin } template -inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view line) -> Pixel { +inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<7>(line); const auto &chrom1 = toks[0]; - const auto start1 = internal::parse_numeric_or_throw(toks[1]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[1]) + offset); const auto &chrom2 = toks[3]; - const auto start2 = internal::parse_numeric_or_throw(toks[4]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[4]) + offset); const auto count = internal::parse_numeric_or_throw(toks[6]); return {bins.at(chrom1, start1), bins.at(chrom2, start2), count}; @@ -320,16 +332,18 @@ inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view lin } template -inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_view line) - -> Pixel { +inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<6>(line); const auto &chrom1 = toks[1]; - const auto start1 = internal::parse_numeric_or_throw(toks[2]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[2]) + offset); const auto &chrom2 = toks[4]; - const auto start2 = internal::parse_numeric_or_throw(toks[5]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[5]) + offset); return {bins.at(chrom1, start1), bins.at(chrom2, start2), 1}; } catch (const std::exception &e) { @@ -339,16 +353,18 @@ inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_vi } template -inline auto Pixel::from_4dn_pairs(const hictk::BinTable &bins, std::string_view line) - -> Pixel { +inline auto Pixel::from_4dn_pairs(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<6>(line); const auto &chrom1 = toks[1]; - const auto start1 = internal::parse_numeric_or_throw(toks[2]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[2]) + offset); const auto &chrom2 = toks[3]; - const auto start2 = internal::parse_numeric_or_throw(toks[4]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[4]) + offset); return {bins.at(chrom1, start1), bins.at(chrom2, start2), 1}; } catch (const std::exception &e) { diff --git a/src/libhictk/pixel/include/hictk/pixel.hpp b/src/libhictk/pixel/include/hictk/pixel.hpp index 6837c12e..d38309d7 100644 --- a/src/libhictk/pixel/include/hictk/pixel.hpp +++ b/src/libhictk/pixel/include/hictk/pixel.hpp @@ -31,8 +31,9 @@ struct ThinPixel { [[nodiscard]] bool operator>(const ThinPixel &other) const noexcept; [[nodiscard]] bool operator>=(const ThinPixel &other) const noexcept; - static auto from_coo(std::string_view line) -> ThinPixel; - static auto from_coo(const BinTable &bins, std::string_view line) -> ThinPixel; + static auto from_coo(std::string_view line, std::int64_t offset = 0) -> ThinPixel; + static auto from_coo(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> ThinPixel; }; struct PixelCoordinates { @@ -83,10 +84,14 @@ struct Pixel { [[nodiscard]] bool operator>=(const Pixel &other) const noexcept; [[nodiscard]] ThinPixel to_thin() const noexcept; - static auto from_coo(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_bg2(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_validpair(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_4dn_pairs(const BinTable &bins, std::string_view line) -> Pixel; + static auto from_coo(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_bg2(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_validpair(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_4dn_pairs(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; }; } // namespace hictk diff --git a/test/units/pixel/pixel_test.cpp b/test/units/pixel/pixel_test.cpp index 539cbe28..520dc7db 100644 --- a/test/units/pixel/pixel_test.cpp +++ b/test/units/pixel/pixel_test.cpp @@ -138,12 +138,14 @@ TEST_CASE("ThinPixel: parsers", "[pixel][short]") { } SECTION("invalid") { - CHECK_THROWS_WITH(Pixel::from_coo(bins, ""), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, ""), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); - CHECK_THROWS_WITH(Pixel::from_coo(bins, "chr1\t0\t10\tchr1\t10\t20\t1"), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "chr1\t0\t10\tchr1\t10\t20\t1"), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); - CHECK_THROWS_WITH(Pixel::from_coo(bins, "0\t1\tchr"), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "0\t1\tchr"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr\"")); + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } } @@ -159,10 +161,11 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { const BinTable bins{chroms, bin_size}; using N = std::uint32_t; - const Pixel expected{{bins.at(0), bins.at(1)}, 1}; + const Pixel expected1{{bins.at(0), bins.at(1)}, 1}; + const Pixel expected2{{bins.at(24895642), bins.at(24895642)}, 1}; SECTION("coo") { - SECTION("valid") { CHECK(Pixel::from_coo(bins, "0\t1\t1") == expected); } + SECTION("valid") { CHECK(Pixel::from_coo(bins, "0\t1\t1") == expected1); } SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_coo(bins, ""), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); @@ -170,13 +173,17 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); CHECK_THROWS_WITH(Pixel::from_coo(bins, "0\t1\tchr"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr\"")); + CHECK_THROWS_WITH(Pixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } SECTION("bg2") { SECTION("valid") { - CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1") == expected); - CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1\ta\tb\tc") == expected); + CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1") == expected1); + CHECK(Pixel::from_bg2(bins, "chr1\t248956421\t248956422\tchr1\t248956421\t248956422\t1") == + expected2); + CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1\ta\tb\tc") == expected1); } SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_bg2(bins, "chr999\t0\t10\tchr1\t0\t10\t1"), @@ -187,11 +194,18 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected 7 or more fields, found 1")); CHECK_THROWS_WITH(Pixel::from_bg2(bins, "chr1\ta\t10\tchr1\t10\t20\t1"), Catch::Matchers::ContainsSubstring("Unable to convert field \"a\"")); + CHECK_THROWS_WITH(Pixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } SECTION("validpair") { - SECTION("valid") { CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t5\t+\tchr1\t15\t-")); } + SECTION("valid") { + CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t5\t+\tchr1\t15\t-")); + CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t248956422\t+\tchr1\t248956422\t-") == + expected2); + } + SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_validpair(bins, ""), Catch::Matchers::ContainsSubstring("expected 6 or more fields, found 0")); @@ -201,6 +215,9 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected 6 or more fields, found 5")); CHECK_THROWS_WITH(Pixel::from_validpair(bins, "read_id\tchr1\tchr1\t+\tchr1\t15\t-"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr1\"")); + CHECK_THROWS_WITH( + Pixel::from_validpair(bins, "read_id\tchr1\t248956423\t+\tchr1\t248956423\t-"), + Catch::Matchers::ContainsSubstring("position is greater than chromosome size")); } } } From 44e93f3445dc46d9b47d7edcd420b2b7a3221c88 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:38:45 +0100 Subject: [PATCH 12/29] Update hictk load - Simplify PairsAggregator - Drop optimization for ingesting pre-sorted pairs (it never worked as intended) - Add --one-based and --zero-based CLI options - Improve logging --- src/hictk/cli/cli_load.cpp | 28 ++++- src/hictk/include/hictk/tools/config.hpp | 2 + src/hictk/load/common.hpp | 10 +- src/hictk/load/load.cpp | 66 ++++++---- src/hictk/load/load_pairs.hpp | 151 ++++++----------------- src/hictk/load/load_pixels.hpp | 16 +-- 6 files changed, 120 insertions(+), 153 deletions(-) diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index faaf1529..b9ee81d2 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -73,6 +73,11 @@ void Cli::make_load_subcommand() { "Assembly name.") ->capture_default_str(); + sc.add_flag( + "--one-based,!--zero-based", + c.one_based, + "Interpret genomic coordinates or bins as one/zero based."); + sc.add_flag( "--count-as-float", c.count_as_float, @@ -95,7 +100,8 @@ void Cli::make_load_subcommand() { sc.add_option( "--batch-size", c.batch_size, - "Number of pixels to buffer in memory. Only used when processing unsorted interactions or pairs") + "Number of pixels to buffer in memory.\n" + "Only used when processing unsorted interactions or pairs.") ->capture_default_str(); // clang-format on @@ -106,8 +112,10 @@ void Cli::make_load_subcommand() { void Cli::validate_load_subcommand() const { assert(_cli.get_subcommand("load")->parsed()); + std::vector warnings; std::vector errors; const auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("load"); if (!c.force && std::filesystem::exists(c.uri)) { errors.emplace_back(fmt::format( @@ -116,8 +124,22 @@ void Cli::validate_load_subcommand() const { if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) { assert(c.bin_size == 0); + errors.emplace_back("--bin-size is required when --bin-table is not specified."); + } + + if ((c.format == "bg2" || c.format == "coo") && !sc.get_option("--bin-table")->empty()) { errors.emplace_back( - "--bin-size is required when --bin-table is not specified."); + "specifying bins through the --bin-table is not supported when ingesting pre-binned " + "interactions."); + } + + if (c.format == "4dn" && c.format == "validpairs" && c.assume_sorted) { + warnings.emplace_back( + "--assume-sorted has no effect when ingesting interactions in 4dn or validpairs format."); + } + + for (const auto& w : warnings) { + SPDLOG_WARN(FMT_STRING("{}"), w); } if (!errors.empty()) { @@ -131,6 +153,8 @@ void Cli::validate_load_subcommand() const { void Cli::transform_args_load_subcommand() { auto& c = std::get(_config); + c.offset = c.one_based ? -1 : 0; + // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); c.verbosity = static_cast(spdlog::level::critical) - c.verbosity; diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index bb775f48..985227fd 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -108,6 +108,8 @@ struct LoadConfig { std::string format{}; std::string assembly{"unknown"}; + bool one_based{true}; + std::int64_t offset{0}; bool count_as_float{false}; bool assume_sorted{false}; bool force{false}; diff --git a/src/hictk/load/common.hpp b/src/hictk/load/common.hpp index e1bbb6dc..0763901a 100644 --- a/src/hictk/load/common.hpp +++ b/src/hictk/load/common.hpp @@ -31,20 +31,20 @@ enum class Format { COO, BG2, VP, _4DN }; template [[nodiscard]] inline ThinPixel parse_pixel(const BinTable& bins, std::string_view line, - Format format) { + Format format, std::int64_t offset) { ThinPixel pixel{}; switch (format) { case Format::COO: - pixel = ThinPixel::from_coo(bins, line); + pixel = ThinPixel::from_coo(bins, line, offset); break; case Format::BG2: - pixel = Pixel::from_bg2(bins, line).to_thin(); + pixel = Pixel::from_bg2(bins, line, offset).to_thin(); break; case Format::VP: - pixel = Pixel::from_validpair(bins, line).to_thin(); + pixel = Pixel::from_validpair(bins, line, offset).to_thin(); break; case Format::_4DN: - pixel = Pixel::from_4dn_pairs(bins, line).to_thin(); + pixel = Pixel::from_4dn_pairs(bins, line, offset).to_thin(); break; } if (pixel.bin1_id > pixel.bin2_id) { diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 1575dbf8..be68bdf0 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -56,12 +56,28 @@ namespace hictk::tools { std::string line{}; GenomicInterval record{}; + bool fixed_bin_size = true; + std::uint32_t bin_size = 0; while (std::getline(ifs, line)) { record = GenomicInterval::parse_bed(chroms, line); + if (bin_size == 0) { + bin_size = record.size(); + } + + // TODO uncomment + // fixed_bin_size &= record.size() == bin_size || record.chrom().size() == record.end(); + fixed_bin_size = false; + start_pos.push_back(record.start()); end_pos.push_back(record.end()); } + if (fixed_bin_size) { + SPDLOG_INFO(FMT_STRING("detected bin table with uniform bin size.")); + return {chroms, bin_size}; + } + + SPDLOG_INFO(FMT_STRING("detected bin table with variable bin size.")); return {chroms, start_pos, end_pos}; } @@ -72,10 +88,10 @@ static void ingest_pixels_sorted(const LoadConfig& c) { c.count_as_float ? ingest_pixels_sorted( cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.batch_size, c.validate_pixels) + c.offset, c.batch_size, c.validate_pixels) : ingest_pixels_sorted( cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.batch_size, c.validate_pixels); + format, c.offset, c.batch_size, c.validate_pixels); } static void ingest_pixels_unsorted(const LoadConfig& c) { @@ -105,7 +121,7 @@ static void ingest_pixels_unsorted(const LoadConfig& c) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); const auto nnz = ingest_pixels_unsorted(tmp_clr.create_cell(fmt::to_string(i)), - buffer, format, c.validate_pixels); + buffer, format, c.offset, c.validate_pixels); SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, tmp_cooler_path); if (nnz == 0) { @@ -121,22 +137,7 @@ static void ingest_pixels_unsorted(const LoadConfig& c) { std::filesystem::remove(tmp_cooler_path); } -static void ingest_pairs_sorted(const LoadConfig& c) { - assert(c.assume_sorted); - auto bins = c.path_to_bin_table.empty() - ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) - : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); - const auto format = format_from_string(c.format); - - c.count_as_float - ? ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), format, - c.batch_size, c.validate_pixels) - : ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), - format, c.batch_size, c.validate_pixels); -} - -static void ingest_pairs_unsorted(const LoadConfig& c) { - assert(!c.assume_sorted); +static void ingest_pairs(const LoadConfig& c) { auto bins = c.path_to_bin_table.empty() ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); @@ -162,8 +163,8 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); - const auto nnz = ingest_pairs_unsorted(tmp_clr.create_cell(fmt::to_string(i)), - buffer, c.batch_size, format, c.validate_pixels); + const auto nnz = ingest_pairs(tmp_clr.create_cell(fmt::to_string(i)), buffer, + c.batch_size, format, c.offset, c.validate_pixels); SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, tmp_cooler_path); @@ -185,6 +186,7 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { int load_subcmd(const LoadConfig& c) { const auto format = format_from_string(c.format); const auto pixel_has_count = format == Format::COO || format == Format::BG2; + const auto t0 = std::chrono::system_clock::now(); if (c.assume_sorted && pixel_has_count) { SPDLOG_INFO(FMT_STRING("begin loading presorted pixels...")); @@ -192,13 +194,23 @@ int load_subcmd(const LoadConfig& c) { } else if (!c.assume_sorted && pixel_has_count) { SPDLOG_INFO(FMT_STRING("begin loading un-sorted pixels...")); ingest_pixels_unsorted(c); - } else if (c.assume_sorted && !pixel_has_count) { - SPDLOG_INFO(FMT_STRING("begin loading presorted pairs...")); - ingest_pairs_sorted(c); - } else { - SPDLOG_INFO(FMT_STRING("begin loading un-sorted pairs...")); - ingest_pairs_unsorted(c); + } else if (!pixel_has_count) { + SPDLOG_INFO(FMT_STRING("begin loading pairs...")); + ingest_pairs(c); } + + const cooler::File clr(c.uri); + + const auto t1 = std::chrono::system_clock::now(); + const auto delta = std::chrono::duration_cast(t1 - t0).count(); + + std::visit( + [&](const auto& sum) { + SPDLOG_INFO(FMT_STRING("ingested {} interactions ({} nnz) in {}s!"), sum, clr.nnz(), + static_cast(delta) / 1.0e9); + }, + clr.attributes().sum.value()); + return 0; } diff --git a/src/hictk/load/load_pairs.hpp b/src/hictk/load/load_pairs.hpp index 69436123..f4a07e23 100644 --- a/src/hictk/load/load_pairs.hpp +++ b/src/hictk/load/load_pairs.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -33,84 +34,66 @@ struct PixelCmp { template class PairsAggregator { phmap::btree_set, PixelCmp> _buffer{}; - typename phmap::btree_set, PixelCmp>::const_iterator _it{}; const BinTable& _bins{}; Format _format{}; - ThinPixel _last{}; + ThinPixel _last_pixel{}; + std::string _line_buffer{}; + std::int64_t _offset{}; public: PairsAggregator() = delete; - inline PairsAggregator(const BinTable& bins, Format format) - : _it(_buffer.end()), _bins(bins), _format(format) {} + inline PairsAggregator(const BinTable& bins, Format format, std::int64_t offset) + : _bins(bins), _format(format), _offset(offset) { + while (std::getline(std::cin, _line_buffer)) { + if (!line_is_header(_line_buffer)) { + break; + } + } + if (!_line_buffer.empty()) { + _last_pixel = parse_pixel(_bins, _line_buffer, _format, _offset); + } + } - inline void read_next_chunk(std::vector>& buffer) { + inline bool read_next_chunk(std::vector>& buffer) { buffer.clear(); read_next_batch(buffer.capacity()); std::copy(_buffer.begin(), _buffer.end(), std::back_inserter(buffer)); _buffer.clear(); - } - [[nodiscard]] inline ThinPixel next() { - if (_it == _buffer.end()) { - read_next_batch(); - } - if (_buffer.empty()) { - auto last = _last; - _last = {}; - return last; - } - return *_it++; + return buffer.size() == buffer.capacity(); } private: - inline void read_next_batch() { - auto last_bin1 = _last.bin1_id; - std::string line{}; - - _buffer.clear(); - if (!!_last) { - _buffer.emplace(_last); - } + inline ThinPixel aggregate_pixel() { + assert(!!_last_pixel); - while (std::getline(std::cin, line)) { - if (line_is_header(line)) { + while (std::getline(std::cin, _line_buffer)) { + if (_line_buffer.empty()) { continue; } - auto pixel = parse_pixel(_bins, line, _format); - auto node = _buffer.find(pixel); - if (node != _buffer.end()) { - node->count += pixel.count; - } else { - _buffer.emplace(pixel); - } - - if (last_bin1 != ThinPixel::null_id && pixel.bin1_id != last_bin1) { - break; + auto p = parse_pixel(_bins, _line_buffer, _format, _offset); + if (p.bin1_id != _last_pixel.bin1_id || p.bin2_id != _last_pixel.bin2_id) { + std::swap(p, _last_pixel); + return p; } - last_bin1 = pixel.bin1_id; + _last_pixel.count += p.count; } - _it = _buffer.begin(); - if (_buffer.empty()) { - _last = {}; - } else { - _last = *_buffer.rbegin(); - _buffer.erase(_last); - } + ThinPixel p{}; + std::swap(p, _last_pixel); + return p; } inline void read_next_batch(std::size_t batch_size) { - std::string line{}; - _buffer.clear(); - while (std::getline(std::cin, line)) { - if (line_is_header(line)) { - continue; - } - auto pixel = parse_pixel(_bins, line, _format); + while (!!_last_pixel) { + const auto pixel = aggregate_pixel(); + if (!pixel) { + break; + } auto node = _buffer.find(pixel); if (node != _buffer.end()) { node->count += pixel.count; @@ -122,72 +105,16 @@ class PairsAggregator { break; } } - - _it = _buffer.begin(); } }; template -inline void read_sort_and_aggregate_batch(PairsAggregator& aggregator, - std::vector>& buffer, - std::size_t batch_size) { - buffer.reserve(batch_size); - buffer.clear(); - - while (true) { - if (buffer.size() == batch_size) { - return; - } - - auto pixel = aggregator.next(); - if (!pixel) { - break; - } - buffer.emplace_back(std::move(pixel)); - } -} - -template -inline void ingest_pairs_sorted(cooler::File&& clr, Format format, std::size_t batch_size, - bool validate_pixels) { - PairsAggregator aggregator{clr.bins(), format}; - std::vector> buffer{}; - buffer.reserve(batch_size); - - std::size_t i0 = 0; - std::size_t i1 = i0; - try { - for (; true; ++i1) { - if (buffer.size() == batch_size) { - SPDLOG_INFO(FMT_STRING("processing chunk {}-{}..."), i0, i1); - clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); - buffer.clear(); - i0 = i1; - } - - auto pixel = aggregator.next(); - if (!pixel) { - break; - } - buffer.emplace_back(std::move(pixel)); - } - - if (!buffer.empty()) { - clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); - } - } catch (const std::exception& e) { - throw std::runtime_error(fmt::format( - FMT_STRING("an error occurred while processing chunk {}-{}: {}"), i0, i1, e.what())); - } -} - -template -[[nodiscard]] inline std::uint64_t ingest_pairs_unsorted(cooler::File&& clr, - std::vector>& buffer, - std::size_t batch_size, Format format, - bool validate_pixels) { +[[nodiscard]] inline std::uint64_t ingest_pairs(cooler::File&& clr, + std::vector>& buffer, + std::size_t batch_size, Format format, + std::int64_t offset, bool validate_pixels) { buffer.reserve(batch_size); - PairsAggregator{clr.bins(), format}.read_next_chunk(buffer); + PairsAggregator{clr.bins(), format, offset}.read_next_chunk(buffer); if (buffer.empty()) { assert(std::cin.eof()); diff --git a/src/hictk/load/load_pixels.hpp b/src/hictk/load/load_pixels.hpp index e6ec599e..211f6102 100644 --- a/src/hictk/load/load_pixels.hpp +++ b/src/hictk/load/load_pixels.hpp @@ -20,7 +20,8 @@ namespace hictk::tools { template -inline void read_batch(const BinTable& bins, std::vector>& buffer, Format format) { +inline void read_batch(const BinTable& bins, std::vector>& buffer, Format format, + std::int64_t offset) { buffer.clear(); std::string line{}; try { @@ -28,7 +29,7 @@ inline void read_batch(const BinTable& bins, std::vector>& buffer, if (line_is_header(line)) { continue; } - buffer.emplace_back(parse_pixel(bins, line, format)); + buffer.emplace_back(parse_pixel(bins, line, format, offset)); if (buffer.size() == buffer.capacity()) { return; } @@ -43,15 +44,15 @@ inline void read_batch(const BinTable& bins, std::vector>& buffer, } template -inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::size_t batch_size, - bool validate_pixels) { +inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::int64_t offset, + std::size_t batch_size, bool validate_pixels) { std::vector> buffer(batch_size); std::size_t i = 0; try { for (; !std::cin.eof(); ++i) { SPDLOG_INFO(FMT_STRING("processing chunk #{}..."), i + 1); - read_batch(clr.bins(), buffer, format); + read_batch(clr.bins(), buffer, format, offset); clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); buffer.clear(); } @@ -69,10 +70,11 @@ inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::size_t template [[nodiscard]] inline std::size_t ingest_pixels_unsorted(cooler::File&& clr, std::vector>& buffer, - Format format, bool validate_pixels) { + Format format, std::int64_t offset, + bool validate_pixels) { assert(buffer.capacity() != 0); - read_batch(clr.bins(), buffer, format); + read_batch(clr.bins(), buffer, format, offset); if (buffer.empty()) { assert(std::cin.eof()); From b83a7e415fb58d474b1a09578ff7e2da9aff042b Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:54:05 +0100 Subject: [PATCH 13/29] Detect when a fixed bin table is passed through --bin-table --- src/hictk/load/load.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index be68bdf0..87248424 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -64,9 +64,7 @@ namespace hictk::tools { bin_size = record.size(); } - // TODO uncomment - // fixed_bin_size &= record.size() == bin_size || record.chrom().size() == record.end(); - fixed_bin_size = false; + fixed_bin_size &= record.size() == bin_size || record.chrom().size() == record.end(); start_pos.push_back(record.start()); end_pos.push_back(record.end()); From 2199789afbb754906d2b9a9f17f5bde5b151264f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:07:44 +0100 Subject: [PATCH 14/29] Update integration tests for hictk load --- test/scripts/hictk_load_4dn.sh | 72 ++++++++++++++++------------------ test/scripts/hictk_load_bg2.sh | 6 +-- test/scripts/hictk_load_coo.sh | 6 +-- 3 files changed, 39 insertions(+), 45 deletions(-) diff --git a/test/scripts/hictk_load_4dn.sh b/test/scripts/hictk_load_4dn.sh index a81cf6af..d3b337a5 100755 --- a/test/scripts/hictk_load_4dn.sh +++ b/test/scripts/hictk_load_4dn.sh @@ -49,35 +49,23 @@ function compare_coolers { fi } -function shuffle { - if command -v shuf &> /dev/null; then - shuf - else - sort -R - fi -} - export function readlink_py shuffle status=0 -if [ $# -ne 2 ]; then - 2>&1 echo "Usage: $0 path_to_hictk [un]sorted" +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" status=1 fi hictk_bin="$1" -if [[ "$2" == 'sorted' ]]; then - sorted=true -else - sorted=false -fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" pairs="$data_dir/4DNFIKNWM36K.subset.pairs.xz" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler_fixed_bins="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" +ref_cooler_variable_bins="$data_dir/4DNFIKNWM36K.subset.variable-bins.cool" export PATH="$PATH:$script_dir" @@ -99,37 +87,43 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_files_exist "$pairs" "$ref_cooler_fixed_bins" "$ref_cooler_variable_bins"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT -cooler dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" - -if [[ "$sorted" == true ]]; then - xzcat "$pairs" | - "$hictk_bin" load \ - -f 4dn \ - --assume-sorted \ - --batch-size 1000000 \ - "$outdir/chrom.sizes" \ - 10000 \ - "$outdir/out.cool" -else - xzcat "$pairs" | - shuffle | - "$hictk_bin" load \ - -f 4dn \ - --assume-unsorted \ - --batch-size 1000000 \ - "$outdir/chrom.sizes" \ - 10000 \ - "$outdir/out.cool" +cooler dump -t chroms "$ref_cooler_fixed_bins" > "$outdir/chrom.sizes" + +# Test cooler with fixed bin size +xzcat "$pairs" | + "$hictk_bin" load \ + -f 4dn \ + --assume-sorted \ + --batch-size 1000000 \ + --bin-size 10000 \ + "$outdir/chrom.sizes" \ + "$outdir/out.cool" + +if ! compare_coolers "$outdir/out.cool" "$ref_cooler_fixed_bins"; then + status=1 fi -if ! compare_coolers "$outdir/out.cool" "$ref_cooler"; then +# Test cooler with variable bin size +cooler dump -t bins "$ref_cooler_variable_bins" > "$outdir/bins.bed" + +xzcat "$pairs" | + "$hictk_bin" load \ + -f 4dn \ + --assume-sorted \ + --batch-size 1000000 \ + --bin-table "$outdir/bins.bed" \ + --force \ + "$outdir/chrom.sizes" \ + "$outdir/out.cool" + +if ! compare_coolers "$outdir/out.cool" "$ref_cooler_variable_bins"; then status=1 fi diff --git a/test/scripts/hictk_load_bg2.sh b/test/scripts/hictk_load_bg2.sh index 3c1df19b..5bb7ea75 100755 --- a/test/scripts/hictk_load_bg2.sh +++ b/test/scripts/hictk_load_bg2.sh @@ -76,7 +76,7 @@ fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" export PATH="$PATH:$script_dir" @@ -108,8 +108,8 @@ if [[ "$sorted" == true ]]; then -f bg2 \ --assume-sorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" else cooler dump -t pixels --join "$ref_cooler" | @@ -118,8 +118,8 @@ else -f bg2 \ --assume-unsorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" fi diff --git a/test/scripts/hictk_load_coo.sh b/test/scripts/hictk_load_coo.sh index 6ab669d6..0d46f9c7 100755 --- a/test/scripts/hictk_load_coo.sh +++ b/test/scripts/hictk_load_coo.sh @@ -76,7 +76,7 @@ fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" export PATH="$PATH:$script_dir" @@ -108,8 +108,8 @@ if [[ "$sorted" == true ]]; then -f coo \ --assume-sorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" else cooler dump -t pixels "$ref_cooler" | @@ -118,8 +118,8 @@ else -f coo \ --assume-unsorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" fi From 13865cb2209087d9f6e3dcc54d09b92870b67575 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:08:14 +0100 Subject: [PATCH 15/29] Add script to generate bin tables with variable bin sizes --- test/scripts/segment_genome_random.py | 70 +++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 test/scripts/segment_genome_random.py diff --git a/test/scripts/segment_genome_random.py b/test/scripts/segment_genome_random.py new file mode 100755 index 00000000..30a4b7c8 --- /dev/null +++ b/test/scripts/segment_genome_random.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import random +import argparse +import pathlib +import pandas as pd + + +def make_cli(): + def positive_int(arg): + if (n := int(arg)) > 0: + return n + + raise ValueError("Not a positive integer") + + cli = argparse.ArgumentParser() + + cli.add_argument( + "chrom-sizes", + type=pathlib.Path, + help="Path to a .chrom.sizes file.", + ) + + cli.add_argument( + "bin-size", + type=positive_int, + help="Average bin size.", + ) + + cli.add_argument( + "--seed", + type=positive_int, + default=2074288341, + help="Seed used to initialize the PRNG.", + ) + + return cli + + +def read_chrom_sizes(path_to_chrom_sizes: pathlib.Path): + return pd.read_table(path_to_chrom_sizes, names=["chrom", "length"]) + + +def segment_chromosome(chrom: str, size: int, bin_size: int): + start = 0 + end = min(start + int(random.gauss(bin_size, 0.1 * bin_size)), size) + + print(f"{chrom}\t{start}\t{end}") + + while end < size: + start = end + end = min(start + int(random.gauss(bin_size, 0.1 * bin_size)), size) + print(f"{chrom}\t{start}\t{end}") + + +def main(): + args = vars(make_cli().parse_args()) + random.seed(args["seed"]) + + chrom_sizes = read_chrom_sizes(args["chrom-sizes"]) + for _, (chrom, size) in chrom_sizes.iterrows(): + segment_chromosome(chrom, size, args["bin-size"]) + + +if __name__ == "__main__": + main() From 8d692dda232199c37669d2c7b2b1471a216bce15 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:11:37 +0100 Subject: [PATCH 16/29] Update test dataset --- cmake/FetchTestDataset.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index c47596be..94900f53 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/records/10276826/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=7e698deefc4bf090b5bcf3966f22b81770d7b48976c884e1d69c45bf83f17fa7 + DOWNLOAD https://zenodo.org/records/10289491/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=5e69dceb8789d923a38aed7add8fc18abfdfe531aea6effcdb7efe3c9bcf5246 "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on From 003d0f49983d6bb62d2890e0108dd558960b3d7c Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:41:23 +0100 Subject: [PATCH 17/29] Fix off-by-one bug --- src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp | 2 +- test/units/pixel/pixel_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index 3325fe2d..0512cc78 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -257,7 +257,7 @@ inline std::size_t PixelSelector::estimate_optimal_cache_size( const std::size_t first_bin_id = 0; const std::size_t last_bin_id = - bins().at(coord1().bin1.chrom(), coord1().bin1.chrom().size()).rel_id() - 1; + bins().at(coord1().bin1.chrom(), coord1().bin1.chrom().size() - 1).rel_id() - 1; const auto samples = (std::min)(num_samples, bins().subset(coord1().bin1.chrom()).size()); for (std::size_t i = 0; i < samples; ++i) { const auto bin_id = diff --git a/test/units/pixel/pixel_test.cpp b/test/units/pixel/pixel_test.cpp index 520dc7db..1f7efff3 100644 --- a/test/units/pixel/pixel_test.cpp +++ b/test/units/pixel/pixel_test.cpp @@ -202,7 +202,7 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { SECTION("validpair") { SECTION("valid") { CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t5\t+\tchr1\t15\t-")); - CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t248956422\t+\tchr1\t248956422\t-") == + CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t248956421\t+\tchr1\t248956421\t-") == expected2); } From af0644cb4513cff180771f2ca92faab8937dc087 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 20:58:09 +0100 Subject: [PATCH 18/29] Bugfix --- test/scripts/hictk_load_coo.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/scripts/hictk_load_coo.sh b/test/scripts/hictk_load_coo.sh index 0d46f9c7..5aa301b6 100755 --- a/test/scripts/hictk_load_coo.sh +++ b/test/scripts/hictk_load_coo.sh @@ -109,6 +109,7 @@ if [[ "$sorted" == true ]]; then --assume-sorted \ --batch-size 1000000 \ --bin-size 10000 \ + --zero-based \ "$outdir/chrom.sizes" \ "$outdir/out.cool" else @@ -119,6 +120,7 @@ else --assume-unsorted \ --batch-size 1000000 \ --bin-size 10000 \ + --zero-based \ "$outdir/chrom.sizes" \ "$outdir/out.cool" fi From 3c48ee34c05ae69bbe57fac06ed75a2a06f08fda Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 21:04:04 +0100 Subject: [PATCH 19/29] Set sensible defaults for one/zero based coords --- src/hictk/cli/cli_load.cpp | 13 +++++++++++-- test/scripts/hictk_load_coo.sh | 2 -- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index b9ee81d2..ba1b5de8 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -76,7 +76,9 @@ void Cli::make_load_subcommand() { sc.add_flag( "--one-based,!--zero-based", c.one_based, - "Interpret genomic coordinates or bins as one/zero based."); + "Interpret genomic coordinates or bins as one/zero based.\n" + "By default coordinates are assumed to be one-based for interactions in\n" + "4dn and validapairs formats and zero-based otherwise."); sc.add_flag( "--count-as-float", @@ -152,8 +154,15 @@ void Cli::validate_load_subcommand() const { void Cli::transform_args_load_subcommand() { auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("load"); - c.offset = c.one_based ? -1 : 0; + if (sc.get_option("--one-based")->empty()) { + if (c.format == "4dn" || c.format == "validpairs") { + c.offset = -1; + } + } else { + c.offset = c.one_based ? -1 : 0; + } // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); diff --git a/test/scripts/hictk_load_coo.sh b/test/scripts/hictk_load_coo.sh index 5aa301b6..0d46f9c7 100755 --- a/test/scripts/hictk_load_coo.sh +++ b/test/scripts/hictk_load_coo.sh @@ -109,7 +109,6 @@ if [[ "$sorted" == true ]]; then --assume-sorted \ --batch-size 1000000 \ --bin-size 10000 \ - --zero-based \ "$outdir/chrom.sizes" \ "$outdir/out.cool" else @@ -120,7 +119,6 @@ else --assume-unsorted \ --batch-size 1000000 \ --bin-size 10000 \ - --zero-based \ "$outdir/chrom.sizes" \ "$outdir/out.cool" fi From 1f89d83dfca62814ceef8ed1494abefebdfa9deb Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 21:47:23 +0100 Subject: [PATCH 20/29] Bugfix --- .github/workflows/codecov.yml | 3 +-- .github/workflows/macos-ci.yml | 8 ++------ .github/workflows/ubuntu-ci.yml | 8 ++------ 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 6b7c9cff..538a9bcd 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -212,8 +212,7 @@ jobs: test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted test/scripts/hictk_load_bg2.sh build/src/hictk/hictk unsorted - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk sorted - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk unsorted + test/scripts/hictk_load_4dn.sh build/src/hictk/hictk test/scripts/hictk_merge.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index eaf25d58..35e1202f 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -430,13 +430,9 @@ jobs: run: | test/scripts/hictk_load_bg2.sh bin/hictk unsorted - - name: Test hictk load 4dn sorted + - name: Test hictk load 4dn run: | - test/scripts/hictk_load_4dn.sh bin/hictk sorted - - - name: Test hictk load 4dn unsorted - run: | - test/scripts/hictk_load_4dn.sh bin/hictk unsorted + test/scripts/hictk_load_4dn.sh bin/hictk - name: Test hictk merge run: | diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index e06ea125..788f1695 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -483,13 +483,9 @@ jobs: run: | test/scripts/hictk_load_bg2.sh bin/hictk unsorted - - name: Test hictk load 4dn sorted + - name: Test hictk load 4dn run: | - test/scripts/hictk_load_4dn.sh bin/hictk sorted - - - name: Test hictk load 4dn unsorted - run: | - test/scripts/hictk_load_4dn.sh bin/hictk unsorted + test/scripts/hictk_load_4dn.sh bin/hictk - name: Test hictk merge run: | From ca1fa65779dec0caab6ae8bb2ac639e3e1363aa4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:29:39 +0100 Subject: [PATCH 21/29] Bugfix --- src/hictk/load/load_pairs.hpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/hictk/load/load_pairs.hpp b/src/hictk/load/load_pairs.hpp index f4a07e23..05101e54 100644 --- a/src/hictk/load/load_pairs.hpp +++ b/src/hictk/load/load_pairs.hpp @@ -86,6 +86,15 @@ class PairsAggregator { return p; } + inline void insert_or_update(const ThinPixel& pixel) { + auto node = _buffer.find(pixel); + if (node != _buffer.end()) { + node->count += pixel.count; + } else { + _buffer.emplace(pixel); + } + } + inline void read_next_batch(std::size_t batch_size) { _buffer.clear(); @@ -94,14 +103,11 @@ class PairsAggregator { if (!pixel) { break; } - auto node = _buffer.find(pixel); - if (node != _buffer.end()) { - node->count += pixel.count; - } else { - _buffer.emplace(pixel); - } - if (_buffer.size() == batch_size) { + insert_or_update(pixel); + + if (_buffer.size() == batch_size - 1) { + insert_or_update(_last_pixel); break; } } From 4e885258bb146d494ef8f548efc9b1d01a2bf55a Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 7 Dec 2023 23:28:02 +0100 Subject: [PATCH 22/29] Fix GCC12 builds --- src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp | 3 +++ src/libhictk/common/include/hictk/suppress_warnings.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 4b8a5e19..7715db31 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -100,7 +100,10 @@ inline auto BinTable::find_overlap(const GenomicInterval &query) const return std::visit( [&](const auto &t) { auto its = t.find_overlap(query); + DISABLE_WARNING_PUSH + DISABLE_WARNING_MAYBE_UNINITIALIZED return std::make_pair(iterator{its.first}, iterator{its.second}); + DISABLE_WARNING_POP }, _table); } diff --git a/src/libhictk/common/include/hictk/suppress_warnings.hpp b/src/libhictk/common/include/hictk/suppress_warnings.hpp index 71bc934e..f8be4c68 100644 --- a/src/libhictk/common/include/hictk/suppress_warnings.hpp +++ b/src/libhictk/common/include/hictk/suppress_warnings.hpp @@ -16,6 +16,7 @@ #define DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber)) #define DISABLE_WARNING_DEPRECATED_DECLARATIONS + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_NULL_DEREF #define DISABLE_WARNING_USELESS_CAST #define DISABLE_WARNING_SIGN_COMPARE @@ -30,6 +31,7 @@ #define DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored warningName) // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING("-Wdeprecated-declarations") // NOLINT(cppcoreguidelines-macro-usage) + #define DISABLE_WARNING_MAYBE_UNINITIALIZED DISABLE_WARNING("-Wmaybe-uninitialized") // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_NULL_DEREF DISABLE_WARNING("-Wnull-dereference") // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_SIGN_COMPARE DISABLE_WARNING("-Wsign-compare") #define DISABLE_WARNING_UNREACHABLE_CODE @@ -52,6 +54,7 @@ #define DISABLE_WARNING_POP #define DISABLE_WARNING_DEPRECATED_DECLARATIONS + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_NULL_DEREF #define DISABLE_WARNING_USELESS_CAST #define DISABLE_WARNING_SIGN_COMPARE From b03ce51de2072f683db728241cb41a4d90baa392 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 00:14:10 +0100 Subject: [PATCH 23/29] Fix clang builds --- src/libhictk/common/include/hictk/suppress_warnings.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/libhictk/common/include/hictk/suppress_warnings.hpp b/src/libhictk/common/include/hictk/suppress_warnings.hpp index f8be4c68..b589a760 100644 --- a/src/libhictk/common/include/hictk/suppress_warnings.hpp +++ b/src/libhictk/common/include/hictk/suppress_warnings.hpp @@ -31,7 +31,6 @@ #define DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored warningName) // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING("-Wdeprecated-declarations") // NOLINT(cppcoreguidelines-macro-usage) - #define DISABLE_WARNING_MAYBE_UNINITIALIZED DISABLE_WARNING("-Wmaybe-uninitialized") // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_NULL_DEREF DISABLE_WARNING("-Wnull-dereference") // NOLINT(cppcoreguidelines-macro-usage) #define DISABLE_WARNING_SIGN_COMPARE DISABLE_WARNING("-Wsign-compare") #define DISABLE_WARNING_UNREACHABLE_CODE @@ -39,11 +38,13 @@ // Defines for GCC only #if defined(__GNUC__) && !defined(__clang__) - #define DISABLE_WARNING_USELESS_CAST DISABLE_WARNING("-Wuseless-cast") // NOLINT(cppcoreguidelines-macro-usage) + #define DISABLE_WARNING_MAYBE_UNINITIALIZED DISABLE_WARNING("-Wmaybe-uninitialized") // NOLINT(cppcoreguidelines-macro-usage) + #define DISABLE_WARNING_USELESS_CAST DISABLE_WARNING("-Wuseless-cast") // NOLINT(cppcoreguidelines-macro-usage) #endif // Defines for Clang only #ifdef __clang__ + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_USELESS_CAST #endif From 1cc71df259c2cc42ed47d8412092a64c454f1498 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:07:46 +0100 Subject: [PATCH 24/29] Address clang-tidy warnings --- src/hictk/load/load.cpp | 4 ++-- .../include/hictk/cooler/impl/singlecell_cooler_impl.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 87248424..aa86f5f7 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -206,8 +206,8 @@ int load_subcmd(const LoadConfig& c) { [&](const auto& sum) { SPDLOG_INFO(FMT_STRING("ingested {} interactions ({} nnz) in {}s!"), sum, clr.nnz(), static_cast(delta) / 1.0e9); - }, - clr.attributes().sum.value()); + }, // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + *clr.attributes().sum); return 0; } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 5d6483f4..19190c93 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -264,7 +264,7 @@ inline BinTable SingleCellFile::init_bin_table(const HighFive::File& f) { return {std::move(chroms), Dataset(root_grp, f.getDataSet("bins/start")).read_all>(), Dataset(root_grp, f.getDataSet("bins/end")).read_all>()}; -} +} // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) inline phmap::btree_set SingleCellFile::read_cells(const HighFive::File& f) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT From a0036825820595cd5082c9f58be55c2e9ebc0fbd Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 11:58:06 +0100 Subject: [PATCH 25/29] Validate bins upon constructing an instance of BinTableVariable --- .../include/hictk/bin_table_variable.hpp | 4 ++ .../hictk/impl/bin_table_variable_impl.hpp | 50 +++++++++++++++++++ test/units/bin_table/bin_table_test.cpp | 43 ++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp index 3589ba65..0016410e 100644 --- a/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp +++ b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp @@ -77,6 +77,10 @@ class BinTableVariable { [[nodiscard]] bool operator==(const BinTableVariable &other) const; [[nodiscard]] bool operator!=(const BinTableVariable &other) const; + private: + static void validate_bin_coords(const std::vector &start_pos, const std::vector &end_pos); + + public: class iterator { friend BinTableVariable; Bin _value{}; diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp index 9f1e14c7..8d3a70c9 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp @@ -32,6 +32,8 @@ inline BinTableVariable::BinTableVariable(Reference chroms, const std::vector return; } + validate_bin_coords(start_pos, end_pos); + _bin_end_prefix_sum.reserve(start_pos.size() + 1); _bin_end_prefix_sum.push_back(end_pos.front()); @@ -42,6 +44,12 @@ inline BinTableVariable::BinTableVariable(Reference chroms, const std::vector _bin_end_prefix_sum.push_back(_bin_end_prefix_sum.back() + end_pos[i] - start_pos[i]); } _num_bins_prefix_sum.push_back(_bin_end_prefix_sum.size() - 1); + if (_num_bins_prefix_sum.size() != _chroms.size() + 1) { + throw std::runtime_error( + fmt::format(FMT_STRING("Bin table contains an unexpected number of chromosomes: expected " + "{} chromosomes, found {}."), + _chroms.size(), _num_bins_prefix_sum.size() - 1)); + } } template @@ -272,6 +280,48 @@ inline std::uint64_t BinTableVariable::map_to_bin_id(std::uint32_t chrom_id, return map_to_bin_id(_chroms.at(chrom_id), pos); } +template +inline void BinTableVariable::validate_bin_coords(const std::vector &start_pos, + const std::vector &end_pos) { + if (start_pos.front() != 0) { + throw std::runtime_error("Bin table does not start from zero"); + } + + for (std::size_t i = 1; i < start_pos.size(); ++i) { + const auto s1 = start_pos[i - 1]; + const auto s2 = start_pos[i]; + const auto e1 = end_pos[i - 1]; + const auto e2 = end_pos[i]; + + if (s1 >= e1) { + throw std::runtime_error(fmt::format( + FMT_STRING("Bin #{} is not valid: start_pos >= end_pos: {} >= {}"), i, s1, e1)); + } + + if (s1 >= s2 && s2 != 0) { + throw std::runtime_error(fmt::format(FMT_STRING("Bin table is not sorted in ascending order: " + "bin #{} >= bin #{} (start_pos {} >= {})"), + i - 1, i, s1, s2)); + } + + if (e1 >= e2 && s2 != 0) { + throw std::runtime_error(fmt::format(FMT_STRING("Bin table is not sorted in ascending order: " + "bin #{} >= bin #{} (end_pos {} >= {})"), + i - 1, i, e1, e2)); + } + + if (e1 != s2 && s2 != 0) { + throw std::runtime_error( + fmt::format(FMT_STRING("Detected a gap between bins #{} and #{}."), i - 1, i)); + } + } + if (start_pos.back() >= end_pos.back()) { + throw std::runtime_error( + fmt::format(FMT_STRING("Bin #{} is not valid: start_pos >= end_pos: {} >= {}"), + start_pos.size() - 1, start_pos.back(), end_pos.back())); + } +} + template inline BinTableVariable::iterator::iterator(const BinTableVariable &bin_table) noexcept : _bin_table{&bin_table} { diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index d346600c..3be5c752 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -282,6 +282,49 @@ TEST_CASE("BinTable (variable bins)", "[bin-table][short]") { CHECK_THROWS(table.get()); } + SECTION("invalid bins") { + SECTION("bins out of order") { + const std::vector start_pos1{0, 8, 7}; + const std::vector end_pos1{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("not sorted")); + + const std::vector start_pos2{0, 8, 15}; + const std::vector end_pos2{8, 15, 14}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos2, end_pos2), + Catch::Matchers::ContainsSubstring("not sorted")); + } + SECTION("gap between bins") { + const std::vector start_pos1{0, 8, 16}; + const std::vector end_pos1{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("gap between bins")); + + const std::vector start_pos2{1, 8, 16}; + const std::vector end_pos2{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos2, end_pos2), + Catch::Matchers::ContainsSubstring("does not start from zero")); + } + + SECTION("start pos >= end pos") { + const std::vector start_pos1{0, 8, 10, 15}; + const std::vector end_pos1{0, 10, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("start_pos >= end_pos")); + } + + SECTION("number of chromosome mismatch") { + const Chromosome chrom3{2, "chr3", 32}; + CHECK_THROWS_WITH(BinTable({chrom1, chrom2, chrom3}, start_pos, end_pos), + Catch::Matchers::ContainsSubstring("unexpected number of chromosomes")); + } + } + SECTION("iterators") { const auto& chr1 = table.chromosomes().at("chr1"); const auto& chr2 = table.chromosomes().at("chr2"); From 8017d5648318d8cd1cd60e8c7b661c80b5ae30e3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:21:34 +0100 Subject: [PATCH 26/29] Update hictk merge to support coolers with variable bin size --- .../include/hictk/impl/bin_table_impl.hpp | 9 +++++-- .../hictk/cooler/impl/utils_merge_impl.hpp | 12 ++++++++-- test/units/bin_table/bin_table_test.cpp | 24 +++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index 7715db31..2ca0e05c 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -172,8 +172,13 @@ inline std::uint64_t BinTable::map_to_bin_id(std::uint32_t chrom_id, std::uint32 inline bool BinTable::operator==(const BinTable &other) const { return std::visit( [&](const auto &t1) { - const auto &t2 = std::get>(other._table); - return t1 == t2; + try { + using BinTableT = remove_cvref_t; + const auto &t2 = std::get(other._table); + return t1 == t2; + } catch (const std::bad_variant_access &) { + return false; + } }, _table); } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp index c9627243..c02b4156 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp @@ -39,13 +39,21 @@ template inline void validate_bin_size(const std::vector>& coolers) { assert(coolers.size() > 1); const auto& clr1 = coolers.front(); + const auto bin_table1 = cooler::File(clr1.uri).bins(); for (std::size_t i = 1; i < coolers.size(); ++i) { const auto& clr2 = coolers[i]; - if (clr1.bin_size != clr2.bin_size) { + + if (clr1.bin_size == 0 || clr2.bin_size == 0) { // Table(s) with variable bin size + const auto bin_table2 = cooler::File(clr2.uri).bins(); + if (bin_table1 != bin_table2) { + throw std::runtime_error(fmt::format( + FMT_STRING("cooler \"{}\" and \"{}\" have different bin tables"), clr1.uri, clr2.uri)); + } + } else if (clr1.bin_size != clr2.bin_size) { throw std::runtime_error(fmt::format( FMT_STRING( - "cooler \"{}\" and \"{}\" have different resolutions ({} and {} respectively)"), + "cooler \"{}\" and \"{}\" have different resolutions ({} and {} respectively)"), clr1.uri, clr2.uri, clr1.bin_size, clr2.bin_size)); } } diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index 3be5c752..65e8dfb9 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -100,6 +100,13 @@ TEST_CASE("BinTable (fixed bins)", "[bin-table][short]") { CHECK_THROWS(table.get>()); } + SECTION("operator==") { + CHECK(BinTable(table.chromosomes(), 10) == BinTable(table.chromosomes(), 10)); + CHECK(BinTable(table.chromosomes(), 10) != BinTable(table.chromosomes(), 20)); + CHECK(BinTable(Reference{table.chromosomes().begin(), table.chromosomes().end() - 1}, 10) != + BinTable(table.chromosomes(), 10)); + } + SECTION("iterators") { const auto& chr1 = table.chromosomes().at("chr1"); const auto& chr2 = table.chromosomes().at("chr2"); @@ -325,6 +332,23 @@ TEST_CASE("BinTable (variable bins)", "[bin-table][short]") { } } + SECTION("operator==") { + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) == + BinTable(table.chromosomes(), start_pos, end_pos)); + + const std::vector start_pos1{0, 0}; + const std::vector end_pos1{32, 32}; + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) != + BinTable(table.chromosomes(), start_pos1, end_pos1)); + + const std::vector start_pos2{0}; + const std::vector end_pos2{32}; + CHECK(BinTable(Reference{table.chromosomes().begin(), table.chromosomes().end() - 1}, + start_pos2, end_pos2) != BinTable(table.chromosomes(), start_pos, end_pos)); + + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) != BinTable(table.chromosomes(), 10)); + } + SECTION("iterators") { const auto& chr1 = table.chromosomes().at("chr1"); const auto& chr2 = table.chromosomes().at("chr2"); From e1213636f7c0bcab89de297f5372a9744189eef5 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:26:08 +0100 Subject: [PATCH 27/29] Update hictk convert to prevent conversion of cooler with variable bin size --- src/hictk/convert/cool_to_hic.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index af723ceb..8b403b78 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -314,6 +314,12 @@ void cool_to_hic(const ConvertConfig& c) { c.path_to_input.string(), c.resolutions.front()); const cooler::File clr(uri); + + if (clr.bin_size() == 0) { + throw std::runtime_error( + "converting cooler files with variable bin size is not supported."); + } + dump_chrom_sizes(clr, chrom_sizes); dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads); } From 05dc1bbe826c14b51aeeb61609840402ac27423e Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:39:43 +0100 Subject: [PATCH 28/29] Update hictk zoomify to prevent processing of coolers with variable bin size --- src/hictk/cli/cli_zoomify.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/hictk/cli/cli_zoomify.cpp b/src/hictk/cli/cli_zoomify.cpp index 9c5fc534..fdfedac2 100644 --- a/src/hictk/cli/cli_zoomify.cpp +++ b/src/hictk/cli/cli_zoomify.cpp @@ -145,6 +145,12 @@ void Cli::validate_zoomify_subcommand() const { "--resolutions."); } + if (clr.bin_size() == 0) { // Variable bin size + errors.clear(); + warnings.clear(); + errors.emplace_back("zoomifying files with variable bin size is not currently supported."); + } + for (const auto& w : warnings) { SPDLOG_WARN(FMT_STRING("{}"), w); } From 80965a7f01cbbe58e24b8e2bd2093752bb8300ea Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:48:55 +0100 Subject: [PATCH 29/29] Address clang-tidy warnings --- test/units/cooler/file_ctors_test.cpp | 4 ++-- test/units/cooler/file_pixels_test.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/units/cooler/file_ctors_test.cpp b/test/units/cooler/file_ctors_test.cpp index e8febbb8..bd99cd95 100644 --- a/test/units/cooler/file_ctors_test.cpp +++ b/test/units/cooler/file_ctors_test.cpp @@ -24,7 +24,7 @@ TEST_CASE("Cooler: init files", "[cooler][short]") { std::ignore = File::create(path.string(), chroms, bin_size, true); CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); - CHECK(File(path.string()).attributes().bin_type.value() == "fixed"); + CHECK(File(path.string()).attributes().bin_type.value() == "fixed"); // NOLINT } SECTION("variable bins") { @@ -41,7 +41,7 @@ TEST_CASE("Cooler: init files", "[cooler][short]") { std::ignore = File::create(path.string(), table, true); CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); - CHECK(File(path.string()).attributes().bin_type.value() == "variable"); + CHECK(File(path.string()).attributes().bin_type.value() == "variable"); // NOLINT } } diff --git a/test/units/cooler/file_pixels_test.cpp b/test/units/cooler/file_pixels_test.cpp index 9d224836..6f9d3b1a 100644 --- a/test/units/cooler/file_pixels_test.cpp +++ b/test/units/cooler/file_pixels_test.cpp @@ -29,7 +29,7 @@ TEST_CASE("Cooler: read/write pixels", "[cooler][long]") { std::mt19937_64 rand_eng{rd()}; auto pixel_it = expected.begin(); - do { + do { // NOLINT(cppcoreguidelines-avoid-do-while) const auto diff = std::distance(pixel_it, expected.end()); // Write pixels in chunks of random size const auto offset =