From 63d271cfe1642401a42f0d64da9e86798b13947e Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 1 Dec 2023 12:38:31 +0100 Subject: [PATCH 1/8] Initial implementation of --cis/trans-only flags for hictk dump --- src/hictk/cli/cli_dump.cpp | 17 ++ src/hictk/dump/dump.cpp | 146 +++++++++++++++++- src/hictk/include/hictk/tools/config.hpp | 3 + src/libhictk/pixel/include/hictk/pixel.hpp | 9 ++ .../impl/join_genomic_coords_impl.hpp | 4 +- 5 files changed, 177 insertions(+), 2 deletions(-) diff --git a/src/hictk/cli/cli_dump.cpp b/src/hictk/cli/cli_dump.cpp index aa606c90..b0da739e 100644 --- a/src/hictk/cli/cli_dump.cpp +++ b/src/hictk/cli/cli_dump.cpp @@ -83,6 +83,16 @@ void Cli::make_dump_subcommand() { ->check(CLI::ExistingFile | CLI::IsMember({"-"})) ->capture_default_str(); + sc.add_flag( + "--cis-only", + c.cis_only, + "Dump intra-chromosomal interactions only."); + + sc.add_flag( + "--trans-only", + c.trans_only, + "Dump inter-chromosomal interactions only."); + sc.add_option( "-b,--balance", c.normalization, @@ -113,6 +123,9 @@ void Cli::make_dump_subcommand() { sc.get_option("--range2")->needs(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range2")); + sc.get_option("--query-file")->excludes(sc.get_option("--cis-only")); + sc.get_option("--query-file")->excludes(sc.get_option("--trans-only")); + sc.get_option("--cis-only")->excludes(sc.get_option("--trans-only")); _config = std::monostate{}; } @@ -168,6 +181,10 @@ void Cli::validate_dump_subcommand() const { errors.emplace_back("--matrix-type=FRAG is not yet supported."); } + if ((c.cis_only || c.trans_only) && c.table != "pixels") { + errors.emplace_back("--cis-only and --trans-only require --table=pixels."); + } + for (const auto& w : warnings) { SPDLOG_WARN(FMT_STRING("{}"), w); } diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index 014beb35..409f54be 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -108,6 +108,89 @@ static void dump_pixels(hic::File& f, std::string_view range1, std::string_view print_pixels(jsel.begin(), jsel.end()); } +static void dump_pixels_trans_only_sorted(const cooler::File& f, std::string_view normalization, + bool join) { + auto weights = f.read_weights(balancing::Method{normalization}); + + using It = decltype(f.fetch("chr1", "chr2").template begin()); + + const auto chroms = f.chromosomes(); + for (std::uint32_t i = 0; i < chroms.size(); ++i) { + std::vector heads{}; + std::vector tails{}; + for (std::uint32_t j = i + 1; j < chroms.size(); ++j) { + const auto& chrom1 = chroms[i]; + const auto& chrom2 = chroms[j]; + + if (chrom1.is_all() || chrom2.is_all()) { + continue; + } + + auto sel = f.fetch(chrom1.name(), chrom2.name(), weights); + heads.emplace_back(sel.begin()); + tails.emplace_back(sel.end()); + } + + internal::PixelMerger merger(std::move(heads), std::move(tails)); + + if (!join) { + print_pixels(merger.begin(), merger.end()); + continue; + } + + auto jsel = transformers::JoinGenomicCoords(merger.begin(), merger.end(), f.bins_ptr()); + print_pixels(jsel.begin(), jsel.end()); + } +} + +static void dump_pixels_trans_only_sorted(hic::File& f, std::string_view normalization, bool join) { + auto norm = balancing::Method{std::string{normalization}}; + + std::vector selectors; + + for (std::uint32_t chrom1_id = 0; chrom1_id < f.chromosomes().size(); ++chrom1_id) { + const auto& chrom1 = f.chromosomes().at(chrom1_id); + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < f.chromosomes().size(); ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + try { + auto sel = f.fetch(chrom1.name(), chrom2.name(), norm); + if (!sel.empty()) { + selectors.emplace_back(std::move(sel)); + } + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + const auto missing_norm = msg.find("unable to find") != std::string_view::npos && + msg.find("normalization vector") != std::string_view::npos; + if (!missing_norm) { + throw; + } + } + } + } + + if (selectors.empty()) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to find {} normalization vectors at {} ({})"), + norm.to_string(), f.resolution(), hic::MatrixUnit::BP)); + } + + hic::PixelSelectorAll sel(std::move(selectors)); + + if (!join) { + print_pixels(sel.template begin(), sel.template end()); + } + + auto jsel = transformers::JoinGenomicCoords(sel.template begin(), + sel.template end(), f.bins_ptr()); + print_pixels(jsel.begin(), jsel.end()); +} + static void process_query(File& f, std::string_view table, std::string_view range1, std::string_view range2, std::string_view normalization, bool join, bool sorted) { @@ -120,6 +203,55 @@ static void process_query(File& f, std::string_view table, std::string_view rang f.get()); } +static void process_query_cis_only(File& f, std::string_view normalization, bool join, + bool sorted) { + std::visit( + [&](auto& ff) { + for (const auto& chrom : ff.chromosomes()) { + if (chrom.is_all()) { + continue; + } + dump_pixels(ff, chrom.name(), chrom.name(), normalization, join, sorted); + } + }, + f.get()); +} + +static void process_query_trans_only_unsorted(File& f, std::string_view normalization, bool join) { + std::visit( + [&](auto& ff) { + const auto& chromosomes = ff.chromosomes(); + for (std::uint32_t chrom1_id = 0; chrom1_id < chromosomes.size(); ++chrom1_id) { + const auto& chrom1 = chromosomes[chrom1_id]; + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < chromosomes.size(); + ++chrom2_id) { + const auto& chrom2 = chromosomes[chrom2_id]; + if (chrom2.is_all()) { + continue; + } + + dump_pixels(ff, chrom1.name(), chrom2.name(), normalization, join, false); + } + } + }, + f.get()); +} + +static void process_query_trans_only_sorted(File& f, std::string_view normalization, bool join) { + std::visit([&](auto& ff) { dump_pixels_trans_only_sorted(ff, normalization, join); }, f.get()); +} + +static void process_query_trans_only(File& f, std::string_view normalization, bool join, + bool sorted) { + if (sorted) { + return process_query_trans_only_sorted(f, normalization, join); + } + return process_query_trans_only_unsorted(f, normalization, join); +} + static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) { Reference ref{}; @@ -244,11 +376,23 @@ static int dump_cells(std::string_view uri, std::string_view format) { static int dump_tables(const DumpConfig& c) { hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit}; - if (c.query_file.empty()) { + if (c.query_file.empty() && !c.cis_only && !c.trans_only) { process_query(f, c.table, c.range1, c.range2, c.normalization, c.join, c.sorted); return 0; } + if (c.cis_only) { + assert(c.table == "pixels"); + process_query_cis_only(f, c.normalization, c.join, c.sorted); + return 0; + } + + if (c.trans_only) { + assert(c.table == "pixels"); + process_query_trans_only(f, c.normalization, c.join, c.sorted); + return 0; + } + const auto read_from_stdin = c.query_file == "-"; std::ifstream ifs{}; ifs.exceptions(ifs.exceptions() | std::ios_base::badbit | std::ios_base::failbit); diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..39371439 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -73,6 +73,9 @@ struct DumpConfig { bool join{false}; bool sorted{true}; + bool cis_only{false}; + bool trans_only{false}; + std::string normalization{"NONE"}; std::string weight_type{"infer"}; hic::MatrixType matrix_type{hic::MatrixType::observed}; diff --git a/src/libhictk/pixel/include/hictk/pixel.hpp b/src/libhictk/pixel/include/hictk/pixel.hpp index c4c596b7..7afab0a1 100644 --- a/src/libhictk/pixel/include/hictk/pixel.hpp +++ b/src/libhictk/pixel/include/hictk/pixel.hpp @@ -13,6 +13,7 @@ #include #include "hictk/bin_table.hpp" +#include "hictk/type_traits.hpp" namespace hictk { @@ -131,6 +132,14 @@ class PixelMerger { ThinPixel _value{}; public: + using difference_type = std::ptrdiff_t; + using value_type = remove_cvref_t())>; + using pointer = value_type *; + using const_pointer = const value_type *; + using reference = value_type &; + using const_reference = const value_type &; + using iterator_category = std::forward_iterator_tag; + iterator() = default; explicit iterator(PixelMerger &merger); diff --git a/src/libhictk/transformers/include/hictk/transformers/impl/join_genomic_coords_impl.hpp b/src/libhictk/transformers/include/hictk/transformers/impl/join_genomic_coords_impl.hpp index 91273336..0584747a 100644 --- a/src/libhictk/transformers/include/hictk/transformers/impl/join_genomic_coords_impl.hpp +++ b/src/libhictk/transformers/include/hictk/transformers/impl/join_genomic_coords_impl.hpp @@ -78,13 +78,15 @@ inline auto JoinGenomicCoords::iterator::operator->() const -> const_po template inline auto JoinGenomicCoords::iterator::operator++() -> iterator& { - ++_it; + std::ignore = ++_it; return *this; } + template inline auto JoinGenomicCoords::iterator::operator++(int) -> iterator { auto it = *this; std::ignore = ++_it; return it; } + } // namespace hictk::transformers From 08c9e5cf9160f6b0a141737ae7ff937a6b85f9d8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:02:41 +0100 Subject: [PATCH 2/8] Address segfault when merging iterators of hic pixels The segfault occurred when storing a relatively large number of iterators in a vector. If the vector was created by repeatedly calling emplace_back(), and one or more iterators were generated immediately after, the pointer to the hic::PixelSelector stored in hic::PixelSelector::iterator would become dangling. The segfault is solved by storing member variables of hic::PixelSelector as shared pointers, then passing these pointers when creating an instance of hic::PixelSelector::iterator<>. --- .../hictk/cooler/impl/pixel_selector_impl.hpp | 2 + .../include/hictk/cooler/pixel_selector.hpp | 2 + .../hictk/hic/impl/pixel_selector_impl.hpp | 182 ++++++++++++------ .../hic/include/hictk/hic/pixel_selector.hpp | 16 +- 4 files changed, 143 insertions(+), 59 deletions(-) diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/pixel_selector_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/pixel_selector_impl.hpp index 56833de3..2f74b0ee 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/pixel_selector_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/pixel_selector_impl.hpp @@ -99,6 +99,8 @@ inline auto PixelSelector::cend() const -> iterator { return iterator::at_end(_index, *_pixels_bin1_id, *_pixels_bin2_id, *_pixels_count, _weights); } +inline bool PixelSelector::empty() const { return begin() == end(); } + template inline std::vector> PixelSelector::read_all() const { // We push_back into buff to avoid traversing pixels twice (once to figure out the vector size, diff --git a/src/libhictk/cooler/include/hictk/cooler/pixel_selector.hpp b/src/libhictk/cooler/include/hictk/cooler/pixel_selector.hpp index a2f9cf96..eff50f8d 100644 --- a/src/libhictk/cooler/include/hictk/cooler/pixel_selector.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/pixel_selector.hpp @@ -63,6 +63,8 @@ class PixelSelector { template [[nodiscard]] auto cend() const -> iterator; + [[nodiscard]] bool empty() const; + template [[nodiscard]] std::vector> read_all() const; #ifdef HICTK_WITH_EIGEN diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index 3325fe2d..2cc1a3f0 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -33,10 +33,11 @@ inline PixelSelector::PixelSelector(std::shared_ptr hfs std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, PixelCoordinates coord2_) noexcept - : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_)), + : _reader(std::make_shared(std::move(hfs_), footer_->index(), + std::move(bins_), std::move(cache_))), _footer(std::move(footer_)), - _coord1(std::move(coord1_)), - _coord2(std::move(coord2_)) {} + _coord1(std::make_shared(std::move(coord1_))), + _coord2(std::make_shared(std::move(coord2_))) {} inline PixelSelector::~PixelSelector() noexcept { try { @@ -48,8 +49,8 @@ inline PixelSelector::~PixelSelector() noexcept { } inline bool PixelSelector::operator==(const PixelSelector &other) const noexcept { - return _reader.index().chrom1() == _reader.index().chrom2() && _coord1 == other._coord1 && - _coord2 == other._coord2; + return _reader->index().chrom1() == _reader->index().chrom2() && *_coord1 == *other._coord1 && + *_coord2 == *other._coord2; } inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept { @@ -63,7 +64,7 @@ inline auto PixelSelector::cbegin(bool sorted) const -> iterator { template inline auto PixelSelector::cend() const -> iterator { - return iterator::at_end(*this); + return iterator::at_end(_reader, _coord1, _coord2); } template @@ -110,7 +111,7 @@ inline ThinPixel PixelSelector::transform_pixel(ThinPixel pixel) const const auto expectedCount = [&]() { if (is_inter()) { - return float(_reader.avg()); + return float(_reader->avg()); } const auto i = (bin2 - bin1); @@ -197,19 +198,19 @@ template } #endif -inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return _coord1; } -inline const PixelCoordinates &PixelSelector::coord2() const noexcept { return _coord2; } +inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return *_coord1; } +inline const PixelCoordinates &PixelSelector::coord2() const noexcept { return *_coord2; } inline MatrixType PixelSelector::matrix_type() const noexcept { return metadata().matrix_type; } inline balancing::Method PixelSelector::normalization() const noexcept { return metadata().normalization; } -inline MatrixUnit PixelSelector::unit() const noexcept { return _reader.index().unit(); } +inline MatrixUnit PixelSelector::unit() const noexcept { return _reader->index().unit(); } inline std::uint32_t PixelSelector::resolution() const noexcept { - return _reader.index().resolution(); + return _reader->index().resolution(); } -inline const Chromosome &PixelSelector::chrom1() const noexcept { return _coord1.bin1.chrom(); } -inline const Chromosome &PixelSelector::chrom2() const noexcept { return _coord2.bin1.chrom(); } +inline const Chromosome &PixelSelector::chrom1() const noexcept { return _coord1->bin1.chrom(); } +inline const Chromosome &PixelSelector::chrom2() const noexcept { return _coord2->bin1.chrom(); } inline const balancing::Weights &PixelSelector::weights1() const noexcept { return _footer->weights1(); @@ -218,7 +219,7 @@ inline const balancing::Weights &PixelSelector::weights2() const noexcept { return _footer->weights2(); } -inline const BinTable &PixelSelector::bins() const noexcept { return _reader.bins(); } +inline const BinTable &PixelSelector::bins() const noexcept { return _reader->bins(); } inline const internal::HiCFooterMetadata &PixelSelector::metadata() const noexcept { assert(!!_footer); @@ -229,25 +230,25 @@ inline bool PixelSelector::is_inter() const noexcept { return !is_intra(); } inline bool PixelSelector::is_intra() const noexcept { return chrom1() == chrom2(); } -inline bool PixelSelector::empty() const noexcept { return _reader.index().empty(); } +inline bool PixelSelector::empty() const noexcept { return _reader->index().empty(); } inline std::size_t PixelSelector::estimate_optimal_cache_size( [[maybe_unused]] std::size_t num_samples) const { - if (_reader.index().empty()) { + if (_reader->index().empty()) { return 0; // should we throw instead? } - std::seed_seq sseq({_reader.index().size()}); + std::seed_seq sseq({_reader->index().size()}); std::mt19937_64 rand_eng(sseq); // Try to guess the average block size std::size_t avg_block_size = 0; + const auto &idx = _reader->index(); - std::vector blocks(std::min(_reader.index().size(), num_samples)); - std::sample(_reader.index().begin(), _reader.index().end(), blocks.begin(), blocks.size(), - rand_eng); + std::vector blocks(std::min(idx.size(), num_samples)); + std::sample(idx.begin(), idx.end(), blocks.begin(), blocks.size(), rand_eng); for (const auto &blki : blocks) { - avg_block_size += _reader.read_size(chrom1(), chrom2(), blki); + avg_block_size += _reader->read_size(chrom1(), chrom2(), blki); } avg_block_size /= blocks.size(); @@ -266,7 +267,7 @@ inline std::size_t PixelSelector::estimate_optimal_cache_size( const auto pos1 = static_cast(bin_id * bin_size); const auto bin1 = bins().at(coord1().bin1.chrom(), pos1); - auto overlap = _reader.index().find_overlaps({bin1, bin1}, coord2()); + auto overlap = idx.find_overlaps({bin1, bin1}, coord2()); const auto num_blocks = static_cast(std::distance(overlap.begin(), overlap.end())); max_blocks_per_row = (std::max)(max_blocks_per_row, num_blocks); } @@ -275,26 +276,29 @@ inline std::size_t PixelSelector::estimate_optimal_cache_size( } inline void PixelSelector::clear_cache() const { - if (_reader.cache_size() == 0) { + if (_reader->cache_size() == 0) { return; } - for (auto blki : _reader.index().find_overlaps(coord1(), coord2())) { - _reader.evict(coord1().bin1.chrom(), coord2().bin1.chrom(), blki); + for (auto blki : _reader->index().find_overlaps(coord1(), coord2())) { + _reader->evict(coord1().bin1.chrom(), coord2().bin1.chrom(), blki); } } template inline PixelSelector::iterator::iterator(const PixelSelector &sel, bool sorted) - : _sel(&sel), + : _reader(sel._reader), + _coord1(sel._coord1), + _coord2(sel._coord2), + _footer(sel._footer), _block_idx(std::make_shared( - sel._reader.index().find_overlaps(coord1(), coord2()))), + _reader->index().find_overlaps(coord1(), coord2()))), _block_blacklist(std::make_shared()), _block_it(_block_idx->begin()), _buffer(std::make_shared()), _bin1_id(coord1().bin1.rel_id()), _sorted(sorted) { - if (_sel->_reader.index().empty()) { - *this = at_end(sel); + if (_reader->index().empty()) { + *this = at_end(_reader, _coord1, _coord2); return; } @@ -304,10 +308,15 @@ inline PixelSelector::iterator::iterator(const PixelSelector &sel, bool sorte } template -inline auto PixelSelector::iterator::at_end(const PixelSelector &sel) -> iterator { +inline auto PixelSelector::iterator::at_end(std::shared_ptr reader, + std::shared_ptr coord1, + std::shared_ptr coord2) + -> iterator { iterator it{}; - it._sel = &sel; + it._reader = reader; + it._coord1 = coord1; + it._coord2 = coord2; it._buffer = nullptr; // end of queue return it; @@ -315,10 +324,15 @@ inline auto PixelSelector::iterator::at_end(const PixelSelector &sel) -> iter template inline bool PixelSelector::iterator::operator==(const iterator &other) const noexcept { - if (_sel != other._sel) { + const auto same_query = coord1() == other.coord1() && coord2() == other.coord2(); + + if (!same_query) { return false; } + assert(_reader->index().chrom1() == other._reader->index().chrom1()); + assert(_reader->index().chrom2() == other._reader->index().chrom2()); + const auto bin11 = bin1_id(); const auto bin12 = bin2_id(); const auto bin21 = other.bin1_id(); @@ -402,18 +416,18 @@ inline bool PixelSelector::iterator::is_at_end() const noexcept { template inline const BinTable &PixelSelector::iterator::bins() const noexcept { - assert(!!_sel); - return _sel->bins(); + assert(!!_reader); + return _reader->bins(); } template inline const PixelCoordinates &PixelSelector::iterator::coord1() const noexcept { - assert(!!_sel); - return _sel->coord1(); + assert(!!_coord1); + return *_coord1; } template inline const PixelCoordinates &PixelSelector::iterator::coord2() const noexcept { - assert(!!_sel); - return _sel->coord2(); + assert(!!_coord2); + return *_coord2; } template @@ -442,14 +456,15 @@ PixelSelector::iterator::find_blocks_overlapping_next_chunk(std::size_t num_b const auto coord1_ = PixelCoordinates(bins().at(coord1().bin1.chrom(), pos1), bins().at(coord1().bin1.chrom(), pos2)); - return _sel->_reader.index().find_overlaps(coord1_, coord2()); + return _reader->index().find_overlaps(coord1_, coord2()); } template inline std::uint32_t PixelSelector::iterator::compute_chunk_size() const noexcept { + assert(!!_reader); const auto bin_size = bins().bin_size(); const auto num_bins = std::min( - static_cast(_sel->_reader.index().block_bin_count()), + static_cast(_reader->index().block_bin_count()), static_cast(0.005 * double(coord1().bin2.rel_id() - coord1().bin1.rel_id()))); const auto end_pos = coord1().bin2.start(); @@ -461,13 +476,14 @@ inline std::uint32_t PixelSelector::iterator::compute_chunk_size() const noex template inline void PixelSelector::iterator::read_next_chunk() { + assert(!!_reader); if (!_sorted) { return read_next_chunk_unsorted(); } const auto is_intra = coord1().bin1.chrom() == coord2().bin1.chrom(); - if (_sel->_reader.index().version() > 8 && is_intra) { + if (_reader->index().version() > 8 && is_intra) { return read_next_chunk_v9_intra_sorted(); } read_next_chunk_sorted(); @@ -475,10 +491,10 @@ inline void PixelSelector::iterator::read_next_chunk() { template inline void PixelSelector::iterator::read_next_chunk_unsorted() { - assert(!!_sel); + assert(!!_reader); if (_block_it == _block_idx->end()) { - *this = at_end(*_sel); + *this = at_end(_reader, _coord1, _coord2); return; } @@ -495,7 +511,7 @@ inline void PixelSelector::iterator::read_next_chunk_unsorted() { const auto bin1_offset = bins().at(coord1().bin1.chrom()).id(); const auto bin2_offset = bins().at(coord2().bin1.chrom()).id(); - auto blk = *_sel->_reader.read(coord1().bin1.chrom(), coord2().bin1.chrom(), *_block_it++, false); + auto blk = *_reader->read(coord1().bin1.chrom(), coord2().bin1.chrom(), *_block_it++, false); for (auto p : blk) { if (static_cast(p.bin1_id) < bin1_lb || static_cast(p.bin1_id) > bin1_ub || @@ -504,7 +520,7 @@ inline void PixelSelector::iterator::read_next_chunk_unsorted() { continue; } - auto pt = _sel->transform_pixel(p); + auto pt = transform_pixel(p); pt.bin1_id += bin1_offset; pt.bin2_id += bin2_offset; _buffer->emplace_back(std::move(pt)); @@ -513,10 +529,10 @@ inline void PixelSelector::iterator::read_next_chunk_unsorted() { template inline void PixelSelector::iterator::read_next_chunk_sorted() { - assert(!!_sel); + assert(!!_reader); if (_block_it == _block_idx->end()) { - *this = at_end(*_sel); + *this = at_end(_reader, _coord1, _coord2); return; } @@ -535,7 +551,7 @@ inline void PixelSelector::iterator::read_next_chunk_sorted() { const auto bin2_offset = bins().at(coord2().bin1.chrom()).id(); auto first_blki = *_block_it; while (_block_it->coords().i1 == first_blki.coords().i1) { - auto blk = *_sel->_reader.read(coord1().bin1.chrom(), coord2().bin1.chrom(), *_block_it, false); + auto blk = *_reader->read(coord1().bin1.chrom(), coord2().bin1.chrom(), *_block_it, false); for (auto p : blk) { if (static_cast(p.bin1_id) < bin1_lb || static_cast(p.bin1_id) > bin1_ub || @@ -544,7 +560,7 @@ inline void PixelSelector::iterator::read_next_chunk_sorted() { continue; } - auto pt = _sel->transform_pixel(p); + auto pt = transform_pixel(p); pt.bin1_id += bin1_offset; pt.bin2_id += bin2_offset; _buffer->emplace_back(std::move(pt)); @@ -560,10 +576,10 @@ inline void PixelSelector::iterator::read_next_chunk_sorted() { template inline void PixelSelector::iterator::read_next_chunk_v9_intra_sorted() { - assert(!!_sel); + assert(!!_reader); if (_bin1_id > coord1().bin2.rel_id()) { - *this = at_end(*_sel); + *this = at_end(_reader, _coord1, _coord2); return; } @@ -594,7 +610,7 @@ inline void PixelSelector::iterator::read_next_chunk_v9_intra_sorted() { const auto bin1_offset = bins().at(coord1().bin1.chrom()).id(); const auto bin2_offset = bins().at(coord2().bin1.chrom()).id(); bool block_overlaps_query = false; - for (auto p : *_sel->_reader.read(coord1().bin1.chrom(), coord2().bin1.chrom(), blki)) { + for (auto p : *_reader->read(coord1().bin1.chrom(), coord2().bin1.chrom(), blki)) { // Using bitwise operators gives a ~5% perf improvement on my machine const auto pixel_overlaps_query = bool(int(static_cast(p.bin1_id) >= bin1_lb) & int(static_cast(p.bin1_id) <= bin1_ub) & @@ -609,13 +625,13 @@ inline void PixelSelector::iterator::read_next_chunk_v9_intra_sorted() { continue; } - auto pt = _sel->transform_pixel(p); + auto pt = transform_pixel(p); pt.bin1_id += bin1_offset; pt.bin2_id += bin2_offset; _buffer->emplace_back(std::move(pt)); } if (!block_overlaps_query) { - _sel->_reader.evict(coord1().bin1.chrom(), coord2().bin1.chrom(), blki); + _reader->evict(coord1().bin1.chrom(), coord2().bin1.chrom(), blki); _block_blacklist->emplace(blki); } } @@ -626,6 +642,64 @@ inline void PixelSelector::iterator::read_next_chunk_v9_intra_sorted() { _bin1_id = bin1_id_last + 1; } +template +inline ThinPixel PixelSelector::iterator::transform_pixel(ThinPixel pixel) const { + assert(!!_footer); + + auto return_pixel = [&]() -> ThinPixel { + if constexpr (std::is_floating_point_v) { + return {pixel.bin1_id, pixel.bin2_id, conditional_static_cast(pixel.count)}; + } else { + return {pixel.bin1_id, pixel.bin2_id, static_cast(std::round(pixel.count))}; + } + }; + + const auto &weights1 = _footer->weights1()(); + const auto &weights2 = _footer->weights2()(); + const auto &expected = _footer->expectedValues(); + + const auto bin1 = pixel.bin1_id; + const auto bin2 = pixel.bin2_id; + + const auto is_inter = coord1().bin1.chrom() != coord2().bin1.chrom(); + const auto matrix_type = _footer->matrix_type(); + + assert(is_inter || bin1 <= bin2); + + const auto skip_normalization = + _footer->normalization() == balancing::Method::NONE() || matrix_type == MatrixType::expected; + + if (!skip_normalization) { + assert(bin1 < weights1.size()); + assert(bin2 < weights2.size()); + pixel.count /= static_cast(weights1[bin1] * weights2[bin2]); + } + + if (matrix_type == MatrixType::observed) { + return return_pixel(); + } + + const auto expected_count = [&]() { + if (is_inter) { + return float(_reader->avg()); + } + + const auto i = (bin2 - bin1); + assert(i < expected.size()); + return float(expected[i]); + }(); + + if (matrix_type == MatrixType::expected) { + pixel.count = expected_count; + return return_pixel(); + } + + assert(matrix_type == MatrixType::oe); + pixel.count /= expected_count; + + return return_pixel(); +} + inline PixelSelectorAll::PixelSelectorAll(std::vector selectors_) noexcept : _selectors(std::move(selectors_)) {} diff --git a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp index e5e5aecb..ac3cb692 100644 --- a/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/libhictk/hic/include/hictk/hic/pixel_selector.hpp @@ -26,12 +26,12 @@ namespace hictk::hic { class PixelSelector { - mutable internal::HiCBlockReader _reader{}; + std::shared_ptr _reader{}; std::shared_ptr _footer{}; - PixelCoordinates _coord1{}; - PixelCoordinates _coord2{}; + std::shared_ptr _coord1{}; + std::shared_ptr _coord2{}; public: template @@ -114,7 +114,10 @@ class PixelSelector { using BufferT = std::vector>; using BlockBlacklist = phmap::flat_hash_set; - const PixelSelector *_sel{}; + std::shared_ptr _reader{}; + std::shared_ptr _coord1{}; + std::shared_ptr _coord2{}; + std::shared_ptr _footer{}; std::shared_ptr _block_idx{}; std::shared_ptr _block_blacklist{}; internal::Index::Overlap::const_iterator _block_it{}; @@ -134,7 +137,9 @@ class PixelSelector { iterator() = default; explicit iterator(const PixelSelector &sel, bool sorted); - [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; + [[nodiscard]] static auto at_end(std::shared_ptr reader, + std::shared_ptr coord1, + std::shared_ptr coord2) -> iterator; [[nodiscard]] bool operator==(const iterator &other) const noexcept; [[nodiscard]] bool operator!=(const iterator &other) const noexcept; @@ -165,6 +170,7 @@ class PixelSelector { void read_next_chunk_unsorted(); void read_next_chunk_sorted(); void read_next_chunk_v9_intra_sorted(); + [[nodiscard]] ThinPixel transform_pixel(ThinPixel pixel) const; }; }; From 7234995fee8145fedb01ed775ff4763c9036969c Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 13:55:05 +0100 Subject: [PATCH 3/8] Refactor hictk dump to better support --cis/trans-only options --- src/hictk/CMakeLists.txt | 3 + src/hictk/dump/dump.cpp | 431 +++++++++---------------------- src/hictk/dump/dump.hpp | 34 +++ src/hictk/dump/dump_common.cpp | 176 +++++++++++++ src/hictk/dump/dump_multiple.cpp | 3 + src/hictk/dump/dump_single.cpp | 3 + 6 files changed, 334 insertions(+), 316 deletions(-) create mode 100644 src/hictk/dump/dump.hpp create mode 100644 src/hictk/dump/dump_common.cpp create mode 100644 src/hictk/dump/dump_multiple.cpp create mode 100644 src/hictk/dump/dump_single.cpp diff --git a/src/hictk/CMakeLists.txt b/src/hictk/CMakeLists.txt index ea4e57f9..c841eb07 100644 --- a/src/hictk/CMakeLists.txt +++ b/src/hictk/CMakeLists.txt @@ -29,6 +29,9 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/convert/cool_to_hic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/convert/hic_to_cool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/dump/dump.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dump/dump_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dump/dump_multiple.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dump/dump_single.cpp ${CMAKE_CURRENT_SOURCE_DIR}/fix_mcool/fix_mcool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/load/load.cpp ${CMAKE_CURRENT_SOURCE_DIR}/merge/merge.cpp diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index 409f54be..8251514f 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: MIT +#include "./dump.hpp" + #include #include "hictk/balancing/methods.hpp" @@ -12,385 +14,179 @@ namespace hictk::tools { -static void print(const Pixel& pixel) { - fmt::print(FMT_COMPILE("{:bg2}\t{:.16g}\n"), pixel.coords, pixel.count); -} -static void print(const ThinPixel& pixel) { - fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count); +template +static void dump_pixels(PixelIt first_pixel, PixelIt last_pixel, + const std::shared_ptr& bins, bool join) { + if (!join) { + return print_pixels(first_pixel, last_pixel); + } + auto jsel = transformers::JoinGenomicCoords(first_pixel, last_pixel, bins); + return print_pixels(jsel.begin(), jsel.end()); } -template -static void dump_bins(const File& f, std::string_view range) { - if (range == "all") { - for (const auto& bin : f.bins()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\t{:d}\n"), bin.chrom().name(), bin.start(), bin.end()); - } - return; +static void dump_pixels_gw(File& f, std::string_view normalization, bool join, bool sorted) { + if (f.is_hic()) { + f.get().optimize_cache_size_for_iteration(); } - const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range}); - auto [first_bin, last_bin] = f.bins().find_overlap(coords); - std::for_each(first_bin, last_bin, [](const Bin& bin) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\t{:d}\n"), bin.chrom().name(), bin.start(), bin.end()); - }); -} - -[[nodiscard]] static std::pair parse_bedpe(std::string_view line) { - auto next_token = [&]() { - assert(!line.empty()); - const auto pos1 = line.find('\t'); - const auto pos2 = line.find('\t', pos1 + 1); - const auto pos3 = line.find('\t', pos2 + 1); - - auto tok = std::string{line.substr(0, pos3)}; - tok[pos1] = ':'; - tok[pos2] = '-'; - line.remove_prefix(pos3 + 1); - return tok; - }; - const auto range1 = next_token(); - const auto range2 = next_token(); - - return std::make_pair(range1, range2); -} + if (f.is_hic()) { + const auto& ff = f.get(); + const auto sel = ff.fetch(normalization); + dump_pixels(sel.template begin(sorted), sel.template end(), ff.bins_ptr(), + join); + return; + } -template -static void print_pixels(PixelIt first, PixelIt last) { - std::for_each(first, last, [&](const auto& pixel) { print(pixel); }); + const auto& ff = f.get(); + const auto sel = ff.fetch(normalization); + dump_pixels(sel.template begin(), sel.template end(), ff.bins_ptr(), join); } -static void dump_pixels(const cooler::File& f, std::string_view range1, std::string_view range2, - std::string_view normalization, bool join, [[maybe_unused]] bool sorted) { - auto weights = f.read_weights(balancing::Method{normalization}); - if (range1 == "all") { - assert(range2 == "all"); - auto sel = f.fetch(weights); - if (!join) { - return print_pixels(sel.template begin(), sel.template end()); - } - - auto jsel = - transformers::JoinGenomicCoords(sel.begin(), sel.end(), f.bins_ptr()); - return print_pixels(jsel.begin(), jsel.end()); - } - - auto sel = f.fetch(range1, range2, weights); - if (!join) { - return print_pixels(sel.template begin(), sel.template end()); +static void dump_pixels_chrom_chrom(File& f, std::string_view range1, std::string_view range2, + std::string_view normalization, bool join, bool sorted) { + if (f.is_hic()) { + const auto& ff = f.get(); + const auto sel = ff.fetch(range1, range2, balancing::Method{normalization}); + dump_pixels(sel.template begin(sorted), sel.template end(), ff.bins_ptr(), + join); + return; } - auto jsel = transformers::JoinGenomicCoords(sel.begin(), sel.end(), f.bins_ptr()); - print_pixels(jsel.begin(), jsel.end()); + const auto& ff = f.get(); + const auto sel = ff.fetch(range1, range2, balancing::Method{normalization}); + dump_pixels(sel.template begin(), sel.template end(), ff.bins_ptr(), join); } -static void dump_pixels(hic::File& f, std::string_view range1, std::string_view range2, +static void dump_pixels(File& f, std::string_view range1, std::string_view range2, std::string_view normalization, bool join, bool sorted) { auto norm = balancing::Method{std::string{normalization}}; if (range1 == "all") { assert(range2 == "all"); - f.optimize_cache_size_for_iteration(); - auto sel = f.fetch(norm); - if (!join) { - return print_pixels(sel.template begin(sorted), sel.template end()); - } - auto jsel = - transformers::JoinGenomicCoords(sel.begin(sorted), sel.end(), f.bins_ptr()); - return print_pixels(jsel.begin(), jsel.end()); + return dump_pixels_gw(f, normalization, join, sorted); } - auto sel = f.fetch(range1, range2, norm); - if (!join) { - return print_pixels(sel.template begin(sorted), sel.template end()); - } - - auto jsel = - transformers::JoinGenomicCoords(sel.begin(sorted), sel.end(), f.bins_ptr()); - print_pixels(jsel.begin(), jsel.end()); + return dump_pixels_chrom_chrom(f, range1, range2, normalization, join, sorted); } -static void dump_pixels_trans_only_sorted(const cooler::File& f, std::string_view normalization, - bool join) { - auto weights = f.read_weights(balancing::Method{normalization}); - - using It = decltype(f.fetch("chr1", "chr2").template begin()); - - const auto chroms = f.chromosomes(); - for (std::uint32_t i = 0; i < chroms.size(); ++i) { - std::vector heads{}; - std::vector tails{}; - for (std::uint32_t j = i + 1; j < chroms.size(); ++j) { - const auto& chrom1 = chroms[i]; - const auto& chrom2 = chroms[j]; - - if (chrom1.is_all() || chrom2.is_all()) { - continue; - } - - auto sel = f.fetch(chrom1.name(), chrom2.name(), weights); - heads.emplace_back(sel.begin()); - tails.emplace_back(sel.end()); - } - - internal::PixelMerger merger(std::move(heads), std::move(tails)); - - if (!join) { - print_pixels(merger.begin(), merger.end()); +static void process_query_cis_only(File& f, std::string_view normalization, bool join, + bool sorted) { + for (const auto& chrom : f.chromosomes()) { + if (chrom.is_all()) { continue; } - - auto jsel = transformers::JoinGenomicCoords(merger.begin(), merger.end(), f.bins_ptr()); - print_pixels(jsel.begin(), jsel.end()); + dump_pixels(f, chrom.name(), chrom.name(), normalization, join, sorted); } } -static void dump_pixels_trans_only_sorted(hic::File& f, std::string_view normalization, bool join) { +static void dump_pixels_trans_only_sorted(File& f, std::string_view normalization, bool join) { auto norm = balancing::Method{std::string{normalization}}; - std::vector selectors; - - for (std::uint32_t chrom1_id = 0; chrom1_id < f.chromosomes().size(); ++chrom1_id) { - const auto& chrom1 = f.chromosomes().at(chrom1_id); - if (chrom1.is_all()) { - continue; - } - for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < f.chromosomes().size(); ++chrom2_id) { - const auto& chrom2 = f.chromosomes().at(chrom2_id); - if (chrom2.is_all()) { - continue; - } - try { - auto sel = f.fetch(chrom1.name(), chrom2.name(), norm); - if (!sel.empty()) { - selectors.emplace_back(std::move(sel)); - } - } catch (const std::exception& e) { - const std::string_view msg{e.what()}; - const auto missing_norm = msg.find("unable to find") != std::string_view::npos && - msg.find("normalization vector") != std::string_view::npos; - if (!missing_norm) { - throw; - } - } - } - } - - if (selectors.empty()) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find {} normalization vectors at {} ({})"), - norm.to_string(), f.resolution(), hic::MatrixUnit::BP)); - } - - hic::PixelSelectorAll sel(std::move(selectors)); - - if (!join) { - print_pixels(sel.template begin(), sel.template end()); - } - - auto jsel = transformers::JoinGenomicCoords(sel.template begin(), - sel.template end(), f.bins_ptr()); - print_pixels(jsel.begin(), jsel.end()); -} - -static void process_query(File& f, std::string_view table, std::string_view range1, - std::string_view range2, std::string_view normalization, bool join, - bool sorted) { - if (table == "bins") { - return dump_bins(f, range1); - } - - assert(table == "pixels"); - std::visit([&](auto& ff) { return dump_pixels(ff, range1, range2, normalization, join, sorted); }, - f.get()); -} - -static void process_query_cis_only(File& f, std::string_view normalization, bool join, - bool sorted) { std::visit( - [&](auto& ff) { - for (const auto& chrom : ff.chromosomes()) { - if (chrom.is_all()) { - continue; - } - dump_pixels(ff, chrom.name(), chrom.name(), normalization, join, sorted); - } - }, - f.get()); -} + [&](const auto& ff) { + using It = decltype(ff.fetch("chr1", "chr2").template begin()); + std::vector heads{}; + std::vector tails{}; -static void process_query_trans_only_unsorted(File& f, std::string_view normalization, bool join) { - std::visit( - [&](auto& ff) { - const auto& chromosomes = ff.chromosomes(); - for (std::uint32_t chrom1_id = 0; chrom1_id < chromosomes.size(); ++chrom1_id) { - const auto& chrom1 = chromosomes[chrom1_id]; + for (std::uint32_t chrom1_id = 0; chrom1_id < ff.chromosomes().size(); ++chrom1_id) { + const auto& chrom1 = ff.chromosomes().at(chrom1_id); if (chrom1.is_all()) { continue; } - for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < chromosomes.size(); + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < ff.chromosomes().size(); ++chrom2_id) { - const auto& chrom2 = chromosomes[chrom2_id]; + const auto& chrom2 = ff.chromosomes().at(chrom2_id); if (chrom2.is_all()) { continue; } - - dump_pixels(ff, chrom1.name(), chrom2.name(), normalization, join, false); + try { + const auto sel = ff.fetch(chrom1.name(), chrom2.name(), norm); + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + const auto missing_norm = msg.find("unable to find") != std::string_view::npos && + msg.find("normalization vector") != std::string_view::npos; + if (!missing_norm) { + throw; + } + } } } - }, - f.get()); -} -static void process_query_trans_only_sorted(File& f, std::string_view normalization, bool join) { - std::visit([&](auto& ff) { dump_pixels_trans_only_sorted(ff, normalization, join); }, f.get()); -} - -static void process_query_trans_only(File& f, std::string_view normalization, bool join, - bool sorted) { - if (sorted) { - return process_query_trans_only_sorted(f, normalization, join); - } - return process_query_trans_only_unsorted(f, normalization, join); -} + if (heads.empty()) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to find {} normalization vectors at {} ({})"), + norm.to_string(), ff.bin_size(), hic::MatrixUnit::BP)); + } -static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) { - Reference ref{}; + transformers::PixelMerger merger(heads, tails); - if (format == "mcool") { - ref = cooler::MultiResFile{std::string{uri}}.chromosomes(); - } else if (format == "scool") { - ref = cooler::SingleCellFile{std::string{uri}}.chromosomes(); - } else { - ref = File{std::string{uri}, resolution}.chromosomes(); - } + if (!join) { + print_pixels(merger.begin(), merger.end()); + } - for (const Chromosome& chrom : ref) { - if (!chrom.is_all()) { - fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); - } - } - return 0; + auto jsel = transformers::JoinGenomicCoords(merger.begin(), merger.end(), f.bins_ptr()); + print_pixels(jsel.begin(), jsel.end()); + }, + f.get()); } -static phmap::btree_set get_normalizations(std::string_view uri, - std::string_view format, - std::uint32_t resolution) { - assert(format != "mcool"); - assert(format != "hic" || resolution != 0); - if (format == "scool") { - const auto cell_ids = cooler::SingleCellFile{uri}.cells(); - if (cell_ids.empty()) { - return {}; - } - - const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin()); - return get_normalizations(scool_uri, "cool", 0); - } - - phmap::btree_set norms{}; - if (uri == "hic" && resolution == 0) { - const hic::File hf{std::string{uri}, resolution}; - - for (const auto& norm : hf.avail_normalizations()) { - norms.emplace(std::string{norm.to_string()}); +static void dump_pixels_trans_only_unsorted(File& f, std::string_view normalization, bool join) { + const auto& chromosomes = f.chromosomes(); + for (std::uint32_t chrom1_id = 0; chrom1_id < chromosomes.size(); ++chrom1_id) { + const auto& chrom1 = chromosomes[chrom1_id]; + if (chrom1.is_all()) { + continue; } - return norms; - } - - const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations(); - std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()), - [](const auto& n) { return std::string{n.to_string()}; }); - - return norms; -} + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < chromosomes.size(); ++chrom2_id) { + const auto& chrom2 = chromosomes[chrom2_id]; + if (chrom2.is_all()) { + continue; + } -static int dump_normalizations(std::string_view uri, std::string_view format, - std::uint32_t resolution) { - phmap::btree_set norms{}; - std::vector resolutions{}; - if (format == "mcool") { - resolutions = cooler::MultiResFile{uri}.resolutions(); - if (resolutions.empty()) { - return 0; - } - } else if (format == "hic" && resolution == 0) { - resolutions = hic::utils::list_resolutions(std::string{uri}); - if (resolutions.empty()) { - return 0; + dump_pixels(f, chrom1.name(), chrom2.name(), normalization, join, false); } } - - if (resolutions.empty()) { - norms = get_normalizations(uri, format, resolution); - } else { - format = format == "hic" ? "hic" : "cool"; - std::for_each(resolutions.begin(), resolutions.end(), - [&](const auto res) { norms.merge(get_normalizations(uri, format, res)); }); - } - - if (!norms.empty()) { - fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n")); - } - return 0; } -static int dump_resolutions(std::string_view uri, std::string_view format, - std::uint32_t resolution) { - std::vector resolutions{}; - - if (format == "hic") { - resolutions = hic::utils::list_resolutions(uri); - if (resolution != 0) { - const auto res_found = - std::find(resolutions.begin(), resolutions.end(), resolution) != resolutions.end(); - resolutions.clear(); - if (res_found) { - resolutions.push_back(resolution); - } - } - } else if (format == "mcool") { - resolutions = cooler::MultiResFile{uri}.resolutions(); - } else if (format == "scool") { - resolutions.push_back(cooler::SingleCellFile{uri}.bin_size()); - } else { - assert(format == "cool"); - resolutions.push_back(cooler::File{uri}.bin_size()); +static void process_query(File& f, std::string_view table, std::string_view range1, + std::string_view range2, std::string_view normalization, bool join, + bool sorted) { + if (table == "bins") { + return dump_bins(f, range1); } - if (!resolutions.empty()) { - fmt::print(FMT_STRING("{}\n"), fmt::join(resolutions, "\n")); - } - return 0; + assert(table == "pixels"); + dump_pixels(f, range1, range2, normalization, join, sorted); } -static int dump_cells(std::string_view uri, std::string_view format) { - if (format != "scool") { - throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri)); - } - const auto cells = cooler::SingleCellFile{uri}.cells(); - if (!cells.empty()) { - fmt::print(FMT_STRING("{}\n"), fmt::join(cells, "\n")); +static void process_query_trans_only(File& f, std::string_view normalization, bool join, + bool sorted) { + if (sorted) { + dump_pixels_trans_only_sorted(f, normalization, join); + return; } - return 0; + dump_pixels_trans_only_unsorted(f, normalization, join); } -static int dump_tables(const DumpConfig& c) { +static void dump_tables(const DumpConfig& c) { hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit}; if (c.query_file.empty() && !c.cis_only && !c.trans_only) { process_query(f, c.table, c.range1, c.range2, c.normalization, c.join, c.sorted); - return 0; + return; } if (c.cis_only) { assert(c.table == "pixels"); process_query_cis_only(f, c.normalization, c.join, c.sorted); - return 0; + return; } if (c.trans_only) { assert(c.table == "pixels"); process_query_trans_only(f, c.normalization, c.join, c.sorted); - return 0; + return; } const auto read_from_stdin = c.query_file == "-"; @@ -407,29 +203,32 @@ static int dump_tables(const DumpConfig& c) { const auto [range1, range2] = parse_bedpe(line); process_query(f, c.table, range1, range2, c.normalization, c.join, c.sorted); } - - return 0; } int dump_subcmd(const DumpConfig& c) { if (c.table == "bins" || c.table == "pixels") { - return dump_tables(c); + dump_tables(c); + return 0; } if (c.table == "chroms") { - return dump_chroms(c.uri, c.format, c.resolution); + dump_chroms(c.uri, c.format, c.resolution); + return 0; } if (c.table == "resolutions") { - return dump_resolutions(c.uri, c.format, c.resolution); + dump_resolutions(c.uri, c.format, c.resolution); + return 0; } if (c.table == "normalizations") { - return dump_normalizations(c.uri, c.format, c.resolution); + dump_normalizations(c.uri, c.format, c.resolution); + return 0; } assert(c.table == "cells"); - return dump_cells(c.uri, c.format); + dump_cells(c.uri, c.format); + return 0; } } // namespace hictk::tools diff --git a/src/hictk/dump/dump.hpp b/src/hictk/dump/dump.hpp new file mode 100644 index 00000000..1ca191f3 --- /dev/null +++ b/src/hictk/dump/dump.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/file.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::tools { + +void print(const Pixel& pixel); +void print(const ThinPixel& pixel); + +void dump_bins(const File& f, std::string_view range); +void dump_cells(std::string_view uri, std::string_view format); +void dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution); +void dump_normalizations(std::string_view uri, std::string_view format, std::uint32_t resolution); +void dump_resolutions(std::string_view uri, std::string_view format, std::uint32_t resolution); + +[[nodiscard]] std::pair parse_bedpe(std::string_view line); + +template +inline void print_pixels(PixelIt first, PixelIt last) { + std::for_each(first, last, [&](const auto& pixel) { print(pixel); }); +} + +} // namespace hictk::tools diff --git a/src/hictk/dump/dump_common.cpp b/src/hictk/dump/dump_common.cpp new file mode 100644 index 00000000..bafa1f30 --- /dev/null +++ b/src/hictk/dump/dump_common.cpp @@ -0,0 +1,176 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "./dump.hpp" +#include "hictk/cooler.hpp" +#include "hictk/file.hpp" +#include "hictk/hic.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::tools { +void print(const Pixel& pixel) { + fmt::print(FMT_COMPILE("{:bg2}\t{:.16g}\n"), pixel.coords, pixel.count); +} +void print(const ThinPixel& pixel) { + fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count); +} + +void dump_bins(const File& f, std::string_view range) { + if (range == "all") { + for (const auto& bin : f.bins()) { + fmt::print(FMT_COMPILE("{:s}\t{:d}\t{:d}\n"), bin.chrom().name(), bin.start(), bin.end()); + } + return; + } + + const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range}); + auto [first_bin, last_bin] = f.bins().find_overlap(coords); + std::for_each(first_bin, last_bin, [](const Bin& bin) { + fmt::print(FMT_COMPILE("{:s}\t{:d}\t{:d}\n"), bin.chrom().name(), bin.start(), bin.end()); + }); +} + +void dump_cells(std::string_view uri, std::string_view format) { + if (format != "scool") { + throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri)); + } + const auto cells = cooler::SingleCellFile{uri}.cells(); + std::for_each(cells.begin(), cells.end(), + [&](const auto& cell) { fmt::print(FMT_COMPILE("{}\n"), cell); }); +} + +void dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) { + Reference ref{}; + + if (format == "mcool") { + ref = cooler::MultiResFile{std::string{uri}}.chromosomes(); + } else if (format == "scool") { + ref = cooler::SingleCellFile{std::string{uri}}.chromosomes(); + } else { + ref = File{std::string{uri}, resolution}.chromosomes(); + } + + for (const Chromosome& chrom : ref) { + if (!chrom.is_all()) { + fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size()); + } + } +} + +static phmap::btree_set get_normalizations(std::string_view uri, + std::string_view format, + std::uint32_t resolution) { + assert(format != "mcool"); + assert(format != "hic" || resolution != 0); + if (format == "scool") { + const auto cell_ids = cooler::SingleCellFile{uri}.cells(); + if (cell_ids.empty()) { + return {}; + } + + const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin()); + return get_normalizations(scool_uri, "cool", 0); + } + + phmap::btree_set norms{}; + if (uri == "hic" && resolution == 0) { + const hic::File hf{std::string{uri}, resolution}; + + for (const auto& norm : hf.avail_normalizations()) { + norms.emplace(std::string{norm.to_string()}); + } + return norms; + } + + const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations(); + std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()), + [](const auto& n) { return std::string{n.to_string()}; }); + + return norms; +} + +void dump_normalizations(std::string_view uri, std::string_view format, std::uint32_t resolution) { + phmap::btree_set norms{}; + std::vector resolutions{}; + if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + if (resolutions.empty()) { + return; + } + } else if (format == "hic" && resolution == 0) { + resolutions = hic::utils::list_resolutions(std::string{uri}); + if (resolutions.empty()) { + return; + } + } + + if (resolutions.empty()) { + norms = get_normalizations(uri, format, resolution); + } else { + format = format == "hic" ? "hic" : "cool"; + std::for_each(resolutions.begin(), resolutions.end(), + [&](const auto res) { norms.merge(get_normalizations(uri, format, res)); }); + } + + if (!norms.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n")); + } +} + +void dump_resolutions(std::string_view uri, std::string_view format, std::uint32_t resolution) { + std::vector resolutions{}; + + if (format == "hic") { + resolutions = hic::utils::list_resolutions(uri); + if (resolution != 0) { + const auto res_found = + std::find(resolutions.begin(), resolutions.end(), resolution) != resolutions.end(); + resolutions.clear(); + if (res_found) { + resolutions.push_back(resolution); + } + } + } else if (format == "mcool") { + resolutions = cooler::MultiResFile{uri}.resolutions(); + } else if (format == "scool") { + resolutions.push_back(cooler::SingleCellFile{uri}.bin_size()); + } else { + assert(format == "cool"); + resolutions.push_back(cooler::File{uri}.bin_size()); + } + + if (!resolutions.empty()) { + fmt::print(FMT_STRING("{}\n"), fmt::join(resolutions, "\n")); + } +} + +std::pair parse_bedpe(std::string_view line) { + auto next_token = [&]() { + assert(!line.empty()); + const auto pos1 = line.find('\t'); + const auto pos2 = line.find('\t', pos1 + 1); + const auto pos3 = line.find('\t', pos2 + 1); + + auto tok = std::string{line.substr(0, pos3)}; + tok[pos1] = ':'; + tok[pos2] = '-'; + line.remove_prefix(pos3 + 1); + return tok; + }; + const auto range1 = next_token(); + const auto range2 = next_token(); + + return std::make_pair(range1, range2); +} +} // namespace hictk::tools diff --git a/src/hictk/dump/dump_multiple.cpp b/src/hictk/dump/dump_multiple.cpp new file mode 100644 index 00000000..5b5e08cc --- /dev/null +++ b/src/hictk/dump/dump_multiple.cpp @@ -0,0 +1,3 @@ +// +// Created by roby on 12/1/23. +// diff --git a/src/hictk/dump/dump_single.cpp b/src/hictk/dump/dump_single.cpp new file mode 100644 index 00000000..5b5e08cc --- /dev/null +++ b/src/hictk/dump/dump_single.cpp @@ -0,0 +1,3 @@ +// +// Created by roby on 12/1/23. +// From 19600eadf31b891f1c4b14957a9b5528499910c0 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:06:32 +0100 Subject: [PATCH 4/8] Update integration tests for hictk dump --- test/scripts/hictk_dump_cis.sh | 19 +++++++++++++++++++ test/scripts/hictk_dump_trans.sh | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/test/scripts/hictk_dump_cis.sh b/test/scripts/hictk_dump_cis.sh index c6fe6447..5644c2a4 100755 --- a/test/scripts/hictk_dump_cis.sh +++ b/test/scripts/hictk_dump_cis.sh @@ -85,6 +85,7 @@ fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT +# Test chrom-chrom matrix cooler dump --join "$ref_cooler::/resolutions/100000" -r chr2L > "$outdir/expected.pixels" "$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" -r chr2L > "$outdir/out.cooler.pixels" "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" -r chr2L > "$outdir/out.hic8.pixels" @@ -102,6 +103,24 @@ if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi +# Test --cis-only matrix +cooler dump --join "$ref_cooler::/resolutions/100000" | awk -F '\t' '$1==$4' | tee "$outdir/expected.pixels" > /dev/null +"$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" --cis-only | tee "$outdir/out.cooler.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --cis-only | tee "$outdir/out.hic8.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --cis-only | tee "$outdir/out.hic9.pixels" > /dev/null + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then + status=1 +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else diff --git a/test/scripts/hictk_dump_trans.sh b/test/scripts/hictk_dump_trans.sh index 60680917..33842c3f 100755 --- a/test/scripts/hictk_dump_trans.sh +++ b/test/scripts/hictk_dump_trans.sh @@ -84,6 +84,7 @@ fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT +# Test chrom-chrom matrix cooler dump --join "$ref_cooler::/resolutions/100000" --range chr2L --range2 chrX > "$outdir/expected.pixels" "$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" --range chr2L --range2 chrX > "$outdir/out.cooler.pixels" "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --range chr2L --range2 chrX > "$outdir/out.hic8.pixels" @@ -101,6 +102,24 @@ if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi +# Test --trans-only matrix +cooler dump --join "$ref_cooler::/resolutions/100000" | awk -F '\t' '$1!=$4' | tee "$outdir/expected.pixels" > /dev/null +"$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" --trans-only | tee "$outdir/out.cooler.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --trans-only | tee "$outdir/out.hic8.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --trans-only | tee "$outdir/out.hic9.pixels" > /dev/null + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then + status=1 +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else From 9acf1a682b7affb7e5cfea13b75ea6afd5372cd6 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:06:46 +0100 Subject: [PATCH 5/8] Clean up hictk dump --- src/hictk/cli/cli_dump.cpp | 7 ++++ src/hictk/dump/dump.cpp | 79 ++++++++++++++++++++------------------ 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/src/hictk/cli/cli_dump.cpp b/src/hictk/cli/cli_dump.cpp index b0da739e..8f10c3f3 100644 --- a/src/hictk/cli/cli_dump.cpp +++ b/src/hictk/cli/cli_dump.cpp @@ -121,11 +121,18 @@ void Cli::make_dump_subcommand() { // clang-format on sc.get_option("--range2")->needs(sc.get_option("--range")); + sc.get_option("--query-file")->excludes(sc.get_option("--range")); sc.get_option("--query-file")->excludes(sc.get_option("--range2")); sc.get_option("--query-file")->excludes(sc.get_option("--cis-only")); sc.get_option("--query-file")->excludes(sc.get_option("--trans-only")); + sc.get_option("--cis-only")->excludes(sc.get_option("--trans-only")); + sc.get_option("--cis-only")->excludes(sc.get_option("--range")); + sc.get_option("--cis-only")->excludes(sc.get_option("--range2")); + + sc.get_option("--trans-only")->excludes(sc.get_option("--range")); + sc.get_option("--trans-only")->excludes(sc.get_option("--range2")); _config = std::monostate{}; } diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index 8251514f..565d0d03 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -78,48 +78,53 @@ static void process_query_cis_only(File& f, std::string_view normalization, bool } } +template ().fetch("chr1", "chr2")), + typename PixelIt = decltype(std::declval().template begin())> +static transformers::PixelMerger init_pixel_merger(const File& f, + std::string_view normalization) { + std::vector heads{}; + std::vector tails{}; + + for (std::uint32_t chrom1_id = 0; chrom1_id < f.chromosomes().size(); ++chrom1_id) { + const auto& chrom1 = f.chromosomes().at(chrom1_id); + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < f.chromosomes().size(); ++chrom2_id) { + const auto& chrom2 = f.chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + try { + const auto sel = f.fetch(chrom1.name(), chrom2.name(), balancing::Method{normalization}); + heads.emplace_back(sel.template begin()); + tails.emplace_back(sel.template end()); + } catch (const std::exception& e) { + const std::string_view msg{e.what()}; + const auto missing_norm = msg.find("unable to find") != std::string_view::npos && + msg.find("normalization vector") != std::string_view::npos; + if (!missing_norm) { + throw; + } + } + } + } + + if (heads.empty()) { + throw std::runtime_error( + fmt::format(FMT_STRING("unable to find {} normalization vectors at {} ({})"), normalization, + f.bin_size(), hic::MatrixUnit::BP)); + } + + return {std::move(heads), std::move(tails)}; +} + static void dump_pixels_trans_only_sorted(File& f, std::string_view normalization, bool join) { auto norm = balancing::Method{std::string{normalization}}; std::visit( [&](const auto& ff) { - using It = decltype(ff.fetch("chr1", "chr2").template begin()); - std::vector heads{}; - std::vector tails{}; - - for (std::uint32_t chrom1_id = 0; chrom1_id < ff.chromosomes().size(); ++chrom1_id) { - const auto& chrom1 = ff.chromosomes().at(chrom1_id); - if (chrom1.is_all()) { - continue; - } - for (std::uint32_t chrom2_id = chrom1_id + 1; chrom2_id < ff.chromosomes().size(); - ++chrom2_id) { - const auto& chrom2 = ff.chromosomes().at(chrom2_id); - if (chrom2.is_all()) { - continue; - } - try { - const auto sel = ff.fetch(chrom1.name(), chrom2.name(), norm); - heads.emplace_back(sel.template begin()); - tails.emplace_back(sel.template end()); - } catch (const std::exception& e) { - const std::string_view msg{e.what()}; - const auto missing_norm = msg.find("unable to find") != std::string_view::npos && - msg.find("normalization vector") != std::string_view::npos; - if (!missing_norm) { - throw; - } - } - } - } - - if (heads.empty()) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find {} normalization vectors at {} ({})"), - norm.to_string(), ff.bin_size(), hic::MatrixUnit::BP)); - } - - transformers::PixelMerger merger(heads, tails); + const auto merger = init_pixel_merger(ff, normalization); if (!join) { print_pixels(merger.begin(), merger.end()); From a1311ff02af587c680e6cb3ff06f97b75b99f7be Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 16:16:17 +0100 Subject: [PATCH 6/8] Update integration tests for hictk dump --- test/scripts/hictk_dump_trans.sh | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/scripts/hictk_dump_trans.sh b/test/scripts/hictk_dump_trans.sh index 33842c3f..40706f12 100755 --- a/test/scripts/hictk_dump_trans.sh +++ b/test/scripts/hictk_dump_trans.sh @@ -102,7 +102,7 @@ if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi -# Test --trans-only matrix +# Test --trans-only matrix (sorted) cooler dump --join "$ref_cooler::/resolutions/100000" | awk -F '\t' '$1!=$4' | tee "$outdir/expected.pixels" > /dev/null "$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" --trans-only | tee "$outdir/out.cooler.pixels" > /dev/null "$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --trans-only | tee "$outdir/out.hic8.pixels" > /dev/null @@ -120,6 +120,23 @@ if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then status=1 fi +# Test --trans-only matrix (unsorted) +"$hictk_bin" dump --join "$ref_cooler::/resolutions/100000" --trans-only --unsorted | sort -V | tee "$outdir/out.cooler.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic8" --trans-only --unsorted | sort -V | tee "$outdir/out.hic8.pixels" > /dev/null +"$hictk_bin" dump --join --resolution 100000 "$ref_hic9" --trans-only --unsorted | sort -V | tee "$outdir/out.hic9.pixels" > /dev/null + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.cooler.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic8.pixels"; then + status=1 +fi + +if ! compare_files "$outdir/expected.pixels" "$outdir/out.hic9.pixels"; then + status=1 +fi + if [ "$status" -eq 0 ]; then printf '\n### PASS ###\n' else From 63300e7a19a1879c032ffd410355f41a955acf59 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 18:50:47 +0100 Subject: [PATCH 7/8] Bugfix --- src/hictk/dump/dump.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hictk/dump/dump.cpp b/src/hictk/dump/dump.cpp index 565d0d03..f80da6c2 100644 --- a/src/hictk/dump/dump.cpp +++ b/src/hictk/dump/dump.cpp @@ -31,14 +31,14 @@ static void dump_pixels_gw(File& f, std::string_view normalization, bool join, b if (f.is_hic()) { const auto& ff = f.get(); - const auto sel = ff.fetch(normalization); + const auto sel = ff.fetch(balancing::Method{normalization}); dump_pixels(sel.template begin(sorted), sel.template end(), ff.bins_ptr(), join); return; } const auto& ff = f.get(); - const auto sel = ff.fetch(normalization); + const auto sel = ff.fetch(balancing::Method{normalization}); dump_pixels(sel.template begin(), sel.template end(), ff.bins_ptr(), join); } From 1c67c55b781412084523e3cac5acb6c8c7473de3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 4 Dec 2023 19:34:37 +0100 Subject: [PATCH 8/8] Update docs --- docs/cli_reference.rst | 10 +++++++--- docs/reading_interactions.rst | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/cli_reference.rst b/docs/cli_reference.rst index 45001241..6ff8627f 100644 --- a/docs/cli_reference.rst +++ b/docs/cli_reference.rst @@ -139,12 +139,16 @@ hictk dump Matrix unit (ignored when file is not in .hic format). -t,--table TEXT:{chroms,bins,pixels,normalizations,resolutions,cells} [pixels] Name of the table to dump. - -r,--range TEXT [all] Excludes: --query-file + -r,--range TEXT [all] Excludes: --query-file --cis-only --trans-only Coordinates of the genomic regions to be dumped following UCSC-style notation (chr1:0-1000). - --range2 TEXT [all] Needs: --range Excludes: --query-file + --range2 TEXT [all] Needs: --range Excludes: --query-file --cis-only --trans-only Coordinates of the genomic regions to be dumped following UCSC-style notation (chr1:0-1000). - --query-file TEXT:(FILE) OR ({-}) Excludes: --range --range2 + --query-file TEXT:(FILE) OR ({-}) Excludes: --range --range2 --cis-only --trans-only Path to a BEDPE file with the list of coordinates to be fetched (pass - to read queries from stdin). + --cis-only Excludes: --range --range2 --query-file --trans-only + Dump intra-chromosomal interactions only. + --trans-only Excludes: --range --range2 --query-file --cis-only + Dump inter-chromosomal interactions only. -b,--balance TEXT [NONE] Balance interactions using the given method. --sorted,--unsorted{false} Return interactions in ascending order. --join,--no-join{false} Output pixels in BG2 format. diff --git a/docs/reading_interactions.rst b/docs/reading_interactions.rst index 7359c25f..39bc828d 100644 --- a/docs/reading_interactions.rst +++ b/docs/reading_interactions.rst @@ -106,3 +106,22 @@ Dump tables other than pixels: 5000 10000 ... + + +Dump cis or trans interactions only: + +.. code-block:: console + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --cis-only --join + + chr2L 7000 8000 chr2L 7000 8000 1745 + chr2L 7000 8000 chr2L 12000 13000 1766 + chr2L 7000 8000 chr2L 17000 18000 1078 + ... + + user@dev:/tmp$ hictk dump data/4DNFIZ1ZVXC8.hic9 --resolution 1000 --trans-only --join + + chr2L 7000 8000 chr2R 27000 28000 1 + chr2L 7000 8000 chr2R 322000 323000 1 + chr2L 7000 8000 chr2R 397000 398000 1 + ...