diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 6b7c9cff..538a9bcd 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -212,8 +212,7 @@ jobs: test/scripts/hictk_load_coo.sh build/src/hictk/hictk unsorted test/scripts/hictk_load_bg2.sh build/src/hictk/hictk sorted test/scripts/hictk_load_bg2.sh build/src/hictk/hictk unsorted - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk sorted - test/scripts/hictk_load_4dn.sh build/src/hictk/hictk unsorted + test/scripts/hictk_load_4dn.sh build/src/hictk/hictk test/scripts/hictk_merge.sh build/src/hictk/hictk diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index eaf25d58..35e1202f 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -430,13 +430,9 @@ jobs: run: | test/scripts/hictk_load_bg2.sh bin/hictk unsorted - - name: Test hictk load 4dn sorted + - name: Test hictk load 4dn run: | - test/scripts/hictk_load_4dn.sh bin/hictk sorted - - - name: Test hictk load 4dn unsorted - run: | - test/scripts/hictk_load_4dn.sh bin/hictk unsorted + test/scripts/hictk_load_4dn.sh bin/hictk - name: Test hictk merge run: | diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml index e06ea125..788f1695 100644 --- a/.github/workflows/ubuntu-ci.yml +++ b/.github/workflows/ubuntu-ci.yml @@ -483,13 +483,9 @@ jobs: run: | test/scripts/hictk_load_bg2.sh bin/hictk unsorted - - name: Test hictk load 4dn sorted + - name: Test hictk load 4dn run: | - test/scripts/hictk_load_4dn.sh bin/hictk sorted - - - name: Test hictk load 4dn unsorted - run: | - test/scripts/hictk_load_4dn.sh bin/hictk unsorted + test/scripts/hictk_load_4dn.sh bin/hictk - name: Test hictk merge run: | diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index e01c0c39..94900f53 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/record/8392109/files/hictk_test_data.tar.xz?download=1 - EXPECTED_HASH SHA256=43d05077082603cf03dc5ce2bfc0f81b2422c7249ee987e09512d2e3a56afff1 + DOWNLOAD https://zenodo.org/records/10289491/files/hictk_test_data.tar.xz?download=1 + EXPECTED_HASH SHA256=5e69dceb8789d923a38aed7add8fc18abfdfe531aea6effcdb7efe3c9bcf5246 "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.xz") # cmake-format: on diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index 0c1eb4b6..ba1b5de8 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -35,19 +35,25 @@ void Cli::make_load_subcommand() { ->check(CLI::ExistingFile) ->required(); - sc.add_option( - "bin-size", - c.bin_size, - "Bin size (bp).") - ->check(CLI::PositiveNumber) - ->required(); - sc.add_option( "output-uri", c.uri, "Path to output Cooler (URI syntax supported).") ->required(); + sc.add_option( + "-b,--bin-size", + c.bin_size, + "Bin size (bp).\n" + "Required when --bin-table is not used.") + ->check(CLI::PositiveNumber); + + sc.add_option( + "-t,--bin-table", + c.path_to_bin_table, + "Path to a BED3+ file with the bin table.") + ->check(CLI::ExistingFile); + sc.add_option( "-f,--format", c.format, @@ -67,6 +73,13 @@ void Cli::make_load_subcommand() { "Assembly name.") ->capture_default_str(); + sc.add_flag( + "--one-based,!--zero-based", + c.one_based, + "Interpret genomic coordinates or bins as one/zero based.\n" + "By default coordinates are assumed to be one-based for interactions in\n" + "4dn and validapairs formats and zero-based otherwise."); + sc.add_flag( "--count-as-float", c.count_as_float, @@ -89,24 +102,48 @@ void Cli::make_load_subcommand() { sc.add_option( "--batch-size", c.batch_size, - "Number of pixels to buffer in memory. Only used when processing unsorted interactions or pairs") + "Number of pixels to buffer in memory.\n" + "Only used when processing unsorted interactions or pairs.") ->capture_default_str(); // clang-format on + sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table")); _config = std::monostate{}; } void Cli::validate_load_subcommand() const { assert(_cli.get_subcommand("load")->parsed()); + std::vector warnings; std::vector errors; const auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("load"); if (!c.force && std::filesystem::exists(c.uri)) { errors.emplace_back(fmt::format( FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri)); } + if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) { + assert(c.bin_size == 0); + errors.emplace_back("--bin-size is required when --bin-table is not specified."); + } + + if ((c.format == "bg2" || c.format == "coo") && !sc.get_option("--bin-table")->empty()) { + errors.emplace_back( + "specifying bins through the --bin-table is not supported when ingesting pre-binned " + "interactions."); + } + + if (c.format == "4dn" && c.format == "validpairs" && c.assume_sorted) { + warnings.emplace_back( + "--assume-sorted has no effect when ingesting interactions in 4dn or validpairs format."); + } + + for (const auto& w : warnings) { + SPDLOG_WARN(FMT_STRING("{}"), w); + } + if (!errors.empty()) { throw std::runtime_error( fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " @@ -117,6 +154,15 @@ void Cli::validate_load_subcommand() const { void Cli::transform_args_load_subcommand() { auto& c = std::get(_config); + const auto& sc = *_cli.get_subcommand("load"); + + if (sc.get_option("--one-based")->empty()) { + if (c.format == "4dn" || c.format == "validpairs") { + c.offset = -1; + } + } else { + c.offset = c.one_based ? -1 : 0; + } // in spdlog, high numbers correspond to low log levels assert(c.verbosity > 0 && c.verbosity < 5); diff --git a/src/hictk/cli/cli_zoomify.cpp b/src/hictk/cli/cli_zoomify.cpp index 9c5fc534..fdfedac2 100644 --- a/src/hictk/cli/cli_zoomify.cpp +++ b/src/hictk/cli/cli_zoomify.cpp @@ -145,6 +145,12 @@ void Cli::validate_zoomify_subcommand() const { "--resolutions."); } + if (clr.bin_size() == 0) { // Variable bin size + errors.clear(); + warnings.clear(); + errors.emplace_back("zoomifying files with variable bin size is not currently supported."); + } + for (const auto& w : warnings) { SPDLOG_WARN(FMT_STRING("{}"), w); } diff --git a/src/hictk/convert/cool_to_hic.cpp b/src/hictk/convert/cool_to_hic.cpp index af723ceb..8b403b78 100644 --- a/src/hictk/convert/cool_to_hic.cpp +++ b/src/hictk/convert/cool_to_hic.cpp @@ -314,6 +314,12 @@ void cool_to_hic(const ConvertConfig& c) { c.path_to_input.string(), c.resolutions.front()); const cooler::File clr(uri); + + if (clr.bin_size() == 0) { + throw std::runtime_error( + "converting cooler files with variable bin size is not supported."); + } + dump_chrom_sizes(clr, chrom_sizes); dump_pixels(clr, pixels, c.gzip_compression_lvl, c.threads); } diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..985227fd 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -103,9 +103,13 @@ struct LoadConfig { std::string uri{}; std::filesystem::path path_to_chrom_sizes{}; + std::filesystem::path path_to_bin_table{}; std::uint32_t bin_size{}; + std::string format{}; std::string assembly{"unknown"}; + bool one_based{true}; + std::int64_t offset{0}; bool count_as_float{false}; bool assume_sorted{false}; bool force{false}; diff --git a/src/hictk/load/common.hpp b/src/hictk/load/common.hpp index e1bbb6dc..0763901a 100644 --- a/src/hictk/load/common.hpp +++ b/src/hictk/load/common.hpp @@ -31,20 +31,20 @@ enum class Format { COO, BG2, VP, _4DN }; template [[nodiscard]] inline ThinPixel parse_pixel(const BinTable& bins, std::string_view line, - Format format) { + Format format, std::int64_t offset) { ThinPixel pixel{}; switch (format) { case Format::COO: - pixel = ThinPixel::from_coo(bins, line); + pixel = ThinPixel::from_coo(bins, line, offset); break; case Format::BG2: - pixel = Pixel::from_bg2(bins, line).to_thin(); + pixel = Pixel::from_bg2(bins, line, offset).to_thin(); break; case Format::VP: - pixel = Pixel::from_validpair(bins, line).to_thin(); + pixel = Pixel::from_validpair(bins, line, offset).to_thin(); break; case Format::_4DN: - pixel = Pixel::from_4dn_pairs(bins, line).to_thin(); + pixel = Pixel::from_4dn_pairs(bins, line, offset).to_thin(); break; } if (pixel.bin1_id > pixel.bin2_id) { diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 1770f38f..aa86f5f7 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -37,22 +37,66 @@ namespace hictk::tools { -void ingest_pixels_sorted(const LoadConfig& c) { +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + std::uint32_t bin_size) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + return {chroms, bin_size}; +} + +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + const std::filesystem::path& path_to_bin_table) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + + std::ifstream ifs{}; + ifs.exceptions(std::ios::badbit); + ifs.open(path_to_bin_table); + + std::vector start_pos{}; + std::vector end_pos{}; + + std::string line{}; + GenomicInterval record{}; + bool fixed_bin_size = true; + std::uint32_t bin_size = 0; + while (std::getline(ifs, line)) { + record = GenomicInterval::parse_bed(chroms, line); + if (bin_size == 0) { + bin_size = record.size(); + } + + fixed_bin_size &= record.size() == bin_size || record.chrom().size() == record.end(); + + start_pos.push_back(record.start()); + end_pos.push_back(record.end()); + } + + if (fixed_bin_size) { + SPDLOG_INFO(FMT_STRING("detected bin table with uniform bin size.")); + return {chroms, bin_size}; + } + + SPDLOG_INFO(FMT_STRING("detected bin table with variable bin size.")); + return {chroms, start_pos, end_pos}; +} + +static void ingest_pixels_sorted(const LoadConfig& c) { assert(c.assume_sorted); auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); const auto format = format_from_string(c.format); c.count_as_float ? ingest_pixels_sorted( cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.batch_size, c.validate_pixels) + c.offset, c.batch_size, c.validate_pixels) : ingest_pixels_sorted( cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.batch_size, c.validate_pixels); + format, c.offset, c.batch_size, c.validate_pixels); } -void ingest_pixels_unsorted(const LoadConfig& c) { +static void ingest_pixels_unsorted(const LoadConfig& c) { assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -70,13 +114,12 @@ void ingest_pixels_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.front().count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); const auto nnz = ingest_pixels_unsorted(tmp_clr.create_cell(fmt::to_string(i)), - buffer, format, c.validate_pixels); + buffer, format, c.offset, c.validate_pixels); SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, tmp_cooler_path); if (nnz == 0) { @@ -92,22 +135,10 @@ void ingest_pixels_unsorted(const LoadConfig& c) { std::filesystem::remove(tmp_cooler_path); } -void ingest_pairs_sorted(const LoadConfig& c) { - assert(c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); - const auto format = format_from_string(c.format); - - c.count_as_float ? ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.batch_size, c.validate_pixels) - : ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.batch_size, c.validate_pixels); -} - -static void ingest_pairs_unsorted(const LoadConfig& c) { - assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); +static void ingest_pairs(const LoadConfig& c) { + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -125,14 +156,13 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.begin()->count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); - const auto nnz = ingest_pairs_unsorted(tmp_clr.create_cell(fmt::to_string(i)), - buffer, c.batch_size, format, c.validate_pixels); + const auto nnz = ingest_pairs(tmp_clr.create_cell(fmt::to_string(i)), buffer, + c.batch_size, format, c.offset, c.validate_pixels); SPDLOG_INFO(FMT_STRING("done writing chunk #{} to tmp file \"{}\"."), i + 1, tmp_cooler_path); @@ -154,6 +184,7 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { int load_subcmd(const LoadConfig& c) { const auto format = format_from_string(c.format); const auto pixel_has_count = format == Format::COO || format == Format::BG2; + const auto t0 = std::chrono::system_clock::now(); if (c.assume_sorted && pixel_has_count) { SPDLOG_INFO(FMT_STRING("begin loading presorted pixels...")); @@ -161,13 +192,23 @@ int load_subcmd(const LoadConfig& c) { } else if (!c.assume_sorted && pixel_has_count) { SPDLOG_INFO(FMT_STRING("begin loading un-sorted pixels...")); ingest_pixels_unsorted(c); - } else if (c.assume_sorted && !pixel_has_count) { - SPDLOG_INFO(FMT_STRING("begin loading presorted pairs...")); - ingest_pairs_sorted(c); - } else { - SPDLOG_INFO(FMT_STRING("begin loading un-sorted pairs...")); - ingest_pairs_unsorted(c); + } else if (!pixel_has_count) { + SPDLOG_INFO(FMT_STRING("begin loading pairs...")); + ingest_pairs(c); } + + const cooler::File clr(c.uri); + + const auto t1 = std::chrono::system_clock::now(); + const auto delta = std::chrono::duration_cast(t1 - t0).count(); + + std::visit( + [&](const auto& sum) { + SPDLOG_INFO(FMT_STRING("ingested {} interactions ({} nnz) in {}s!"), sum, clr.nnz(), + static_cast(delta) / 1.0e9); + }, // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + *clr.attributes().sum); + return 0; } diff --git a/src/hictk/load/load_pairs.hpp b/src/hictk/load/load_pairs.hpp index 69436123..05101e54 100644 --- a/src/hictk/load/load_pairs.hpp +++ b/src/hictk/load/load_pairs.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -33,161 +34,93 @@ struct PixelCmp { template class PairsAggregator { phmap::btree_set, PixelCmp> _buffer{}; - typename phmap::btree_set, PixelCmp>::const_iterator _it{}; const BinTable& _bins{}; Format _format{}; - ThinPixel _last{}; + ThinPixel _last_pixel{}; + std::string _line_buffer{}; + std::int64_t _offset{}; public: PairsAggregator() = delete; - inline PairsAggregator(const BinTable& bins, Format format) - : _it(_buffer.end()), _bins(bins), _format(format) {} + inline PairsAggregator(const BinTable& bins, Format format, std::int64_t offset) + : _bins(bins), _format(format), _offset(offset) { + while (std::getline(std::cin, _line_buffer)) { + if (!line_is_header(_line_buffer)) { + break; + } + } + if (!_line_buffer.empty()) { + _last_pixel = parse_pixel(_bins, _line_buffer, _format, _offset); + } + } - inline void read_next_chunk(std::vector>& buffer) { + inline bool read_next_chunk(std::vector>& buffer) { buffer.clear(); read_next_batch(buffer.capacity()); std::copy(_buffer.begin(), _buffer.end(), std::back_inserter(buffer)); _buffer.clear(); - } - [[nodiscard]] inline ThinPixel next() { - if (_it == _buffer.end()) { - read_next_batch(); - } - if (_buffer.empty()) { - auto last = _last; - _last = {}; - return last; - } - return *_it++; + return buffer.size() == buffer.capacity(); } private: - inline void read_next_batch() { - auto last_bin1 = _last.bin1_id; - std::string line{}; - - _buffer.clear(); - if (!!_last) { - _buffer.emplace(_last); - } + inline ThinPixel aggregate_pixel() { + assert(!!_last_pixel); - while (std::getline(std::cin, line)) { - if (line_is_header(line)) { + while (std::getline(std::cin, _line_buffer)) { + if (_line_buffer.empty()) { continue; } - auto pixel = parse_pixel(_bins, line, _format); - auto node = _buffer.find(pixel); - if (node != _buffer.end()) { - node->count += pixel.count; - } else { - _buffer.emplace(pixel); + auto p = parse_pixel(_bins, _line_buffer, _format, _offset); + if (p.bin1_id != _last_pixel.bin1_id || p.bin2_id != _last_pixel.bin2_id) { + std::swap(p, _last_pixel); + return p; } - - if (last_bin1 != ThinPixel::null_id && pixel.bin1_id != last_bin1) { - break; - } - last_bin1 = pixel.bin1_id; + _last_pixel.count += p.count; } - _it = _buffer.begin(); - if (_buffer.empty()) { - _last = {}; + ThinPixel p{}; + std::swap(p, _last_pixel); + return p; + } + + inline void insert_or_update(const ThinPixel& pixel) { + auto node = _buffer.find(pixel); + if (node != _buffer.end()) { + node->count += pixel.count; } else { - _last = *_buffer.rbegin(); - _buffer.erase(_last); + _buffer.emplace(pixel); } } inline void read_next_batch(std::size_t batch_size) { - std::string line{}; - _buffer.clear(); - while (std::getline(std::cin, line)) { - if (line_is_header(line)) { - continue; - } - - auto pixel = parse_pixel(_bins, line, _format); - auto node = _buffer.find(pixel); - if (node != _buffer.end()) { - node->count += pixel.count; - } else { - _buffer.emplace(pixel); - } - if (_buffer.size() == batch_size) { + while (!!_last_pixel) { + const auto pixel = aggregate_pixel(); + if (!pixel) { break; } - } - _it = _buffer.begin(); - } -}; - -template -inline void read_sort_and_aggregate_batch(PairsAggregator& aggregator, - std::vector>& buffer, - std::size_t batch_size) { - buffer.reserve(batch_size); - buffer.clear(); - - while (true) { - if (buffer.size() == batch_size) { - return; - } - - auto pixel = aggregator.next(); - if (!pixel) { - break; - } - buffer.emplace_back(std::move(pixel)); - } -} - -template -inline void ingest_pairs_sorted(cooler::File&& clr, Format format, std::size_t batch_size, - bool validate_pixels) { - PairsAggregator aggregator{clr.bins(), format}; - std::vector> buffer{}; - buffer.reserve(batch_size); - - std::size_t i0 = 0; - std::size_t i1 = i0; - try { - for (; true; ++i1) { - if (buffer.size() == batch_size) { - SPDLOG_INFO(FMT_STRING("processing chunk {}-{}..."), i0, i1); - clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); - buffer.clear(); - i0 = i1; - } + insert_or_update(pixel); - auto pixel = aggregator.next(); - if (!pixel) { + if (_buffer.size() == batch_size - 1) { + insert_or_update(_last_pixel); break; } - buffer.emplace_back(std::move(pixel)); - } - - if (!buffer.empty()) { - clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); } - } catch (const std::exception& e) { - throw std::runtime_error(fmt::format( - FMT_STRING("an error occurred while processing chunk {}-{}: {}"), i0, i1, e.what())); } -} +}; template -[[nodiscard]] inline std::uint64_t ingest_pairs_unsorted(cooler::File&& clr, - std::vector>& buffer, - std::size_t batch_size, Format format, - bool validate_pixels) { +[[nodiscard]] inline std::uint64_t ingest_pairs(cooler::File&& clr, + std::vector>& buffer, + std::size_t batch_size, Format format, + std::int64_t offset, bool validate_pixels) { buffer.reserve(batch_size); - PairsAggregator{clr.bins(), format}.read_next_chunk(buffer); + PairsAggregator{clr.bins(), format, offset}.read_next_chunk(buffer); if (buffer.empty()) { assert(std::cin.eof()); diff --git a/src/hictk/load/load_pixels.hpp b/src/hictk/load/load_pixels.hpp index e6ec599e..211f6102 100644 --- a/src/hictk/load/load_pixels.hpp +++ b/src/hictk/load/load_pixels.hpp @@ -20,7 +20,8 @@ namespace hictk::tools { template -inline void read_batch(const BinTable& bins, std::vector>& buffer, Format format) { +inline void read_batch(const BinTable& bins, std::vector>& buffer, Format format, + std::int64_t offset) { buffer.clear(); std::string line{}; try { @@ -28,7 +29,7 @@ inline void read_batch(const BinTable& bins, std::vector>& buffer, if (line_is_header(line)) { continue; } - buffer.emplace_back(parse_pixel(bins, line, format)); + buffer.emplace_back(parse_pixel(bins, line, format, offset)); if (buffer.size() == buffer.capacity()) { return; } @@ -43,15 +44,15 @@ inline void read_batch(const BinTable& bins, std::vector>& buffer, } template -inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::size_t batch_size, - bool validate_pixels) { +inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::int64_t offset, + std::size_t batch_size, bool validate_pixels) { std::vector> buffer(batch_size); std::size_t i = 0; try { for (; !std::cin.eof(); ++i) { SPDLOG_INFO(FMT_STRING("processing chunk #{}..."), i + 1); - read_batch(clr.bins(), buffer, format); + read_batch(clr.bins(), buffer, format, offset); clr.append_pixels(buffer.begin(), buffer.end(), validate_pixels); buffer.clear(); } @@ -69,10 +70,11 @@ inline void ingest_pixels_sorted(cooler::File&& clr, Format format, std::size_t template [[nodiscard]] inline std::size_t ingest_pixels_unsorted(cooler::File&& clr, std::vector>& buffer, - Format format, bool validate_pixels) { + Format format, std::int64_t offset, + bool validate_pixels) { assert(buffer.capacity() != 0); - read_batch(clr.bins(), buffer, format); + read_batch(clr.bins(), buffer, format, offset); if (buffer.empty()) { assert(std::cin.eof()); diff --git a/src/libhictk/bin_table/include/hictk/bin.hpp b/src/libhictk/bin_table/include/hictk/bin.hpp new file mode 100644 index 00000000..6431d221 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/genomic_interval.hpp" + +namespace hictk { + +class Bin { + public: + static constexpr std::uint64_t null_id{(std::numeric_limits::max)()}; + static constexpr std::uint32_t rel_null_id{(std::numeric_limits::max)()}; + + private: + std::uint64_t _id{null_id}; + std::uint32_t _rel_id{rel_null_id}; + GenomicInterval _interval{}; + + public: + constexpr Bin() = default; + Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; + Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_, + std::uint32_t end_) noexcept; + explicit Bin(GenomicInterval interval) noexcept; + Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; + + [[nodiscard]] explicit operator bool() const noexcept; + + [[nodiscard]] bool operator==(const Bin &other) const noexcept; + [[nodiscard]] bool operator!=(const Bin &other) const noexcept; + + [[nodiscard]] bool operator<(const Bin &other) const noexcept; + [[nodiscard]] bool operator<=(const Bin &other) const noexcept; + + [[nodiscard]] bool operator>(const Bin &other) const noexcept; + [[nodiscard]] bool operator>=(const Bin &other) const noexcept; + + [[nodiscard]] constexpr std::uint64_t id() const noexcept; + [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; + [[nodiscard]] const GenomicInterval &interval() const noexcept; + [[nodiscard]] const Chromosome &chrom() const noexcept; + [[nodiscard]] constexpr std::uint32_t start() const noexcept; + [[nodiscard]] constexpr std::uint32_t end() const noexcept; + + [[nodiscard]] constexpr bool has_null_id() const noexcept; +}; + +} // namespace hictk + +namespace std { +template <> +struct hash { + size_t operator()(const hictk::Bin &b) const; +}; +} // namespace std + +#include "./impl/bin_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table.hpp b/src/libhictk/bin_table/include/hictk/bin_table.hpp index 9c22414d..ce5b5791 100644 --- a/src/libhictk/bin_table/include/hictk/bin_table.hpp +++ b/src/libhictk/bin_table/include/hictk/bin_table.hpp @@ -8,69 +8,27 @@ #include #include #include +#include #include +#include "hictk/bin_table_fixed.hpp" +#include "hictk/bin_table_variable.hpp" #include "hictk/common.hpp" #include "hictk/genomic_interval.hpp" #include "hictk/reference.hpp" - namespace hictk { -class Bin { - public: - static constexpr std::uint64_t null_id{(std::numeric_limits::max)()}; - static constexpr std::uint32_t rel_null_id{(std::numeric_limits::max)()}; - - private: - std::uint64_t _id{null_id}; - std::uint32_t _rel_id{rel_null_id}; - GenomicInterval _interval{}; - - public: - constexpr Bin() = default; - Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; - Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_, - std::uint32_t end_) noexcept; - explicit Bin(GenomicInterval interval) noexcept; - Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; - - [[nodiscard]] explicit operator bool() const noexcept; - - [[nodiscard]] bool operator==(const Bin &other) const noexcept; - [[nodiscard]] bool operator!=(const Bin &other) const noexcept; - - [[nodiscard]] bool operator<(const Bin &other) const noexcept; - [[nodiscard]] bool operator<=(const Bin &other) const noexcept; - - [[nodiscard]] bool operator>(const Bin &other) const noexcept; - [[nodiscard]] bool operator>=(const Bin &other) const noexcept; - - [[nodiscard]] constexpr std::uint64_t id() const noexcept; - [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; - [[nodiscard]] const GenomicInterval &interval() const noexcept; - [[nodiscard]] const Chromosome &chrom() const noexcept; - [[nodiscard]] constexpr std::uint32_t start() const noexcept; - [[nodiscard]] constexpr std::uint32_t end() const noexcept; - - [[nodiscard]] constexpr bool has_null_id() const noexcept; -}; - -struct BinTableConcrete { - std::vector chroms{}; - std::vector bin_starts{}; - std::vector bin_ends{}; -}; - class BinTable { - Reference _chroms{}; - std::vector _num_bins_prefix_sum{}; - std::uint32_t _bin_size{std::numeric_limits::max()}; + std::variant> _table{BinTableFixed{}}; public: + using BinTableVar = decltype(_table); class iterator; friend iterator; BinTable() = default; + template + BinTable(BinTableT table); BinTable(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset = 0); template BinTable(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, @@ -78,12 +36,16 @@ class BinTable { template BinTable(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset = 0); + template + BinTable(Reference chroms, const std::vector &start_pos, const std::vector &end_pos, + I bin_offset = 0); [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] bool empty() const noexcept; [[nodiscard]] std::size_t num_chromosomes() const; [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + [[nodiscard]] constexpr bool has_fixed_bin_size() const noexcept; [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; @@ -123,30 +85,26 @@ class BinTable { [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; - [[nodiscard]] BinTableConcrete concretize() const; - [[nodiscard]] bool operator==(const BinTable &other) const; [[nodiscard]] bool operator!=(const BinTable &other) const; - private: - [[nodiscard]] static std::vector compute_num_bins_prefix_sum( - const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset); + template + [[nodiscard]] constexpr const BinTableT &get() const; + template + [[nodiscard]] constexpr BinTableT &get(); + [[nodiscard]] constexpr auto get() const noexcept -> const BinTableVar &; + [[nodiscard]] constexpr auto get() noexcept -> BinTableVar &; - public: class iterator { friend BinTable; - const BinTable *_bin_table{}; - std::size_t _chrom_bin_id{}; - std::uint32_t _rel_bin_id{}; - std::uint32_t _chrom_id{}; - - static constexpr auto null_rel_bin_id = (std::numeric_limits::max)(); - static constexpr auto null_bin_id = (std::numeric_limits::max)(); - static constexpr auto nchrom = (std::numeric_limits::max)(); + std::variant::iterator> _it{ + BinTableFixed::iterator{}}; - explicit iterator(const BinTable &bin_table) noexcept; + template + explicit iterator(It it) noexcept; public: + using IteratorVar = decltype(_it); using difference_type = std::ptrdiff_t; using value_type = Bin; using pointer = value_type *; @@ -155,12 +113,12 @@ class BinTable { constexpr iterator() noexcept = default; - constexpr bool operator==(const iterator &other) const noexcept; - constexpr bool operator!=(const iterator &other) const noexcept; - constexpr bool operator<(const iterator &other) const noexcept; - constexpr bool operator<=(const iterator &other) const noexcept; - constexpr bool operator>(const iterator &other) const noexcept; - constexpr bool operator>=(const iterator &other) const noexcept; + constexpr bool operator==(const iterator &other) const; + constexpr bool operator!=(const iterator &other) const; + constexpr bool operator<(const iterator &other) const; + constexpr bool operator<=(const iterator &other) const; + constexpr bool operator>(const iterator &other) const; + constexpr bool operator>=(const iterator &other) const; auto operator*() const -> value_type; auto operator[](std::size_t i) const -> iterator; @@ -176,25 +134,15 @@ class BinTable { auto operator-(std::size_t i) const -> iterator; auto operator-(const iterator &other) const -> difference_type; - private: - [[nodiscard]] static auto make_end_iterator(const BinTable &table) noexcept -> iterator; - [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; - [[nodiscard]] const Chromosome &chromosome() const; - [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; - [[nodiscard]] constexpr std::size_t bin_id() const noexcept; - [[nodiscard]] std::uint32_t compute_num_chrom_bins() const noexcept; - [[nodiscard]] std::size_t compute_bin_offset() const noexcept; - [[nodiscard]] std::size_t num_chromosomes() const noexcept; + template + [[nodiscard]] constexpr const IteratorT &get() const; + template + [[nodiscard]] constexpr IteratorT &get(); + [[nodiscard]] constexpr auto get() const noexcept -> const IteratorVar &; + [[nodiscard]] constexpr auto get() noexcept -> IteratorVar &; }; }; } // namespace hictk -namespace std { -template <> -struct hash { - size_t operator()(const hictk::Bin &b) const; -}; -} // namespace std - #include "./impl/bin_table_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp b/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp new file mode 100644 index 00000000..be4cb917 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin_table_fixed.hpp @@ -0,0 +1,146 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/bin.hpp" +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/reference.hpp" + +namespace hictk { + +class BinTableFixed { + Reference _chroms{}; + std::vector _num_bins_prefix_sum{}; + std::uint32_t _bin_size{std::numeric_limits::max()}; + + public: + class iterator; + friend iterator; + + BinTableFixed() = default; + BinTableFixed(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset = 0); + template + BinTableFixed(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, + std::size_t bin_offset = 0); + template + BinTableFixed(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, + ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset = 0); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const; + [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; + [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + + [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; + + [[nodiscard]] auto begin() const -> iterator; + [[nodiscard]] auto end() const -> iterator; + [[nodiscard]] auto cbegin() const -> iterator; + [[nodiscard]] auto cend() const -> iterator; + + [[nodiscard]] BinTableFixed subset(const Chromosome &chrom) const; + [[nodiscard]] BinTableFixed subset(std::string_view chrom_name) const; + [[nodiscard]] BinTableFixed subset(std::uint32_t chrom_id) const; + + [[nodiscard]] auto find_overlap(const GenomicInterval &query) const + -> std::pair; + [[nodiscard]] auto find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + + // Map bin_id to Bin + [[nodiscard]] Bin at(std::uint64_t bin_id) const; + [[nodiscard]] std::pair at(const GenomicInterval &gi) const; + [[nodiscard]] Bin at(const Chromosome &chrom, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::string_view chrom_name, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::uint32_t chrom_id, std::uint32_t pos) const; + [[nodiscard]] Bin at_hint(std::uint64_t bin_id, const Chromosome &chrom) const; + + // Map genomic coords to bin_id + [[nodiscard]] std::pair map_to_bin_ids( + const GenomicInterval &gi) const; + [[nodiscard]] std::uint64_t map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; + + [[nodiscard]] bool operator==(const BinTableFixed &other) const; + [[nodiscard]] bool operator!=(const BinTableFixed &other) const; + + private: + [[nodiscard]] static std::vector compute_num_bins_prefix_sum( + const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset); + + public: + class iterator { + friend BinTableFixed; + const BinTableFixed *_bin_table{}; + std::size_t _chrom_bin_id{}; + std::uint32_t _rel_bin_id{}; + std::uint32_t _chrom_id{}; + + static constexpr auto null_rel_bin_id = (std::numeric_limits::max)(); + static constexpr auto null_bin_id = (std::numeric_limits::max)(); + + explicit iterator(const BinTableFixed &bin_table) noexcept; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Bin; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::random_access_iterator_tag; + + constexpr iterator() noexcept = default; + + constexpr bool operator==(const iterator &other) const noexcept; + constexpr bool operator!=(const iterator &other) const noexcept; + constexpr bool operator<(const iterator &other) const noexcept; + constexpr bool operator<=(const iterator &other) const noexcept; + constexpr bool operator>(const iterator &other) const noexcept; + constexpr bool operator>=(const iterator &other) const noexcept; + + auto operator*() const -> value_type; + auto operator[](std::size_t i) const -> iterator; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + auto operator+=(std::size_t i) -> iterator &; + auto operator+(std::size_t i) const -> iterator; + + auto operator--() -> iterator &; + auto operator--(int) -> iterator; + auto operator-=(std::size_t i) -> iterator &; + auto operator-(std::size_t i) const -> iterator; + auto operator-(const iterator &other) const -> difference_type; + + private: + [[nodiscard]] static auto make_end_iterator(const BinTableFixed &table) noexcept -> iterator; + [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; + [[nodiscard]] const Chromosome &chromosome() const; + [[nodiscard]] constexpr std::uint32_t bin_size() const noexcept; + [[nodiscard]] constexpr std::size_t bin_id() const noexcept; + [[nodiscard]] std::uint32_t compute_num_chrom_bins() const noexcept; + [[nodiscard]] std::size_t compute_bin_offset() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const noexcept; + }; +}; + +} // namespace hictk + +#include "./impl/bin_table_fixed_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp new file mode 100644 index 00000000..0016410e --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/bin_table_variable.hpp @@ -0,0 +1,136 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +#include "hictk/bin.hpp" +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/reference.hpp" + +namespace hictk { + +template +class BinTableVariable { + Reference _chroms{}; + std::vector _bin_end_prefix_sum{0}; + std::vector _num_bins_prefix_sum{0}; + + public: + class iterator; + friend iterator; + + BinTableVariable() = default; + BinTableVariable(Reference chroms, const std::vector &start_pos, const std::vector &end_pos, + I bin_offset = 0); + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + [[nodiscard]] std::size_t num_chromosomes() const; + [[nodiscard]] constexpr const Reference &chromosomes() const noexcept; + + [[nodiscard]] constexpr const std::vector &num_bin_prefix_sum() const noexcept; + + [[nodiscard]] auto begin() const -> iterator; + [[nodiscard]] auto end() const -> iterator; + [[nodiscard]] auto cbegin() const -> iterator; + [[nodiscard]] auto cend() const -> iterator; + + [[nodiscard]] BinTableVariable subset(const Chromosome &chrom) const; + [[nodiscard]] BinTableVariable subset(std::string_view chrom_name) const; + [[nodiscard]] BinTableVariable subset(std::uint32_t chrom_id) const; + + [[nodiscard]] auto find_overlap(const GenomicInterval &query) const + -> std::pair; + [[nodiscard]] auto find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + [[nodiscard]] auto find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair; + + // Map bin_id to Bin + [[nodiscard]] Bin at(std::uint64_t bin_id) const; + [[nodiscard]] std::pair at(const GenomicInterval &gi) const; + [[nodiscard]] Bin at(const Chromosome &chrom, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::string_view chrom_name, std::uint32_t pos = 0) const; + [[nodiscard]] Bin at(std::uint32_t chrom_id, std::uint32_t pos) const; + [[nodiscard]] Bin at_hint(std::uint64_t bin_id, const Chromosome &chrom) const; + + // Map genomic coords to bin_id + [[nodiscard]] std::pair map_to_bin_ids( + const GenomicInterval &gi) const; + [[nodiscard]] std::uint64_t map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const; + [[nodiscard]] std::uint64_t map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const; + + [[nodiscard]] bool operator==(const BinTableVariable &other) const; + [[nodiscard]] bool operator!=(const BinTableVariable &other) const; + + private: + static void validate_bin_coords(const std::vector &start_pos, const std::vector &end_pos); + + public: + class iterator { + friend BinTableVariable; + Bin _value{}; + const BinTableVariable *_bin_table{}; + std::uint32_t _chrom_id{}; + std::uint64_t _bin_id{}; + + static constexpr auto null_bin_id = (std::numeric_limits::max)(); + static constexpr auto nchrom = (std::numeric_limits::max)(); + + explicit iterator(const BinTableVariable &bin_table) noexcept; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Bin; + using pointer = value_type *; + using reference = value_type &; + using iterator_category = std::random_access_iterator_tag; + + constexpr iterator() noexcept = default; + + constexpr bool operator==(const iterator &other) const noexcept; + constexpr bool operator!=(const iterator &other) const noexcept; + constexpr bool operator<(const iterator &other) const noexcept; + constexpr bool operator<=(const iterator &other) const noexcept; + constexpr bool operator>(const iterator &other) const noexcept; + constexpr bool operator>=(const iterator &other) const noexcept; + + auto operator*() const noexcept -> value_type; + auto operator[](std::size_t i) const -> iterator; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + auto operator+=(std::size_t i) -> iterator &; + auto operator+(std::size_t i) const -> iterator; + + auto operator--() -> iterator &; + auto operator--(int) -> iterator; + auto operator-=(std::size_t i) -> iterator &; + auto operator-(std::size_t i) const -> iterator; + auto operator-(const iterator &other) const -> difference_type; + + private: + [[nodiscard]] static auto make_end_iterator(const BinTableVariable &table) noexcept -> iterator; + [[nodiscard]] const Chromosome &chromosome(std::uint32_t chrom_id) const; + [[nodiscard]] const Chromosome &chromosome() const; + [[nodiscard]] Bin get_bin() const; + }; +}; + +} // namespace hictk + +#include "./impl/bin_table_variable_impl.hpp" diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp new file mode 100644 index 00000000..c130933d --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_impl.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "hictk/chromosome.hpp" +#include "hictk/genomic_interval.hpp" + +namespace hictk { + +inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept + : Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {} + +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, + std::uint32_t start_, std::uint32_t end_) noexcept + : _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {} + +inline Bin::Bin(GenomicInterval interval) noexcept + : Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {} + +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept + : _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {} + +inline Bin::operator bool() const noexcept { return !!chrom(); } + +inline bool Bin::operator==(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() == other.id(); + } + return _interval == other._interval; +} +inline bool Bin::operator!=(const Bin &other) const noexcept { return !(*this == other); } + +inline bool Bin::operator<(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() < other.id(); + } + return _interval < other._interval; +} + +inline bool Bin::operator<=(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() <= other.id(); + } + return _interval <= other._interval; +} + +inline bool Bin::operator>(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() > other.id(); + } + return _interval > other._interval; +} + +inline bool Bin::operator>=(const Bin &other) const noexcept { + if (!has_null_id() && !other.has_null_id()) { + return id() >= other.id(); + } + return _interval >= other._interval; +} + +constexpr std::uint64_t Bin::id() const noexcept { return _id; } +constexpr std::uint32_t Bin::rel_id() const noexcept { return _rel_id; } +inline const GenomicInterval &Bin::interval() const noexcept { return _interval; } +inline const Chromosome &Bin::chrom() const noexcept { return interval().chrom(); } +constexpr std::uint32_t Bin::start() const noexcept { return _interval.start(); } +constexpr std::uint32_t Bin::end() const noexcept { return _interval.end(); } + +constexpr bool Bin::has_null_id() const noexcept { return id() == Bin::null_id; } + +}; // namespace hictk + +inline std::size_t std::hash::operator()(const hictk::Bin &b) const { + return hictk::internal::hash_combine(0, b.id(), b.interval()); +} diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp new file mode 100644 index 00000000..86d9bd66 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_fixed_impl.hpp @@ -0,0 +1,457 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/suppress_warnings.hpp" + +namespace hictk { // NOLINT + +inline BinTableFixed::BinTableFixed(Reference chroms, std::uint32_t bin_size, + std::size_t bin_offset) + : _chroms(std::move(chroms)), + _num_bins_prefix_sum(compute_num_bins_prefix_sum(_chroms, bin_size, bin_offset)), + _bin_size(bin_size) { + assert(!_chroms.empty()); + assert(bin_size != 0); +} + +template +inline BinTableFixed::BinTableFixed(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, + std::size_t bin_offset) + : BinTableFixed(Reference(first_chrom, last_chrom), bin_size, bin_offset) {} + +template +inline BinTableFixed::BinTableFixed(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, + ChromSizeIt first_chrom_size, std::uint32_t bin_size, + std::size_t bin_offset) + : BinTableFixed(Reference(first_chrom_name, last_chrom_name, first_chrom_size), bin_size, + bin_offset) {} + +inline std::size_t BinTableFixed::size() const noexcept { + if (_num_bins_prefix_sum.empty()) { + return 0; + } + return conditional_static_cast(_num_bins_prefix_sum.back() - + _num_bins_prefix_sum.front()); +} + +inline bool BinTableFixed::empty() const noexcept { return size() == 0; } + +inline std::size_t BinTableFixed::num_chromosomes() const { return _chroms.size(); } + +constexpr std::uint32_t BinTableFixed::bin_size() const noexcept { return _bin_size; } + +constexpr const Reference &BinTableFixed::chromosomes() const noexcept { return _chroms; } + +constexpr const std::vector &BinTableFixed::num_bin_prefix_sum() const noexcept { + return _num_bins_prefix_sum; +} + +inline auto BinTableFixed::begin() const -> iterator { return iterator(*this); } +inline auto BinTableFixed::end() const -> iterator { return iterator::make_end_iterator(*this); } +inline auto BinTableFixed::cbegin() const -> iterator { return begin(); } +inline auto BinTableFixed::cend() const -> iterator { return end(); } + +inline bool BinTableFixed::operator==(const BinTableFixed &other) const { + return _bin_size == other._bin_size && _chroms == other._chroms; +} +inline bool BinTableFixed::operator!=(const BinTableFixed &other) const { + return !(*this == other); +} + +inline BinTableFixed BinTableFixed::subset(const Chromosome &chrom) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + const auto offset = at(chrom, 0).id(); + return {Reference{chrom}, _bin_size, offset}; +} +inline BinTableFixed BinTableFixed::subset(std::string_view chrom_name) const { + return subset(_chroms.at(chrom_name)); +} +inline BinTableFixed BinTableFixed::subset(std::uint32_t chrom_id) const { + return subset(_chroms.at(chrom_id)); +} + +inline auto BinTableFixed::find_overlap(const GenomicInterval &query) const + -> std::pair { + return find_overlap(query.chrom(), query.start(), query.end()); +} + +inline auto BinTableFixed::find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + assert(start < end); + + const auto bin1_id = at(chrom, start).id(); + const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); + + return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); +} +inline auto BinTableFixed::find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + return find_overlap(_chroms.at(chrom_name), start, end); +} +inline auto BinTableFixed::find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair { + return find_overlap(_chroms.at(chrom_id), start, end); +} + +inline Bin BinTableFixed::at(std::uint64_t bin_id) const { + // I tried benchmarking linear search as well as std::set (including third-party implementations). + // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes + // (e.g. 5-100) and have fairly similar performance. + // Linear search is however better in practice because chromosomes are usually sorted by (approx.) + // size, with unplaced scaffolds etc. ending up last. + auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + [&](const auto n) { return n > bin_id; }); + + if (match == _num_bins_prefix_sum.end()) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + assert(match != _num_bins_prefix_sum.begin()); + + const auto chrom_id = + static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); + return at_hint(bin_id, _chroms[chrom_id]); +} + +inline Bin BinTableFixed::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + const auto offset = _num_bins_prefix_sum[chrom.id()]; + const auto relative_bin_id = bin_id - offset; + const auto start = static_cast(relative_bin_id * bin_size()); + assert(start < chrom.size()); + const auto end = (std::min)(start + bin_size(), chrom.size()); + + return {bin_id, static_cast(relative_bin_id), chrom, start, end}; +} + +inline std::pair BinTableFixed::at(const GenomicInterval &gi) const { + const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); + return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); +} +inline Bin BinTableFixed::at(const Chromosome &chrom, std::uint32_t pos) const { + return at_hint(map_to_bin_id(chrom, pos), chrom); +} +inline Bin BinTableFixed::at(std::string_view chrom_name, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_name, pos)); +} +inline Bin BinTableFixed::at(std::uint32_t chrom_id, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_id, pos)); +} + +inline std::pair BinTableFixed::map_to_bin_ids( + const GenomicInterval &gi) const { + return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), + map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(const Chromosome &chrom, + std::uint32_t pos) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + + if (pos >= chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("position is greater than chromosome size: {} >= {}"), pos, chrom.size())); + } + + const auto bin_offset = _num_bins_prefix_sum[chrom.id()] - _num_bins_prefix_sum.front(); + + return bin_offset + static_cast(pos / bin_size()); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(std::string_view chrom_name, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_name), pos); +} + +inline std::uint64_t BinTableFixed::map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_id), pos); +} + +inline std::vector BinTableFixed::compute_num_bins_prefix_sum( + const Reference &chroms, std::uint32_t bin_size, std::size_t bin_offset) { + assert(bin_size != 0); + + DISABLE_WARNING_PUSH + DISABLE_WARNING_NULL_DEREF + std::vector prefix_sum(chroms.size() + 1); + prefix_sum.front() = bin_offset; + DISABLE_WARNING_POP + + // I am using transform instead of inclusive_scan because the latter is not always available + std::transform(chroms.begin(), chroms.end(), prefix_sum.begin() + 1, + [&, sum = bin_offset](const Chromosome &chrom) mutable { + if (chrom.is_all()) { + return sum; + } + const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; + return sum += static_cast(num_bins); + }); + + return prefix_sum; +} + +inline BinTableFixed::iterator::iterator(const BinTableFixed &bin_table) noexcept + : _bin_table{&bin_table}, _chrom_bin_id(_bin_table->_num_bins_prefix_sum.front()) { + if (_bin_table->chromosomes().at(_chrom_id).is_all()) { + _chrom_id++; + } +} + +constexpr bool BinTableFixed::iterator::operator==(const iterator &other) const noexcept { + // clang-format off + return _bin_table == other._bin_table && + _chrom_id == other._chrom_id && + _rel_bin_id == other._rel_bin_id; + // clang-format on +} + +constexpr bool BinTableFixed::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +constexpr bool BinTableFixed::iterator::operator<(const iterator &other) const noexcept { + return bin_id() < other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator<=(const iterator &other) const noexcept { + return bin_id() <= other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator>(const iterator &other) const noexcept { + return bin_id() > other.bin_id(); +} + +constexpr bool BinTableFixed::iterator::operator>=(const iterator &other) const noexcept { + return bin_id() >= other.bin_id(); +} + +inline auto BinTableFixed::iterator::make_end_iterator(const BinTableFixed &table) noexcept + -> iterator { + iterator it(table); + + it._chrom_id = static_cast(table.chromosomes().size()); + it._rel_bin_id = null_rel_bin_id; + return it; +} + +inline auto BinTableFixed::iterator::operator*() const -> value_type { + assert(_bin_table); + + const auto &chrom = chromosome(); + const auto bin_size = this->bin_size(); + + const auto start = std::min(_rel_bin_id * bin_size, chrom.size()); + const auto end = std::min(start + bin_size, chrom.size()); + + return value_type{bin_id(), _rel_bin_id, chrom, start, end}; +} + +inline auto BinTableFixed::iterator::operator++() -> iterator & { + assert(_bin_table); + if (_chrom_id >= _bin_table->chromosomes().size()) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + + if (++_rel_bin_id >= compute_num_chrom_bins()) { + if (_chrom_id + 1 >= num_chromosomes()) { + return *this = make_end_iterator(*_bin_table); + } + ++_chrom_id; + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = 0; + } + + return *this; +} + +inline auto BinTableFixed::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + +inline auto BinTableFixed::iterator::operator+=(std::size_t i) -> iterator & { + assert(_bin_table); + if (i == 0) { + return *this; + } + + if (bin_id() + i > _bin_table->size()) { + throw std::out_of_range( + "BinTableFixed::iterator: caught attempt to increment iterator past end()"); + } + if (bin_id() + i == _bin_table->size()) { + *this = make_end_iterator(*_bin_table); + return *this; + } + + const auto ii = static_cast(i); + const auto num_bins = compute_num_chrom_bins(); + if (_rel_bin_id + ii < num_bins) { + _rel_bin_id += ii; + return *this; + } + + _chrom_id++; + _chrom_bin_id = compute_bin_offset(); + i -= (num_bins - _rel_bin_id); + _rel_bin_id = 0; + return *this += i; +} + +inline auto BinTableFixed::iterator::operator+(std::size_t i) const -> iterator { + auto it = *this; + return it += i; +} + +inline auto BinTableFixed::iterator::operator--() -> iterator & { + assert(_bin_table); + if (bin_id() == 0) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + + if (_rel_bin_id == null_rel_bin_id) { + assert(*this == make_end_iterator(*_bin_table)); + _chrom_id = static_cast(num_chromosomes() - 1); + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins() - 1; + return *this; + } + + if (_rel_bin_id-- == 0) { + _chrom_id--; + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins() - 1; + } + + return *this; +} + +inline auto BinTableFixed::iterator::operator--(int) -> iterator { + auto it = *this; + std::ignore = --(*this); + return it; +} + +inline auto BinTableFixed::iterator::operator-=(std::size_t i) -> iterator & { + assert(_bin_table); + + if (i == 0) { + return *this; + } + + if (bin_id() < i) { + throw std::out_of_range( + "BinTableFixed::iterator: caught attempt to decrement iterator past begin()"); + } + + if (_rel_bin_id == null_rel_bin_id) { + assert(*this == make_end_iterator(*_bin_table)); + _chrom_id = static_cast(num_chromosomes() - 1); + _chrom_bin_id = compute_bin_offset(); + _rel_bin_id = compute_num_chrom_bins(); + return *this -= i; + } + + if (i <= _rel_bin_id) { + _rel_bin_id -= static_cast(i); + return *this; + } + + _chrom_id--; + _chrom_bin_id = compute_bin_offset(); + i -= _rel_bin_id; + _rel_bin_id = compute_num_chrom_bins(); + return *this -= i; +} + +inline auto BinTableFixed::iterator::operator-(std::size_t i) const -> iterator { + auto it = *this; + return it -= i; +} + +inline auto BinTableFixed::iterator::operator-(const iterator &other) const -> difference_type { + assert(_bin_table); + assert(other._bin_table); + + const auto offset1 = _chrom_id == _bin_table->num_chromosomes() ? _bin_table->size() : bin_id(); + const auto offset2 = other._chrom_id == other._bin_table->num_chromosomes() + ? other._bin_table->size() + : other.bin_id(); + + return static_cast(offset1) - static_cast(offset2); +} + +inline auto BinTableFixed::iterator::operator[](std::size_t i) const -> iterator { + return (*this + i); +} + +inline const Chromosome &BinTableFixed::iterator::chromosome() const { + return chromosome(_chrom_id); +} + +constexpr std::uint32_t BinTableFixed::iterator::bin_size() const noexcept { + assert(_bin_table); + return _bin_table->bin_size(); +} + +constexpr std::size_t BinTableFixed::iterator::bin_id() const noexcept { + return _chrom_bin_id + _rel_bin_id; +} + +inline const Chromosome &BinTableFixed::iterator::chromosome(std::uint32_t chrom_id) const { + return _bin_table->chromosomes().at(chrom_id); +} + +inline std::uint32_t BinTableFixed::iterator::compute_num_chrom_bins() const noexcept { + assert(_bin_table); + + const auto chrom_size = chromosome().size(); + const auto bin_size = this->bin_size(); + + return (chrom_size + bin_size - 1) / bin_size; +} + +inline std::size_t BinTableFixed::iterator::compute_bin_offset() const noexcept { + return _bin_table->at(_chrom_id, 0).id(); +} + +inline std::size_t BinTableFixed::iterator::num_chromosomes() const noexcept { + assert(_bin_table); + + return _bin_table->num_chromosomes(); +} + +} // namespace hictk diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp index c8b04a1c..2ca0e05c 100644 --- a/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_impl.hpp @@ -1,517 +1,328 @@ -// Copyright (C) 2022 Roberto Rossini +// Copyright (C) 2023 Roberto Rossini // // SPDX-License-Identifier: MIT #pragma once -#include - -#include -#include #include #include -#include +#include #include -#include -#include #include +#include "hictk/bin.hpp" #include "hictk/common.hpp" #include "hictk/genomic_interval.hpp" -#include "hictk/suppress_warnings.hpp" - -namespace hictk { // NOLINT - -inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept - : Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {} - -inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, - std::uint32_t start_, std::uint32_t end_) noexcept - : _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {} - -inline Bin::Bin(GenomicInterval interval) noexcept - : Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {} - -inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept - : _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {} - -inline Bin::operator bool() const noexcept { return !!chrom(); } - -inline bool Bin::operator==(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() == other.id(); - } - return _interval == other._interval; -} -inline bool Bin::operator!=(const Bin &other) const noexcept { return !(*this == other); } - -inline bool Bin::operator<(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() < other.id(); - } - return _interval < other._interval; -} - -inline bool Bin::operator<=(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() <= other.id(); - } - return _interval <= other._interval; -} +#include "hictk/reference.hpp" +#include "hictk/type_traits.hpp" -inline bool Bin::operator>(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() > other.id(); - } - return _interval > other._interval; -} - -inline bool Bin::operator>=(const Bin &other) const noexcept { - if (!has_null_id() && !other.has_null_id()) { - return id() >= other.id(); - } - return _interval >= other._interval; -} - -constexpr std::uint64_t Bin::id() const noexcept { return _id; } -constexpr std::uint32_t Bin::rel_id() const noexcept { return _rel_id; } -inline const GenomicInterval &Bin::interval() const noexcept { return _interval; } -inline const Chromosome &Bin::chrom() const noexcept { return interval().chrom(); } -constexpr std::uint32_t Bin::start() const noexcept { return _interval.start(); } -constexpr std::uint32_t Bin::end() const noexcept { return _interval.end(); } - -constexpr bool Bin::has_null_id() const noexcept { return id() == Bin::null_id; } +namespace hictk { +template +inline BinTable::BinTable(BinTableT table) : _table(std::move(table)) {} inline BinTable::BinTable(Reference chroms, std::uint32_t bin_size, std::size_t bin_offset) - : _chroms(std::move(chroms)), - _num_bins_prefix_sum(compute_num_bins_prefix_sum(_chroms, bin_size, bin_offset)), - _bin_size(bin_size) { - assert(bin_size != 0); -} + : BinTable(BinTableFixed(std::move(chroms), bin_size, bin_offset)) {} template inline BinTable::BinTable(ChromIt first_chrom, ChromIt last_chrom, std::uint32_t bin_size, std::size_t bin_offset) - : BinTable(Reference(first_chrom, last_chrom), bin_size, bin_offset) {} + : BinTable(BinTableFixed(first_chrom, last_chrom, bin_size, bin_offset)) {} template inline BinTable::BinTable(ChromNameIt first_chrom_name, ChromNameIt last_chrom_name, ChromSizeIt first_chrom_size, std::uint32_t bin_size, std::size_t bin_offset) - : BinTable(Reference(first_chrom_name, last_chrom_name, first_chrom_size), bin_size, - bin_offset) {} + : BinTable(BinTableFixed(first_chrom_name, last_chrom_name, first_chrom_size, bin_size, + bin_offset)) {} + +template +inline BinTable::BinTable(Reference chroms, const std::vector &start_pos, + const std::vector &end_pos, I bin_offset) + : BinTable(BinTableVariable(std::move(chroms), start_pos, end_pos, bin_offset)) {} inline std::size_t BinTable::size() const noexcept { - if (_num_bins_prefix_sum.empty()) { - return 0; - } - return conditional_static_cast(_num_bins_prefix_sum.back() - - _num_bins_prefix_sum.front()); + return std::visit([&](const auto &t) { return t.size(); }, _table); } inline bool BinTable::empty() const noexcept { return size() == 0; } -inline std::size_t BinTable::num_chromosomes() const { return _chroms.size(); } - -constexpr std::uint32_t BinTable::bin_size() const noexcept { return _bin_size; } - -constexpr const Reference &BinTable::chromosomes() const noexcept { return _chroms; } - -constexpr const std::vector &BinTable::num_bin_prefix_sum() const noexcept { - return _num_bins_prefix_sum; +inline std::size_t BinTable::num_chromosomes() const { + return std::visit([&](const auto &t) { return t.num_chromosomes(); }, _table); } -inline auto BinTable::begin() const -> iterator { return iterator(*this); } -inline auto BinTable::end() const -> iterator { return iterator::make_end_iterator(*this); } -inline auto BinTable::cbegin() const -> iterator { return begin(); } -inline auto BinTable::cend() const -> iterator { return end(); } - -constexpr std::uint32_t BinTable::iterator::bin_size() const noexcept { - assert(_bin_table); - return _bin_table->bin_size(); +constexpr std::uint32_t BinTable::bin_size() const noexcept { + if (std::holds_alternative(_table)) { + return std::get(_table).bin_size(); + } + return 0; } -constexpr std::size_t BinTable::iterator::bin_id() const noexcept { - return _chrom_bin_id + _rel_bin_id; +constexpr const Reference &BinTable::chromosomes() const noexcept { + return std::visit([&](const auto &t) -> const Reference & { return t.chromosomes(); }, _table); } -inline BinTableConcrete BinTable::concretize() const { - std::vector chroms(size()); - std::vector starts(size()); - std::vector ends(size()); +constexpr bool BinTable::has_fixed_bin_size() const noexcept { + return std::holds_alternative(_table); +} - std::size_t i = 0; - for (const auto &bin : *this) { - chroms[i] = bin.chrom(); - starts[i] = bin.start(); - ends[i++] = bin.end(); - } - assert(i == chroms.size()); +constexpr const std::vector &BinTable::num_bin_prefix_sum() const noexcept { + return std::visit( + [&](const auto &t) -> const std::vector & { return t.num_bin_prefix_sum(); }, + _table); +} - return BinTableConcrete{chroms, starts, ends}; +inline auto BinTable::begin() const -> iterator { + return std::visit([&](const auto &t) { return iterator{t.begin()}; }, _table); } -inline bool BinTable::operator==(const BinTable &other) const { - return _bin_size == other._bin_size && _chroms == other._chroms; +inline auto BinTable::end() const -> iterator { + return std::visit([&](const auto &t) { return iterator{t.end()}; }, _table); } -inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); } + +inline auto BinTable::cbegin() const -> iterator { return begin(); } + +inline auto BinTable::cend() const -> iterator { return end(); } inline BinTable BinTable::subset(const Chromosome &chrom) const { - // GCC8 fails to compile when using if constexpr instead #ifndef - // See: https://github.com/fmtlib/fmt/issues/1455 -#ifndef NDEBUG - if (!_chroms.contains(chrom)) { - throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); - } -#endif - const auto offset = at(chrom, 0).id(); - return {Reference{chrom}, _bin_size, offset}; + return std::visit([&](const auto &t) -> BinTable { return {t.subset(chrom)}; }, _table); } + inline BinTable BinTable::subset(std::string_view chrom_name) const { - return subset(_chroms.at(chrom_name)); + return subset(chromosomes().at(chrom_name)); } + inline BinTable BinTable::subset(std::uint32_t chrom_id) const { - return subset(_chroms.at(chrom_id)); + return subset(chromosomes().at(chrom_id)); } inline auto BinTable::find_overlap(const GenomicInterval &query) const -> std::pair { - return find_overlap(query.chrom(), query.start(), query.end()); + return std::visit( + [&](const auto &t) { + auto its = t.find_overlap(query); + DISABLE_WARNING_PUSH + DISABLE_WARNING_MAYBE_UNINITIALIZED + return std::make_pair(iterator{its.first}, iterator{its.second}); + DISABLE_WARNING_POP + }, + _table); } inline auto BinTable::find_overlap(const Chromosome &chrom, std::uint32_t start, std::uint32_t end) const -> std::pair { - assert(start < end); - - const auto bin1_id = at(chrom, start).id(); - const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); - - return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); + return find_overlap(GenomicInterval{chrom, start, end}); } + inline auto BinTable::find_overlap(std::string_view chrom_name, std::uint32_t start, std::uint32_t end) const -> std::pair { - return find_overlap(_chroms.at(chrom_name), start, end); + return find_overlap(chromosomes().at(chrom_name), start, end); } + inline auto BinTable::find_overlap(std::uint32_t chrom_id, std::uint32_t start, std::uint32_t end) const -> std::pair { - return find_overlap(_chroms.at(chrom_id), start, end); + return find_overlap(chromosomes().at(chrom_id), start, end); } +// Map bin_id to Bin inline Bin BinTable::at(std::uint64_t bin_id) const { - // I tried benchmarking linear search as well as std::set (including third-party implementations). - // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes - // (e.g. 5-100) and have fairly similar performanc. - // Linear search is however better in practice because chromosomes are usually sorted by (approx.) - // size, with unplaced scaffolds etc. ending up last. - auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), - [&](const auto n) { return n > bin_id; }); - - if (match == _num_bins_prefix_sum.end()) { - throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); - } - assert(match != _num_bins_prefix_sum.begin()); - - const auto chrom_id = - static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); - return at_hint(bin_id, _chroms[chrom_id]); -} - -inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { - const auto offset = _num_bins_prefix_sum[chrom.id()]; - const auto relative_bin_id = bin_id - offset; - const auto start = static_cast(relative_bin_id * bin_size()); - assert(start < chrom.size()); - const auto end = (std::min)(start + bin_size(), chrom.size()); - - return {bin_id, static_cast(relative_bin_id), chrom, start, end}; + return std::visit([&](const auto &t) { return t.at(bin_id); }, _table); } inline std::pair BinTable::at(const GenomicInterval &gi) const { - const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); - return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); + return std::visit([&](const auto &t) { return t.at(gi); }, _table); } + inline Bin BinTable::at(const Chromosome &chrom, std::uint32_t pos) const { - return at_hint(map_to_bin_id(chrom, pos), chrom); + return std::visit([&](const auto &t) { return t.at(chrom, pos); }, _table); } + inline Bin BinTable::at(std::string_view chrom_name, std::uint32_t pos) const { - return at(map_to_bin_id(chrom_name, pos)); + return std::visit([&](const auto &t) { return t.at(chrom_name, pos); }, _table); } + inline Bin BinTable::at(std::uint32_t chrom_id, std::uint32_t pos) const { - return at(map_to_bin_id(chrom_id, pos)); + return std::visit([&](const auto &t) { return t.at(chrom_id, pos); }, _table); } +inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + return std::visit([&](const auto &t) { return t.at_hint(bin_id, chrom); }, _table); +} + +// Map genomic coords to bin_id inline std::pair BinTable::map_to_bin_ids( const GenomicInterval &gi) const { - return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), - map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); + return std::visit([&](const auto &t) { return t.map_to_bin_ids(gi); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(const Chromosome &chrom, std::uint32_t pos) const { - // GCC8 fails to compile when using if constexpr instead #ifndef - // See: https://github.com/fmtlib/fmt/issues/1455 -#ifndef NDEBUG - if (!_chroms.contains(chrom)) { - throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); - } -#endif - - if (pos > chrom.size()) { - throw std::out_of_range(fmt::format( - FMT_STRING("position is greater than chromosome size: {} > {}"), pos, chrom.size())); - } - - const auto bin_offset = _num_bins_prefix_sum[chrom.id()] - _num_bins_prefix_sum.front(); - - return bin_offset + static_cast(pos / bin_size()); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom, pos); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(std::string_view chrom_name, std::uint32_t pos) const { - return map_to_bin_id(_chroms.at(chrom_name), pos); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom_name, pos); }, _table); } inline std::uint64_t BinTable::map_to_bin_id(std::uint32_t chrom_id, std::uint32_t pos) const { - return map_to_bin_id(_chroms.at(chrom_id), pos); + return std::visit([&](const auto &t) { return t.map_to_bin_id(chrom_id, pos); }, _table); } -inline std::vector BinTable::compute_num_bins_prefix_sum(const Reference &chroms, - std::uint32_t bin_size, - std::size_t bin_offset) { - assert(bin_size != 0); - - DISABLE_WARNING_PUSH - DISABLE_WARNING_NULL_DEREF - std::vector prefix_sum(chroms.size() + 1); - prefix_sum.front() = bin_offset; - DISABLE_WARNING_POP +inline bool BinTable::operator==(const BinTable &other) const { + return std::visit( + [&](const auto &t1) { + try { + using BinTableT = remove_cvref_t; + const auto &t2 = std::get(other._table); + return t1 == t2; + } catch (const std::bad_variant_access &) { + return false; + } + }, + _table); +} - // I am using transform instead of inclusive_scan because the latter is not always available - std::transform(chroms.begin(), chroms.end(), prefix_sum.begin() + 1, - [&, sum = bin_offset](const Chromosome &chrom) mutable { - if (chrom.is_all()) { - return sum; - } - const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; - return sum += static_cast(num_bins); - }); +inline bool BinTable::operator!=(const BinTable &other) const { return !(*this == other); }; - return prefix_sum; +template +constexpr const BinTableT &BinTable::get() const { + return std::get(get()); } -inline BinTable::iterator::iterator(const BinTable &bin_table) noexcept - : _bin_table{&bin_table}, _chrom_bin_id(_bin_table->_num_bins_prefix_sum.front()) { - if (_bin_table->chromosomes().at(_chrom_id).is_all()) { - _chrom_id++; - } +template +constexpr BinTableT &BinTable::get() { + return std::get(get()); } -constexpr bool BinTable::iterator::operator==(const iterator &other) const noexcept { - // clang-format off - return _bin_table == other._bin_table && - _chrom_id == other._chrom_id && - _rel_bin_id == other._rel_bin_id; - // clang-format on -} +constexpr auto BinTable::get() const noexcept -> const BinTableVar & { return _table; } -constexpr bool BinTable::iterator::operator!=(const iterator &other) const noexcept { - return !(*this == other); -} +constexpr auto BinTable::get() noexcept -> BinTableVar & { return _table; } + +template +inline BinTable::iterator::iterator(It it) noexcept : _it(it) {} -constexpr bool BinTable::iterator::operator<(const iterator &other) const noexcept { - return bin_id() < other.bin_id(); +constexpr bool BinTable::iterator::operator==(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 == it2; + }, + _it); } -constexpr bool BinTable::iterator::operator<=(const iterator &other) const noexcept { - return bin_id() <= other.bin_id(); +constexpr bool BinTable::iterator::operator!=(const iterator &other) const { + return !(*this == other); } -constexpr bool BinTable::iterator::operator>(const iterator &other) const noexcept { - return bin_id() > other.bin_id(); +constexpr bool BinTable::iterator::operator<(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 < it2; + }, + _it); } -constexpr bool BinTable::iterator::operator>=(const iterator &other) const noexcept { - return bin_id() >= other.bin_id(); +constexpr bool BinTable::iterator::operator<=(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 <= it2; + }, + _it); } -inline auto BinTable::iterator::make_end_iterator(const BinTable &table) noexcept -> iterator { - iterator it(table); +constexpr bool BinTable::iterator::operator>(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 > it2; + }, + _it); +} - it._chrom_id = nchrom; - it._rel_bin_id = null_rel_bin_id; - return it; +constexpr bool BinTable::iterator::operator>=(const iterator &other) const { + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 >= it2; + }, + _it); } inline auto BinTable::iterator::operator*() const -> value_type { - assert(_bin_table); - - const auto &chrom = chromosome(); - const auto bin_size = this->bin_size(); - - const auto start = std::min(_rel_bin_id * bin_size, chrom.size()); - const auto end = std::min(start + bin_size, chrom.size()); - - return value_type{bin_id(), _rel_bin_id, chrom, start, end}; + return std::visit([&](const auto &it) { return *it; }, _it); +} +inline auto BinTable::iterator::operator[](std::size_t i) const -> iterator { + std::visit([&](const auto &it) { std::ignore = it + i; }, _it); + return *this; } inline auto BinTable::iterator::operator++() -> iterator & { - assert(_bin_table); - if (_chrom_id == nchrom) { - return *this; - } - - if (++_rel_bin_id >= compute_num_chrom_bins()) { - if (_chrom_id + 1 >= num_chromosomes()) { - return *this = make_end_iterator(*_bin_table); - } - ++_chrom_id; - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = 0; - } - + std::visit([&](auto &it) { std::ignore = ++it; }, _it); return *this; } inline auto BinTable::iterator::operator++(int) -> iterator { - auto it = *this; - std::ignore = ++(*this); - return it; + auto old_it = *this; + std::visit([&](auto &it) { std::ignore = ++it; }, _it); + return old_it; } inline auto BinTable::iterator::operator+=(std::size_t i) -> iterator & { - assert(_bin_table); - if (_chrom_id == nchrom) { - if (i == 0) { - return *this; - } - throw std::out_of_range("BinTable::iterator: caught attempt to increment iterator past end()"); - } - - const auto ii = static_cast(i); - const auto num_bins = compute_num_chrom_bins(); - if (_rel_bin_id + ii < num_bins) { - _rel_bin_id += ii; - return *this; - } - - _chrom_id++; - _chrom_bin_id = compute_bin_offset(); - i -= (num_bins - _rel_bin_id); - _rel_bin_id = 0; - return *this += i; + std::visit([&](auto &it) { std::ignore = it += i; }, _it); + return *this; } inline auto BinTable::iterator::operator+(std::size_t i) const -> iterator { auto it = *this; - return it += i; + it += i; + return it; } inline auto BinTable::iterator::operator--() -> iterator & { - assert(_bin_table); - if (bin_id() == 0) { - return *this; - } - - if (_rel_bin_id == null_rel_bin_id) { - assert(*this == make_end_iterator(*_bin_table)); - _chrom_id = static_cast(num_chromosomes() - 1); - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins() - 1; - return *this; - } - - if (_rel_bin_id-- == 0) { - _chrom_id--; - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins() - 1; - } - + std::visit([&](auto &it) { std::ignore = --it; }, _it); return *this; } inline auto BinTable::iterator::operator--(int) -> iterator { - auto it = *this; - std::ignore = --(*this); - return it; + auto old_it = *this; + std::visit([&](auto &it) { std::ignore = --it; }, _it); + return old_it; } inline auto BinTable::iterator::operator-=(std::size_t i) -> iterator & { - assert(_bin_table); - - if (_chrom_id == 0 && _rel_bin_id == 0 && i == 0) { - return *this; - } - - if (_chrom_id == 0 && _rel_bin_id < i) { - throw std::out_of_range( - "BinTable::iterator: caught attempt to decrement iterator past begin()"); - } - - if (_rel_bin_id == null_rel_bin_id) { - assert(*this == make_end_iterator(*_bin_table)); - _chrom_id = static_cast(num_chromosomes() - 1); - _chrom_bin_id = compute_bin_offset(); - _rel_bin_id = compute_num_chrom_bins(); - return *this -= i; - } - - if (i <= _rel_bin_id) { - _rel_bin_id -= static_cast(i); - return *this; - } - - _chrom_id--; - _chrom_bin_id = compute_bin_offset(); - i -= _rel_bin_id; - _rel_bin_id = compute_num_chrom_bins(); - return *this -= i; + std::visit([&](auto &it) { std::ignore = it -= i; }, _it); + return *this; } inline auto BinTable::iterator::operator-(std::size_t i) const -> iterator { auto it = *this; - return it -= i; + it -= i; + return it; } inline auto BinTable::iterator::operator-(const iterator &other) const -> difference_type { - assert(_bin_table); - assert(other._bin_table); - - const auto offset1 = _chrom_id == nchrom ? _bin_table->size() : bin_id(); - const auto offset2 = other._chrom_id == nchrom ? other._bin_table->size() : other.bin_id(); - - return static_cast(offset1) - static_cast(offset2); + return std::visit( + [&](const auto &it1) { + const auto &it2 = std::get>(other._it); + return it1 - it2; + }, + _it); } -inline auto BinTable::iterator::operator[](std::size_t i) const -> iterator { return (*this + i); } - -inline const Chromosome &BinTable::iterator::chromosome() const { return chromosome(_chrom_id); } - -inline const Chromosome &BinTable::iterator::chromosome(std::uint32_t chrom_id) const { - return _bin_table->chromosomes().at(chrom_id); +template +constexpr const IteratorT &BinTable::iterator::get() const { + return std::get(get()); } -inline std::uint32_t BinTable::iterator::compute_num_chrom_bins() const noexcept { - assert(_bin_table); - - const auto chrom_size = chromosome().size(); - const auto bin_size = this->bin_size(); - - return (chrom_size + bin_size - 1) / bin_size; -} - -inline std::size_t BinTable::iterator::compute_bin_offset() const noexcept { - return _bin_table->at(_chrom_id, 0).id(); +template +constexpr IteratorT &BinTable::iterator::get() { + return std::get(get()); } -inline std::size_t BinTable::iterator::num_chromosomes() const noexcept { - assert(_bin_table); - - return _bin_table->num_chromosomes(); -} +constexpr auto BinTable::iterator::get() const noexcept -> const IteratorVar & { return _it; } +constexpr auto BinTable::iterator::get() noexcept -> IteratorVar & { return _it; } } // namespace hictk - -inline std::size_t std::hash::operator()(const hictk::Bin &b) const { - return hictk::internal::hash_combine(0, b.id(), b.interval()); -} diff --git a/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp new file mode 100644 index 00000000..8d3a70c9 --- /dev/null +++ b/src/libhictk/bin_table/include/hictk/impl/bin_table_variable_impl.hpp @@ -0,0 +1,523 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hictk/common.hpp" +#include "hictk/genomic_interval.hpp" +#include "hictk/suppress_warnings.hpp" + +namespace hictk { // NOLINT + +template +inline BinTableVariable::BinTableVariable(Reference chroms, const std::vector &start_pos, + const std::vector &end_pos, I bin_offset) + : _chroms(std::move(chroms)) { + assert(start_pos.size() == end_pos.size()); + _bin_end_prefix_sum[0] = bin_offset; + if (start_pos.empty()) { + return; + } + + validate_bin_coords(start_pos, end_pos); + + _bin_end_prefix_sum.reserve(start_pos.size() + 1); + _bin_end_prefix_sum.push_back(end_pos.front()); + + for (std::size_t i = 1; i < start_pos.size(); ++i) { + if (start_pos[i] <= start_pos[i - 1]) { + _num_bins_prefix_sum.push_back(_bin_end_prefix_sum.size() - 1); + } + _bin_end_prefix_sum.push_back(_bin_end_prefix_sum.back() + end_pos[i] - start_pos[i]); + } + _num_bins_prefix_sum.push_back(_bin_end_prefix_sum.size() - 1); + if (_num_bins_prefix_sum.size() != _chroms.size() + 1) { + throw std::runtime_error( + fmt::format(FMT_STRING("Bin table contains an unexpected number of chromosomes: expected " + "{} chromosomes, found {}."), + _chroms.size(), _num_bins_prefix_sum.size() - 1)); + } +} + +template +inline std::size_t BinTableVariable::size() const noexcept { + return _bin_end_prefix_sum.size() - 1; +} + +template +inline bool BinTableVariable::empty() const noexcept { + return size() == 0; +} + +template +inline std::size_t BinTableVariable::num_chromosomes() const { + return chromosomes().size(); +} + +template +constexpr const Reference &BinTableVariable::chromosomes() const noexcept { + return _chroms; +} + +template +constexpr const std::vector &BinTableVariable::num_bin_prefix_sum() + const noexcept { + return _num_bins_prefix_sum; +} + +template +inline auto BinTableVariable::begin() const -> iterator { + volatile auto chrom = *chromosomes().begin(); + return iterator(*this); +} + +template +inline auto BinTableVariable::end() const -> iterator { + return iterator::make_end_iterator(*this); +} + +template +inline auto BinTableVariable::cbegin() const -> iterator { + return begin(); +} + +template +inline auto BinTableVariable::cend() const -> iterator { + return end(); +} + +template +inline bool BinTableVariable::operator==(const BinTableVariable &other) const { + return _chroms == other._chroms && + std::equal(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + other._num_bins_prefix_sum.begin()); +} + +template +inline bool BinTableVariable::operator!=(const BinTableVariable &other) const { + return !(*this == other); +} + +template +inline BinTableVariable BinTableVariable::subset(const Chromosome &chrom) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + std::vector start_pos{}; + std::vector end_pos{}; + + const auto [first_bin, last_bin] = find_overlap(chrom, 0, chrom.size()); + std::for_each(first_bin, last_bin, [&](const Bin &b) { + start_pos.push_back(b.start()); + end_pos.push_back(b.end()); + }); + + return {Reference{chrom}, start_pos, end_pos, start_pos.front()}; +} + +template +inline BinTableVariable BinTableVariable::subset(std::string_view chrom_name) const { + return subset(_chroms.at(chrom_name)); +} + +template +inline BinTableVariable BinTableVariable::subset(std::uint32_t chrom_id) const { + return subset(_chroms.at(chrom_id)); +} + +template +inline auto BinTableVariable::find_overlap(const GenomicInterval &query) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(query.chrom(), query.start(), query.end()); +} + +template +inline auto BinTableVariable::find_overlap(const Chromosome &chrom, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + assert(start < end); + + const auto bin1_id = at(chrom, start).id(); + const auto bin2_id = at(chrom, end - (std::min)(end, 1U)).id(); + + return std::make_pair(begin() + bin1_id, begin() + bin2_id + 1); +} + +template +inline auto BinTableVariable::find_overlap(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(_chroms.at(chrom_name), start, end); +} + +template +inline auto BinTableVariable::find_overlap(std::uint32_t chrom_id, std::uint32_t start, + std::uint32_t end) const + -> std::pair::iterator, BinTableVariable::iterator> { + return find_overlap(_chroms.at(chrom_id), start, end); +} + +template +Bin BinTableVariable::at(std::uint64_t bin_id) const { + // I tried benchmarking linear search as well as std::set (including third-party implementations). + // Binary search and find on flat vectors are always faster for a reasonable number of chromosomes + // (e.g. 5-100) and have fairly similar performance. + // Linear search is however better in practice because chromosomes are usually sorted by (approx.) + // size, with unplaced scaffolds etc. ending up last. + auto match = std::find_if(_num_bins_prefix_sum.begin(), _num_bins_prefix_sum.end(), + [&](const auto n) { return n > bin_id; }); + + if (match == _num_bins_prefix_sum.end()) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + assert(match != _num_bins_prefix_sum.begin()); + + const auto chrom_id = + static_cast(std::distance(_num_bins_prefix_sum.begin(), --match)); + return at_hint(bin_id, _chroms[chrom_id]); +} + +template +inline Bin BinTableVariable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) const { + if (_bin_end_prefix_sum.size() <= bin_id) { + throw std::out_of_range(fmt::format(FMT_STRING("bin id {} not found: out of range"), bin_id)); + } + + const auto bin_id_offset = _num_bins_prefix_sum[chrom.id()]; + const auto chrom_size_offset = _chroms.chrom_size_prefix_sum()[chrom.id()]; + const auto relative_bin_id = bin_id - bin_id_offset; + + const auto start = + conditional_static_cast(_bin_end_prefix_sum[bin_id] - chrom_size_offset); + const auto end = + conditional_static_cast(_bin_end_prefix_sum[bin_id + 1] - chrom_size_offset); + + if (_bin_end_prefix_sum[bin_id] < chrom_size_offset || end > chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("bin id {} not found using {} as hint: out of range"), bin_id, chrom.name())); + } + + return {bin_id, static_cast(relative_bin_id), chrom, start, end}; +} + +template +inline std::pair BinTableVariable::at(const GenomicInterval &gi) const { + const auto [bin1_id, bin2_id] = map_to_bin_ids(gi); + return std::make_pair(at_hint(bin1_id, gi.chrom()), at_hint(bin2_id, gi.chrom())); +} + +template +inline Bin BinTableVariable::at(const Chromosome &chrom, std::uint32_t pos) const { + return at_hint(map_to_bin_id(chrom, pos), chrom); +} + +template +inline Bin BinTableVariable::at(std::string_view chrom_name, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_name, pos)); +} + +template +inline Bin BinTableVariable::at(std::uint32_t chrom_id, std::uint32_t pos) const { + return at(map_to_bin_id(chrom_id, pos)); +} + +template +inline std::pair BinTableVariable::map_to_bin_ids( + const GenomicInterval &gi) const { + return std::make_pair(map_to_bin_id(gi.chrom(), gi.start()), + map_to_bin_id(gi.chrom(), gi.end() - (std::min)(gi.end(), 1U))); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(const Chromosome &chrom, + std::uint32_t pos) const { + // GCC8 fails to compile when using if constexpr instead #ifndef + // See: https://github.com/fmtlib/fmt/issues/1455 +#ifndef NDEBUG + if (!_chroms.contains(chrom)) { + throw std::out_of_range(fmt::format(FMT_STRING("chromosome \"{}\" not found"), chrom.name())); + } +#endif + + if (pos >= chrom.size()) { + throw std::out_of_range(fmt::format( + FMT_STRING("position is greater than chromosome size: {} >= {}"), pos, chrom.size())); + } + + const auto pos_offset = _chroms.chrom_size_prefix_sum()[chrom.id()]; + auto match = + std::upper_bound(_bin_end_prefix_sum.begin(), _bin_end_prefix_sum.end(), pos + pos_offset); + + return static_cast(std::distance(_bin_end_prefix_sum.begin(), --match)); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(std::string_view chrom_name, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_name), pos); +} + +template +inline std::uint64_t BinTableVariable::map_to_bin_id(std::uint32_t chrom_id, + std::uint32_t pos) const { + return map_to_bin_id(_chroms.at(chrom_id), pos); +} + +template +inline void BinTableVariable::validate_bin_coords(const std::vector &start_pos, + const std::vector &end_pos) { + if (start_pos.front() != 0) { + throw std::runtime_error("Bin table does not start from zero"); + } + + for (std::size_t i = 1; i < start_pos.size(); ++i) { + const auto s1 = start_pos[i - 1]; + const auto s2 = start_pos[i]; + const auto e1 = end_pos[i - 1]; + const auto e2 = end_pos[i]; + + if (s1 >= e1) { + throw std::runtime_error(fmt::format( + FMT_STRING("Bin #{} is not valid: start_pos >= end_pos: {} >= {}"), i, s1, e1)); + } + + if (s1 >= s2 && s2 != 0) { + throw std::runtime_error(fmt::format(FMT_STRING("Bin table is not sorted in ascending order: " + "bin #{} >= bin #{} (start_pos {} >= {})"), + i - 1, i, s1, s2)); + } + + if (e1 >= e2 && s2 != 0) { + throw std::runtime_error(fmt::format(FMT_STRING("Bin table is not sorted in ascending order: " + "bin #{} >= bin #{} (end_pos {} >= {})"), + i - 1, i, e1, e2)); + } + + if (e1 != s2 && s2 != 0) { + throw std::runtime_error( + fmt::format(FMT_STRING("Detected a gap between bins #{} and #{}."), i - 1, i)); + } + } + if (start_pos.back() >= end_pos.back()) { + throw std::runtime_error( + fmt::format(FMT_STRING("Bin #{} is not valid: start_pos >= end_pos: {} >= {}"), + start_pos.size() - 1, start_pos.back(), end_pos.back())); + } +} + +template +inline BinTableVariable::iterator::iterator(const BinTableVariable &bin_table) noexcept + : _bin_table{&bin_table} { + if (_bin_table->chromosomes().at(_chrom_id).is_all()) { + _chrom_id++; + } + _value = get_bin(); +} + +template +constexpr bool BinTableVariable::iterator::operator==(const iterator &other) const noexcept { + // clang-format off + return _bin_table == other._bin_table && + _chrom_id == other._chrom_id && + _bin_id == other._bin_id; + // clang-format on +} + +template +constexpr bool BinTableVariable::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +constexpr bool BinTableVariable::iterator::operator<(const iterator &other) const noexcept { + return _bin_id < other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator<=(const iterator &other) const noexcept { + return _bin_id <= other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator>(const iterator &other) const noexcept { + return _bin_id > other._bin_id; +} + +template +constexpr bool BinTableVariable::iterator::operator>=(const iterator &other) const noexcept { + return _bin_id >= other._bin_id; +} + +template +inline auto BinTableVariable::iterator::make_end_iterator(const BinTableVariable &table) noexcept + -> iterator { + iterator it(table); + + it._chrom_id = nchrom; + it._bin_id = table.size(); + return it; +} + +template +inline auto BinTableVariable::iterator::operator*() const noexcept -> value_type { + return _value; +} + +template +inline auto BinTableVariable::iterator::operator++() -> iterator & { + assert(_bin_table); + if (++_bin_id == _bin_table->size()) { + *this = make_end_iterator(*_bin_table); + return *this; + } + try { + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + + return *this; +} + +template +inline auto BinTableVariable::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + +template +inline auto BinTableVariable::iterator::operator+=(std::size_t i) -> iterator & { + if (i == 0) { + return *this; + } + assert(_bin_table); + try { + if (_bin_id > _bin_table->size() - i) { + throw std::out_of_range(""); + } + + _bin_id += i; + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to increment iterator past end()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator+(std::size_t i) const -> iterator { + auto it = *this; + return it += i; +} + +template +inline auto BinTableVariable::iterator::operator--() -> iterator & { + assert(_bin_table); + --_bin_id; + + try { + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator--(int) -> iterator { + auto it = *this; + std::ignore = --(*this); + return it; +} + +template +inline auto BinTableVariable::iterator::operator-=(std::size_t i) -> iterator & { + if (i == 0) { + return *this; + } + assert(_bin_table); + try { + if (_bin_id < _bin_table->size() - i) { + throw std::out_of_range(""); + } + _bin_id -= i; + _value = get_bin(); + _chrom_id = _value.chrom().id(); + } catch (const std::out_of_range &) { + throw std::out_of_range( + "BinTableVariable::iterator: caught attempt to decrement iterator past begin()"); + } + return *this; +} + +template +inline auto BinTableVariable::iterator::operator-(std::size_t i) const -> iterator { + auto it = *this; + return it -= i; +} + +template +inline auto BinTableVariable::iterator::operator-(const iterator &other) const + -> difference_type { + assert(_bin_table); + assert(other._bin_table); + + const auto offset1 = _chrom_id == nchrom ? _bin_table->size() : _bin_id; + const auto offset2 = other._chrom_id == nchrom ? other._bin_table->size() : other._bin_id; + + return static_cast(offset1) - static_cast(offset2); +} + +template +inline auto BinTableVariable::iterator::operator[](std::size_t i) const -> iterator { + return (*this + i); +} + +template +inline const Chromosome &BinTableVariable::iterator::chromosome() const { + return chromosome(_chrom_id); +} + +template +inline const Chromosome &BinTableVariable::iterator::chromosome(std::uint32_t chrom_id) const { + return _bin_table->chromosomes().at(chrom_id); +} + +template +inline Bin BinTableVariable::iterator::get_bin() const { + if (_bin_id == _bin_table->size()) { + return {}; + } + try { + const auto &chrom = chromosome(); + return _bin_table->at_hint(_bin_id, chrom); + } catch (const std::out_of_range &) { + return _bin_table->at(_bin_id); + } +} + +} // namespace hictk diff --git a/src/libhictk/common/include/hictk/suppress_warnings.hpp b/src/libhictk/common/include/hictk/suppress_warnings.hpp index 71bc934e..b589a760 100644 --- a/src/libhictk/common/include/hictk/suppress_warnings.hpp +++ b/src/libhictk/common/include/hictk/suppress_warnings.hpp @@ -16,6 +16,7 @@ #define DISABLE_WARNING(warningNumber) __pragma(warning(disable : warningNumber)) #define DISABLE_WARNING_DEPRECATED_DECLARATIONS + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_NULL_DEREF #define DISABLE_WARNING_USELESS_CAST #define DISABLE_WARNING_SIGN_COMPARE @@ -37,11 +38,13 @@ // Defines for GCC only #if defined(__GNUC__) && !defined(__clang__) - #define DISABLE_WARNING_USELESS_CAST DISABLE_WARNING("-Wuseless-cast") // NOLINT(cppcoreguidelines-macro-usage) + #define DISABLE_WARNING_MAYBE_UNINITIALIZED DISABLE_WARNING("-Wmaybe-uninitialized") // NOLINT(cppcoreguidelines-macro-usage) + #define DISABLE_WARNING_USELESS_CAST DISABLE_WARNING("-Wuseless-cast") // NOLINT(cppcoreguidelines-macro-usage) #endif // Defines for Clang only #ifdef __clang__ + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_USELESS_CAST #endif @@ -52,6 +55,7 @@ #define DISABLE_WARNING_POP #define DISABLE_WARNING_DEPRECATED_DECLARATIONS + #define DISABLE_WARNING_MAYBE_UNINITIALIZED #define DISABLE_WARNING_NULL_DEREF #define DISABLE_WARNING_USELESS_CAST #define DISABLE_WARNING_SIGN_COMPARE diff --git a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp index 78d7f3d7..258d0d3c 100644 --- a/src/libhictk/cooler/include/hictk/cooler/cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/cooler.hpp @@ -98,13 +98,14 @@ class File { mutable std::shared_ptr _index{}; bool _finalize{false}; - // Constructors are private. Cooler files are opened using factory methods + // Private ctors File(RootGroup entrypoint, unsigned int mode, std::size_t cache_size_bytes, double w0, bool validate); template - File(RootGroup entrypoint, Reference chroms, PixelT pixel, Attributes attributes, + File(RootGroup entrypoint, BinTable bins, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0); + // Ctor for SingleCellCooler template File(RootGroup entrypoint, PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, @@ -135,6 +136,12 @@ class File { Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + template + [[nodiscard]] static File create(std::string_view uri, BinTable bins, + bool overwrite_if_exists = false, + Attributes attributes = Attributes::init(0), + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + [[nodiscard]] static File open_random_access( RootGroup entrypoint, std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE, bool validate = true); @@ -147,6 +154,11 @@ class File { Attributes attributes = Attributes::init(0), std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + template + [[nodiscard]] static File create(RootGroup entrypoint, BinTable bins, + Attributes attributes = Attributes::init(0), + std::size_t cache_size_bytes = DEFAULT_HDF5_CACHE_SIZE * 4); + ~File() noexcept; File &operator=(const File &other) = delete; @@ -304,6 +316,9 @@ class File { [[nodiscard]] static auto import_chroms(const Dataset &chrom_names, const Dataset &chrom_sizes, bool missing_ok) -> Reference; + [[nodiscard]] static BinTable init_bin_table(const DatasetMap &dsets, std::string_view bin_type, + std::uint32_t bin_size); + [[nodiscard]] static Index init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, std::shared_ptr bin_table, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index 8e647b79..5f51b4f3 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -41,8 +41,7 @@ inline File::File(RootGroup entrypoint, unsigned int mode, std::size_t cache_siz _attrs(read_standard_attributes(_root_group)), _pixel_variant(detect_pixel_type(_root_group)), _bins(std::make_shared( - import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), - bin_size())), + init_bin_table(_datasets, _attrs.bin_type.value(), _attrs.bin_size))), _index(std::make_shared(init_index(_datasets.at("indexes/chrom_offset"), _datasets.at("indexes/bin1_offset"), _bins, _datasets.at("pixels/count").size(), false))) { @@ -53,18 +52,17 @@ inline File::File(RootGroup entrypoint, unsigned int mode, std::size_t cache_siz } template -inline File::File(RootGroup entrypoint, Reference chroms, [[maybe_unused]] PixelT pixel, +inline File::File(RootGroup entrypoint, BinTable bins, [[maybe_unused]] PixelT pixel, Attributes attributes, std::size_t cache_size_bytes, double w0) : _mode(HighFive::File::ReadWrite), _root_group(std::move(entrypoint)), _groups(create_groups(_root_group)), - _datasets(create_datasets(_root_group, chroms, cache_size_bytes, w0)), + _datasets(create_datasets(_root_group, bins.chromosomes(), cache_size_bytes, w0)), _attrs(std::move(attributes)), _pixel_variant(PixelT(0)), - _bins(std::make_shared(std::move(chroms), bin_size())), + _bins(std::make_shared(std::move(bins))), _index(std::make_shared(_bins)), _finalize(true) { - assert(bin_size() != 0); assert(!_bins->empty()); assert(!chromosomes().empty()); assert(!_index->empty()); @@ -86,12 +84,10 @@ inline File::File(RootGroup entrypoint, [[maybe_unused]] PixelT pixel, Attribute _groups = open_groups(_root_group); _datasets = open_datasets(_root_group, cache_size_bytes, w0); - _bins = std::make_shared( - import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), bin_size()); + _bins = std::make_shared(init_bin_table(_datasets, *_attrs.bin_type, _attrs.bin_size)); _index = std::make_shared(_bins); assert(std::holds_alternative(_pixel_variant)); - assert(bin_size() != 0); assert(!_bins->empty()); assert(!chromosomes().empty()); assert(!_index->empty()); @@ -117,11 +113,17 @@ inline File File::open_read_once(std::string_view uri, std::size_t cache_size_by return File(open_or_create_root_group(open_file(uri, HighFive::File::ReadOnly, validate), uri), HighFive::File::ReadOnly, cache_size_bytes, 1.0, validate); } - template inline File File::create(std::string_view uri, const Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists, Attributes attributes, std::size_t cache_size_bytes) { + return File::create(uri, BinTable(chroms, bin_size), overwrite_if_exists, attributes, + cache_size_bytes); +} + +template +inline File File::create(std::string_view uri, BinTable bins, bool overwrite_if_exists, + Attributes attributes, std::size_t cache_size_bytes) { try { const auto [file_path, root_path] = parse_cooler_uri(uri); const auto uri_is_file_path = root_path.empty() || root_path == "/"; @@ -162,8 +164,8 @@ inline File File::create(std::string_view uri, const Reference &chroms, std::uin } return create( - open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, false), uri), chroms, - bin_size, attributes, cache_size_bytes); + open_or_create_root_group(open_file(uri, HighFive::File::ReadWrite, false), uri), bins, + attributes, cache_size_bytes); } catch (const std::exception &e) { throw std::runtime_error( fmt::format(FMT_STRING("Cannot create cooler at the following URI: \"{}\". Reason: {}"), @@ -184,18 +186,26 @@ inline File File::open_read_once(RootGroup entrypoint, std::size_t cache_size_by template inline File File::create(RootGroup entrypoint, const Reference &chroms, std::uint32_t bin_size, Attributes attributes, std::size_t cache_size_bytes) { + return File::create(entrypoint, BinTable(chroms, bin_size), attributes, cache_size_bytes); +} + +template +inline File File::create(RootGroup entrypoint, BinTable bins, Attributes attributes, + std::size_t cache_size_bytes) { static_assert(std::is_arithmetic_v); - if (bin_size == 0) { - throw std::logic_error("bin_size cannot be zero."); + if (std::holds_alternative>(bins.get())) { + attributes.bin_type = "variable"; + attributes.bin_size = 0; + } else { + attributes.bin_type = "fixed"; + attributes.bin_size = bins.bin_size(); } - attributes.bin_size = bin_size; + try { if (utils::is_cooler(entrypoint())) { throw std::runtime_error("URI points to an already existing cooler."); } - // At this point the parent file is guaranteed to exist, so we can always open it in ReadWrite - // mode - return File(entrypoint, chroms, PixelT(0), attributes, cache_size_bytes, true); + return File(entrypoint, bins, PixelT(0), attributes, cache_size_bytes, true); } catch (const std::exception &e) { throw std::runtime_error( diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index e27e2d2a..9c837e41 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -467,12 +467,14 @@ inline auto File::read_standard_attributes(const RootGroup &root_grp, bool initi // Read mandatory attributes // We read format-version first because some attributes are mandatory only for cooler v3 read_or_throw("format-version", attrs.format_version); - read_or_throw("bin-size", attrs.bin_size); read_or_throw("format", attrs.format); // Read mandatory attributes for Cooler v3 auto missing_ok = attrs.format_version < 3; internal::read_optional(root_grp, "bin-type", attrs.bin_type, missing_ok); + if (attrs.bin_type.value() == "fixed") { + read_or_throw("bin-size", attrs.bin_size); + } internal::read_optional(root_grp, "storage-mode", attrs.storage_mode, missing_ok); // Try to read reserved attributes @@ -526,6 +528,19 @@ inline auto File::import_chroms(const Dataset &chrom_names, const Dataset &chrom } } +inline BinTable File::init_bin_table(const DatasetMap &dsets, std::string_view bin_type, + std::uint32_t bin_size) { + auto chroms = import_chroms(dsets.at("chroms/name"), dsets.at("chroms/length"), false); + if (bin_type == "fixed") { + return {std::move(chroms), bin_size}; + } + assert(bin_type == "variable"); + assert(bin_size == 0); + + return {std::move(chroms), dsets.at("bins/start").read_all>(), + dsets.at("bins/end").read_all>()}; +} + inline Index File::init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, std::shared_ptr bin_table, std::uint64_t expected_nnz, bool missing_ok) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp index da286e9e..fefce8b7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_validation_impl.hpp @@ -25,7 +25,7 @@ namespace hictk::cooler { inline void File::validate_bins(bool full) const { try { - assert(_attrs.bin_type == "fixed"); + assert(_attrs.bin_type == "fixed" || _attrs.bin_type == "variable"); auto nchroms = dataset("bins/chrom").size(); auto nstarts = dataset("bins/start").size(); auto nends = dataset("bins/end").size(); @@ -63,7 +63,7 @@ inline void File::validate_bins(bool full) const { if (chromosomes().at(*chrom_it).name() != bin.chrom().name() || *start_it != bin.start() || *end_it != bin.end()) { throw std::runtime_error( - fmt::format(FMT_STRING("GenomicInterval #{}: expected {}:{}-{}, found {:ucsc}"), i, + fmt::format(FMT_STRING("Bin #{}: expected {}:{}-{}, found {:ucsc}"), i, chromosomes().at(*chrom_it).name(), *start_it, *end_it, bin)); } ++chrom_it; @@ -75,8 +75,8 @@ inline void File::validate_bins(bool full) const { } catch (const HighFive::Exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("GenomicInterval table at URI {}/{} is invalid or corrupted: {}"), - uri(), group("bins")().getPath(), e.what())); + fmt::format(FMT_STRING("Bin table at URI {}/{} is invalid or corrupted: {}"), uri(), + group("bins")().getPath(), e.what())); } } diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp index b43765ff..b8645683 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_write_impl.hpp @@ -262,12 +262,17 @@ inline auto File::create_datasets(RootGroup &root_grp, const Reference &chroms, inline void File::write_standard_attributes(RootGroup &root_grp, const Attributes &attributes, bool skip_sentinel_attr) { - assert(attributes.bin_size != 0); [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT if (attributes.assembly) { Attribute::write(root_grp(), "assembly", *attributes.assembly); } - Attribute::write(root_grp(), "bin-size", attributes.bin_size); + if (attributes.bin_size == 0) { + assert(attributes.bin_type.value() == "variable"); + Attribute::write(root_grp(), "bin-size", "null"); + } else { + assert(attributes.bin_type.value() == "fixed"); + Attribute::write(root_grp(), "bin-size", attributes.bin_size); + } Attribute::write(root_grp(), "bin-type", *attributes.bin_type); // NOLINT Attribute::write(root_grp(), "creation-date", *attributes.creation_date); // NOLINT Attribute::write(root_grp(), "format", std::string{COOL_MAGIC}); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp index 1df72d1c..bbbecc25 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp @@ -24,10 +24,8 @@ inline Index::Index(std::shared_ptr bins, const std::vector &chrom_offsets, std::uint64_t nnz, bool allocate) : _bins(std::move(bins)), - _idx(Index::init(_bins->chromosomes(), chrom_offsets, _bins->bin_size(), allocate)), - _nnz(nnz) { - assert(bin_size() != 0); -} + _idx(Index::init(_bins->chromosomes(), *_bins, chrom_offsets, allocate)), + _nnz(nnz) {} inline const Reference &Index::chromosomes() const noexcept { assert(_bins); @@ -120,7 +118,8 @@ inline std::uint64_t Index::get_offset_by_row_idx(std::uint32_t chrom_id, } inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { - const auto expected_size = (chrom.size() + bin_size() - 1) / bin_size(); + const auto [fist_bin, last_bin] = _bins->find_overlap(chrom, 0, chrom.size()); + const auto expected_size = static_cast(std::distance(fist_bin, last_bin)); if (offsets.size() != expected_size) { throw std::runtime_error( fmt::format(FMT_STRING("expected index for {} to have size {}, found {}"), chrom, @@ -129,9 +128,13 @@ inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { _idx.at(chrom) = std::move(offsets); } +inline void Index::set_offset_by_bin(const Bin &bin, std::uint64_t offset) { + set_offset_by_row_idx(bin.chrom().id(), conditional_static_cast(bin.rel_id()), + offset); +} + inline void Index::set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset) { - const auto &bin = _bins->at(bin_id); - set_offset_by_pos(bin.chrom(), bin.start(), offset); + set_offset_by_bin(_bins->at(bin_id), offset); } inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, @@ -141,8 +144,7 @@ inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, inline void Index::set_offset_by_pos(std::uint32_t chrom_id, std::uint32_t pos, std::uint64_t offset) { - const auto row_idx = pos / bin_size(); - set_offset_by_row_idx(chrom_id, row_idx, offset); + set_offset_by_bin(_bins->at(chrom_id, pos), offset); } inline void Index::set_offset_by_row_idx(std::uint32_t chrom_id, std::size_t row_idx, @@ -201,15 +203,15 @@ inline void Index::compute_chrom_offsets(std::vector &buff) const }); } -inline auto Index::init(const Reference &chroms, const std::vector &chrom_offsets, - std::uint32_t bin_size, bool allocate) -> MapT { +inline auto Index::init(const Reference &chroms, const BinTable &bins, + const std::vector &chrom_offsets, bool allocate) -> MapT { assert(!chroms.empty()); - assert(bin_size != 0); assert(chrom_offsets.empty() || chroms.size() + 1 == chrom_offsets.size()); MapT idx{}; for (std::uint32_t i = 0; i < chroms.size(); ++i) { const auto &chrom = chroms.at(i); - const auto num_bins = (chrom.size() + bin_size - 1) / bin_size; + const auto [first_bin, last_bin] = bins.find_overlap(chrom, 0, chrom.size()); + const auto num_bins = static_cast(std::distance(first_bin, last_bin)); auto node = idx.emplace(chrom, OffsetVect(allocate ? num_bins : 1, Index::offset_not_set_value)); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 7d116960..19190c93 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -25,6 +25,11 @@ namespace hictk::cooler { inline SingleCellAttributes SingleCellAttributes::init(std::uint32_t bin_size_) { SingleCellAttributes attrs{}; attrs.bin_size = bin_size_; + if (bin_size_ == 0) { + attrs.bin_type = "variable"; + } else { + attrs.bin_type = "fixed"; + } return attrs; } @@ -66,13 +71,19 @@ inline SingleCellFile::SingleCellFile(HighFive::File fp, BinTable bins, SingleCe _bins(std::make_shared(std::move(bins))) {} inline SingleCellFile::SingleCellFile(const std::filesystem::path& path, unsigned int mode) - : SingleCellFile(HighFive::File(path.string(), mode), read_bins(HighFive::File(path.string())), + : SingleCellFile(HighFive::File(path.string(), mode), + init_bin_table(HighFive::File(path.string())), read_standard_attributes(HighFive::File(path.string()), mode != HighFive::File::ReadOnly)) {} inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, bool force_overwrite) { + return SingleCellFile::create(path, {BinTableFixed{chroms, bin_size}}, force_overwrite); +} + +inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, BinTable bins, + bool force_overwrite) { if (!force_overwrite && std::filesystem::exists(path)) { throw std::runtime_error( fmt::format(FMT_STRING("unable to initialize file \"{}\": file already exists"), path)); @@ -82,12 +93,10 @@ inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, std::filesystem::remove(path); } - const BinTable bins(chroms, bin_size); - HighFive::File fp(path.string(), HighFive::File::Create); RootGroup root_grp{fp.getGroup("/")}; - auto attrs = SingleCellAttributes::init(bin_size); + auto attrs = SingleCellAttributes::init(bins.bin_size()); create_groups(root_grp); create_datasets(root_grp, bins); @@ -188,8 +197,7 @@ inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_ex tails.emplace_back(std::move(last)); } }); - utils::merge(heads, tails, chromosomes(), bins().bin_size(), uri, overwrite_if_exists, chunk_size, - update_frequency); + utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency); return File(uri); } @@ -240,15 +248,23 @@ SingleCellFile::read_standard_attributes(const HighFive::File& f, bool initializ } DISABLE_WARNING_POP -inline BinTable SingleCellFile::read_bins(const HighFive::File& f) { +inline BinTable SingleCellFile::init_bin_table(const HighFive::File& f) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT const RootGroup root_grp{f.getGroup("/")}; - const auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), - Dataset(root_grp, f.getDataSet("/chroms/length")), false); - const auto bin_size = Attribute::read(root_grp(), "bin-size"); + auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), + Dataset(root_grp, f.getDataSet("/chroms/length")), false); + const auto bin_type = Attribute::read(root_grp(), "bin-type"); + if (bin_type == "fixed") { + const auto bin_size = Attribute::read(root_grp(), "bin-size"); - return {chroms, bin_size}; -} + return {std::move(chroms), bin_size}; + } + + assert(bin_type == "variable"); + return {std::move(chroms), + Dataset(root_grp, f.getDataSet("bins/start")).read_all>(), + Dataset(root_grp, f.getDataSet("bins/end")).read_all>()}; +} // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) inline phmap::btree_set SingleCellFile::read_cells(const HighFive::File& f) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp index 489e81af..c02b4156 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp @@ -39,13 +39,21 @@ template inline void validate_bin_size(const std::vector>& coolers) { assert(coolers.size() > 1); const auto& clr1 = coolers.front(); + const auto bin_table1 = cooler::File(clr1.uri).bins(); for (std::size_t i = 1; i < coolers.size(); ++i) { const auto& clr2 = coolers[i]; - if (clr1.bin_size != clr2.bin_size) { + + if (clr1.bin_size == 0 || clr2.bin_size == 0) { // Table(s) with variable bin size + const auto bin_table2 = cooler::File(clr2.uri).bins(); + if (bin_table1 != bin_table2) { + throw std::runtime_error(fmt::format( + FMT_STRING("cooler \"{}\" and \"{}\" have different bin tables"), clr1.uri, clr2.uri)); + } + } else if (clr1.bin_size != clr2.bin_size) { throw std::runtime_error(fmt::format( FMT_STRING( - "cooler \"{}\" and \"{}\" have different resolutions ({} and {} respectively)"), + "cooler \"{}\" and \"{}\" have different resolutions ({} and {} respectively)"), clr1.uri, clr2.uri, clr1.bin_size, clr2.bin_size)); } } @@ -96,11 +104,8 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o } } - const cooler::File clr(clrs.front().uri); - const auto chroms = clr.chromosomes(); - const auto bin_size = clr.bin_size(); - merge(heads, tails, chroms, bin_size, dest_uri, overwrite_if_exists, chunk_size, - update_frequency); + merge(heads, tails, cooler::File(clrs.front().uri).bins(), dest_uri, overwrite_if_exists, + chunk_size, update_frequency); } catch (const std::exception& e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), std::distance(first_uri, last_uri), e.what())); @@ -109,15 +114,15 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o template inline void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists, std::size_t chunk_size, std::size_t update_frequency) { + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists, + std::size_t chunk_size, std::size_t update_frequency) { using N = remove_cvref_tcount)>; hictk::transformers::PixelMerger merger{heads, tails}; std::vector> buffer(chunk_size); buffer.clear(); - auto dest = File::create(dest_uri, chromosomes, bin_size, overwrite_if_exists); + auto dest = File::create(dest_uri, bins, overwrite_if_exists); std::size_t pixels_processed{}; auto t0 = std::chrono::steady_clock::now(); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp index d0a313b8..f0233ec6 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/validation_impl.hpp @@ -121,10 +121,10 @@ inline ValidationStatusCooler is_cooler(const HighFive::Group &root_group) { status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support if (Attribute::exists(root_group, "bin-type")) { const auto bin_type = Attribute::read(root_group, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } // Check file has the mandatory groups @@ -177,12 +177,12 @@ inline ValidationStatusMultiresCooler is_multires_file(const HighFive::File &fp, status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support // NOTE: .mcool files are not required to advertise the bin type they are using at the root level status.missing_or_invalid_bin_type_attr = false; if (Attribute::exists(fp, "bin-type")) { const auto bin_type = Attribute::read(fp, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } // Try to read resolutions from the Cooler's root @@ -261,12 +261,12 @@ inline ValidationStatusScool is_scool_file(const HighFive::File &fp, bool valida status.missing_or_invalid_format_attr |= version == 0 || version > 3; } - // Check file has a bin-type that we support (currently only "fixed" is supported) + // Check file has a bin-type that we support // NOTE: .scool files are not required to advertise the bin type they are using at the root level status.missing_or_invalid_bin_type_attr = false; if (Attribute::exists(fp, "bin-type")) { const auto bin_type = Attribute::read(fp, "bin-type"); - status.missing_or_invalid_bin_type_attr = bin_type != "fixed"; + status.missing_or_invalid_bin_type_attr = bin_type != "fixed" && bin_type != "variable"; } constexpr std::array scool_root_groups{"chroms", "bins", "cells"}; diff --git a/src/libhictk/cooler/include/hictk/cooler/index.hpp b/src/libhictk/cooler/include/hictk/cooler/index.hpp index 5ea32299..2a0e7597 100644 --- a/src/libhictk/cooler/include/hictk/cooler/index.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/index.hpp @@ -14,6 +14,7 @@ #include #include +#include "hictk/bin.hpp" #include "hictk/chromosome.hpp" namespace hictk { @@ -84,6 +85,7 @@ class Index { std::size_t row_idx) const; void set(const Chromosome& chrom, OffsetVect offsets); + void set_offset_by_bin(const Bin& bin, std::uint64_t offset); void set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset); void set_offset_by_pos(const Chromosome& chrom, std::uint32_t pos, std::uint64_t offset); @@ -109,9 +111,9 @@ class Index { [[nodiscard]] auto at(std::string_view chrom_name) -> mapped_type&; [[nodiscard]] auto at(std::uint32_t chrom_id) -> mapped_type&; - [[nodiscard]] static auto init(const Reference& chroms, - const std::vector& chrom_offsets, - std::uint32_t bin_size, bool allocate) -> MapT; + [[nodiscard]] static auto init(const Reference& chroms, const BinTable& bins, + const std::vector& chrom_offsets, bool allocate) + -> MapT; public: class iterator { diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 86228e5b..86fcf6f4 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -12,6 +12,7 @@ #include #include +#include "hictk/bin.hpp" #include "hictk/chromosome.hpp" #include "hictk/cooler/cooler.hpp" @@ -62,6 +63,8 @@ class SingleCellFile { unsigned int mode = HighFive::File::ReadOnly); [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, + bool force_overwrite); + [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, BinTable bins, bool force_overwrite = false); [[nodiscard]] constexpr const phmap::btree_set& cells() const noexcept; @@ -83,7 +86,7 @@ class SingleCellFile { private: [[nodiscard]] static SingleCellAttributes read_standard_attributes(const HighFive::File& f, bool initialize_missing); - [[nodiscard]] static BinTable read_bins(const HighFive::File& f); + [[nodiscard]] static BinTable init_bin_table(const HighFive::File& f); [[nodiscard]] static phmap::btree_set read_cells(const HighFive::File& f); static void create_groups(RootGroup& root_grp); diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index fe2d6075..af26b6e7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -23,9 +23,8 @@ void merge(Str first_file, Str last_file, std::string_view dest_uri, template void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, - std::size_t update_frequency = 10'000'000); + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists = false, + std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000); [[nodiscard]] bool equal(std::string_view uri1, std::string_view uri2, bool ignore_attributes = true); diff --git a/src/libhictk/formatting/include/hictk/fmt.hpp b/src/libhictk/formatting/include/hictk/fmt.hpp index 070d8284..70cc8986 100644 --- a/src/libhictk/formatting/include/hictk/fmt.hpp +++ b/src/libhictk/formatting/include/hictk/fmt.hpp @@ -4,7 +4,7 @@ #pragma once -#include "hictk/fmt/bin_table.hpp" +#include "hictk/fmt/bin.hpp" #include "hictk/fmt/chromosome.hpp" #include "hictk/fmt/genomic_interval.hpp" #include "hictk/fmt/pixel.hpp" diff --git a/src/libhictk/formatting/include/hictk/fmt/bin_table.hpp b/src/libhictk/formatting/include/hictk/fmt/bin.hpp similarity index 98% rename from src/libhictk/formatting/include/hictk/fmt/bin_table.hpp rename to src/libhictk/formatting/include/hictk/fmt/bin.hpp index 2e060601..adc90ced 100644 --- a/src/libhictk/formatting/include/hictk/fmt/bin_table.hpp +++ b/src/libhictk/formatting/include/hictk/fmt/bin.hpp @@ -6,7 +6,7 @@ #include -#include "hictk/bin_table.hpp" +#include "hictk/bin.hpp" #include "hictk/fmt/common.hpp" #include "hictk/fmt/genomic_interval.hpp" diff --git a/src/libhictk/formatting/include/hictk/fmt/pixel.hpp b/src/libhictk/formatting/include/hictk/fmt/pixel.hpp index 2f8a377f..dce5a95a 100644 --- a/src/libhictk/formatting/include/hictk/fmt/pixel.hpp +++ b/src/libhictk/formatting/include/hictk/fmt/pixel.hpp @@ -6,7 +6,7 @@ #include -#include "hictk/fmt/bin_table.hpp" +#include "hictk/fmt/bin.hpp" #include "hictk/fmt/common.hpp" #include "hictk/pixel.hpp" diff --git a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp index 05f49829..c5805937 100644 --- a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp +++ b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp @@ -25,8 +25,8 @@ class GenomicInterval { GenomicInterval(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; [[nodiscard]] static GenomicInterval parse(const Reference &chroms, std::string query, Type type = Type::UCSC); - [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string query); - [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view query, + [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string buffer); + [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view buffer, char sep = '\t'); [[nodiscard]] explicit operator bool() const noexcept; diff --git a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp index d5126183..f27762c5 100644 --- a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp +++ b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp @@ -104,69 +104,69 @@ inline GenomicInterval GenomicInterval::parse(const Reference &chroms, std::stri return GenomicInterval::parse_bed(chroms, query); } -inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string query) { - if (query.empty()) { +inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string buffer) { + if (buffer.empty()) { throw std::runtime_error("query is empty"); } - if (const auto match = chroms.find(query); match != chroms.end()) { + if (const auto match = chroms.find(buffer); match != chroms.end()) { return GenomicInterval{*match}; } - const auto p1 = query.find_last_of(':'); - auto p2 = query.find_last_of('-'); + const auto p1 = buffer.find_last_of(':'); + auto p2 = buffer.find_last_of('-'); if (p1 == std::string::npos && p2 == std::string::npos) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), query)); + fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), buffer)); } if (p1 == std::string::npos || p2 == std::string::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), buffer)); } - if (query.find(',', p1) != std::string::npos) { - query.erase(std::remove(query.begin() + std::ptrdiff_t(p1), query.end(), ','), query.end()); - p2 = query.find_last_of('-'); + if (buffer.find(',', p1) != std::string::npos) { + buffer.erase(std::remove(buffer.begin() + std::ptrdiff_t(p1), buffer.end(), ','), buffer.end()); + p2 = buffer.find_last_of('-'); } - query[p1] = '\t'; - query[p2] = '\t'; + buffer[p1] = '\t'; + buffer[p2] = '\t'; - return GenomicInterval::parse_bed(chroms, query); + return GenomicInterval::parse_bed(chroms, buffer); } -inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view query, +inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view buffer, char sep) { - if (query.empty()) { - throw std::runtime_error("query is empty"); + if (buffer.empty()) { + throw std::runtime_error("interval is empty"); } - const auto p1 = query.find(sep); - const auto p2 = query.find(sep, p1 + 1); + const auto p1 = buffer.find(sep); + const auto p2 = buffer.find(sep, p1 + 1); if (p1 == std::string_view::npos || p2 == std::string_view::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("interval \"{}\" is malformed"), buffer)); } - const auto chrom_name = query.substr(0, p1); - const auto start_pos_str = query.substr(p1 + 1, p2 - (p1 + 1)); - const auto end_pos_str = query.substr(p2 + 1); + const auto chrom_name = buffer.substr(0, p1); + const auto start_pos_str = buffer.substr(p1 + 1, p2 - (p1 + 1)); + const auto end_pos_str = buffer.substr(p2 + 1); const auto match = chroms.find(chrom_name); if (match == chroms.end()) { - throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{}\" in query \"{}\""), chrom_name, query)); + throw std::runtime_error(fmt::format(FMT_STRING("invalid chromosome \"{}\" in interval \"{}\""), + chrom_name, buffer)); } if (start_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing start position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing start position"), buffer)); } if (end_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing end position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing end position"), buffer)); } GenomicInterval gi{*match}; @@ -175,29 +175,29 @@ inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std:: internal::parse_numeric_or_throw(start_pos_str, gi._start); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid start position \"{}\" in query \"{}\": {}"), start_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid start position \"{}\" in interval \"{}\": {}"), + start_pos_str, buffer, e.what())); } try { internal::parse_numeric_or_throw(end_pos_str, gi._end); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{}\" in query \"{}\": {}"), end_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid end position \"{}\" in interval \"{}\": {}"), end_pos_str, + buffer, e.what())); } if (gi._end > gi.chrom().size()) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{0}\" in query \"{1}\": end position is " + fmt::format(FMT_STRING("invalid end position \"{0}\" in interval \"{1}\": end position is " "greater than the chromosome size ({0} > {2})"), - gi._end, query, gi.chrom().size())); + gi._end, buffer, gi.chrom().size())); } if (gi._start >= gi._end) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid query \"{}\": query end position should be " + fmt::format(FMT_STRING("invalid interval \"{}\": query end position should be " "greater than the start position ({} >= {})"), - query, gi._start, gi._end)); + buffer, gi._start, gi._end)); } return gi; diff --git a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp index 3325fe2d..0512cc78 100644 --- a/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp +++ b/src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp @@ -257,7 +257,7 @@ inline std::size_t PixelSelector::estimate_optimal_cache_size( const std::size_t first_bin_id = 0; const std::size_t last_bin_id = - bins().at(coord1().bin1.chrom(), coord1().bin1.chrom().size()).rel_id() - 1; + bins().at(coord1().bin1.chrom(), coord1().bin1.chrom().size() - 1).rel_id() - 1; const auto samples = (std::min)(num_samples, bins().subset(coord1().bin1.chrom()).size()); for (std::size_t i = 0; i < samples; ++i) { const auto bin_id = diff --git a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp index e2785692..cafd7831 100644 --- a/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp +++ b/src/libhictk/pixel/include/hictk/impl/pixel_impl.hpp @@ -119,12 +119,15 @@ inline bool ThinPixel::operator>=(const ThinPixel &other) const noexcept { } template -inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line) -> ThinPixel { +inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line, std::int64_t offset) + -> ThinPixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); if (bin1_id > bins.size()) { @@ -140,13 +143,16 @@ inline auto ThinPixel::from_coo(const BinTable &bins, std::string_view line) fmt::format(FMT_STRING("line \"{}\" is not in coo format: {}"), line, e.what())); } } + template -inline auto ThinPixel::from_coo(std::string_view line) -> ThinPixel { +inline auto ThinPixel::from_coo(std::string_view line, std::int64_t offset) -> ThinPixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); return {bin1_id, bin2_id, count}; @@ -285,12 +291,15 @@ inline ThinPixel Pixel::to_thin() const noexcept { } template -inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view line) -> Pixel { +inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n<3>(line); - const auto bin1_id = internal::parse_numeric_or_throw(toks[0]); - const auto bin2_id = internal::parse_numeric_or_throw(toks[1]); + const auto bin1_id = + static_cast(internal::parse_numeric_or_throw(toks[0]) + offset); + const auto bin2_id = + static_cast(internal::parse_numeric_or_throw(toks[1]) + offset); const auto count = internal::parse_numeric_or_throw(toks[2]); return {bins.at(bin1_id), bins.at(bin2_id), count}; @@ -301,15 +310,18 @@ inline auto Pixel::from_coo(const hictk::BinTable &bins, std::string_view lin } template -inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view line) -> Pixel { +inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<7>(line); const auto &chrom1 = toks[0]; - const auto start1 = internal::parse_numeric_or_throw(toks[1]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[1]) + offset); const auto &chrom2 = toks[3]; - const auto start2 = internal::parse_numeric_or_throw(toks[4]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[4]) + offset); const auto count = internal::parse_numeric_or_throw(toks[6]); return {bins.at(chrom1, start1), bins.at(chrom2, start2), count}; @@ -320,16 +332,18 @@ inline auto Pixel::from_bg2(const hictk::BinTable &bins, std::string_view lin } template -inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_view line) - -> Pixel { +inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<6>(line); const auto &chrom1 = toks[1]; - const auto start1 = internal::parse_numeric_or_throw(toks[2]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[2]) + offset); const auto &chrom2 = toks[4]; - const auto start2 = internal::parse_numeric_or_throw(toks[5]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[5]) + offset); return {bins.at(chrom1, start1), bins.at(chrom2, start2), 1}; } catch (const std::exception &e) { @@ -339,16 +353,18 @@ inline auto Pixel::from_validpair(const hictk::BinTable &bins, std::string_vi } template -inline auto Pixel::from_4dn_pairs(const hictk::BinTable &bins, std::string_view line) - -> Pixel { +inline auto Pixel::from_4dn_pairs(const hictk::BinTable &bins, std::string_view line, + std::int64_t offset) -> Pixel { try { const auto toks = internal::tokenize_n_or_more<6>(line); const auto &chrom1 = toks[1]; - const auto start1 = internal::parse_numeric_or_throw(toks[2]); + const auto start1 = static_cast( + internal::parse_numeric_or_throw(toks[2]) + offset); const auto &chrom2 = toks[3]; - const auto start2 = internal::parse_numeric_or_throw(toks[4]); + const auto start2 = static_cast( + internal::parse_numeric_or_throw(toks[4]) + offset); return {bins.at(chrom1, start1), bins.at(chrom2, start2), 1}; } catch (const std::exception &e) { diff --git a/src/libhictk/pixel/include/hictk/pixel.hpp b/src/libhictk/pixel/include/hictk/pixel.hpp index 6837c12e..d38309d7 100644 --- a/src/libhictk/pixel/include/hictk/pixel.hpp +++ b/src/libhictk/pixel/include/hictk/pixel.hpp @@ -31,8 +31,9 @@ struct ThinPixel { [[nodiscard]] bool operator>(const ThinPixel &other) const noexcept; [[nodiscard]] bool operator>=(const ThinPixel &other) const noexcept; - static auto from_coo(std::string_view line) -> ThinPixel; - static auto from_coo(const BinTable &bins, std::string_view line) -> ThinPixel; + static auto from_coo(std::string_view line, std::int64_t offset = 0) -> ThinPixel; + static auto from_coo(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> ThinPixel; }; struct PixelCoordinates { @@ -83,10 +84,14 @@ struct Pixel { [[nodiscard]] bool operator>=(const Pixel &other) const noexcept; [[nodiscard]] ThinPixel to_thin() const noexcept; - static auto from_coo(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_bg2(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_validpair(const BinTable &bins, std::string_view line) -> Pixel; - static auto from_4dn_pairs(const BinTable &bins, std::string_view line) -> Pixel; + static auto from_coo(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_bg2(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_validpair(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; + static auto from_4dn_pairs(const BinTable &bins, std::string_view line, std::int64_t offset = 0) + -> Pixel; }; } // namespace hictk diff --git a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp index e71ce366..8b5d9aec 100644 --- a/src/libhictk/reference/include/hictk/impl/reference_impl.hpp +++ b/src/libhictk/reference/include/hictk/impl/reference_impl.hpp @@ -27,8 +27,9 @@ namespace hictk { template inline Reference::Reference(ChromosomeIt first_chrom, ChromosomeIt last_chrom) - : _buff(first_chrom, last_chrom), + : _buff(construct_chrom_buffer(first_chrom, last_chrom)), _map(construct_chrom_map(_buff)), + _size_prefix_sum(compute_size_prefix_sum(_buff)), _longest_chrom(find_longest_chromosome(_buff)), _chrom_with_longest_name(find_chromosome_with_longest_name(_buff)) { validate(); @@ -39,6 +40,7 @@ inline Reference::Reference(ChromosomeNameIt first_chrom_name, ChromosomeNameIt ChromosomeSizeIt first_chrom_size) : _buff(construct_chrom_buffer(first_chrom_name, last_chrom_name, first_chrom_size)), _map(construct_chrom_map(_buff)), + _size_prefix_sum(compute_size_prefix_sum(_buff)), _longest_chrom(find_longest_chromosome(_buff)), _chrom_with_longest_name(find_chromosome_with_longest_name(_buff)) { validate(); @@ -156,6 +158,10 @@ inline bool Reference::operator==(const Reference& other) const { inline bool Reference::operator!=(const Reference& other) const { return !(*this == other); } +constexpr const std::vector& Reference::chrom_size_prefix_sum() const noexcept { + return _size_prefix_sum; +} + inline const Chromosome& Reference::longest_chromosome() const { if (empty()) { throw std::runtime_error("longest_chromosome() was called on an empty Reference"); @@ -195,6 +201,20 @@ inline auto Reference::construct_chrom_buffer(ChromosomeNameIt first_chrom_name, return buff; } +template +inline auto Reference::construct_chrom_buffer(ChromosomeIt first_chrom, ChromosomeIt last_chrom) + -> ChromBuff { + std::vector chrom_names{}; + std::vector chrom_sizes{}; + + std::for_each(first_chrom, last_chrom, [&](const Chromosome& chrom) { + chrom_names.emplace_back(std::string{chrom.name()}); + chrom_sizes.emplace_back(chrom.size()); + }); + + return construct_chrom_buffer(chrom_names.begin(), chrom_names.end(), chrom_sizes.begin()); +} + inline auto Reference::construct_chrom_map(const ChromBuff& chroms) -> ChromMap { ChromMap buff(chroms.size()); std::transform(chroms.begin(), chroms.end(), std::inserter(buff, buff.begin()), @@ -233,6 +253,18 @@ inline std::size_t Reference::find_chromosome_with_longest_name(const ChromBuff& return static_cast(std::distance(chroms.begin(), match)); } +inline std::vector Reference::compute_size_prefix_sum( + const ChromBuff& chroms) noexcept { + std::vector buff(chroms.size() + 2, 0); + for (std::size_t i = 1; i < chroms.size() + 1; ++i) { + buff[i] = buff[i - 1] + chroms[i - 1].size(); + } + + buff.back() = buff[chroms.size()] + 1; + + return buff; +} + inline void Reference::validate() const { if (empty()) { return; diff --git a/src/libhictk/reference/include/hictk/reference.hpp b/src/libhictk/reference/include/hictk/reference.hpp index 63261ed1..e9eebc54 100644 --- a/src/libhictk/reference/include/hictk/reference.hpp +++ b/src/libhictk/reference/include/hictk/reference.hpp @@ -25,6 +25,7 @@ class Reference { ChromBuff _buff{}; ChromMap _map{}; + std::vector _size_prefix_sum{}; std::size_t _longest_chrom{Chromosome{}.id()}; std::size_t _chrom_with_longest_name{Chromosome{}.id()}; @@ -87,6 +88,8 @@ class Reference { [[nodiscard]] bool operator==(const Reference& other) const; [[nodiscard]] bool operator!=(const Reference& other) const; + [[nodiscard]] constexpr const std::vector& chrom_size_prefix_sum() const noexcept; + // In case of ties, the first match is returned [[nodiscard]] const Chromosome& longest_chromosome() const; [[nodiscard]] const Chromosome& chromosome_with_longest_name() const; @@ -99,12 +102,19 @@ class Reference { ChromosomeNameIt last_chrom_name, ChromosomeSizeIt first_chrom_size) -> ChromBuff; + template + [[nodiscard]] static auto construct_chrom_buffer(ChromosomeIt first_chrom, + ChromosomeIt last_chrom) -> ChromBuff; + [[nodiscard]] static auto construct_chrom_map(const ChromBuff& chroms) -> ChromMap; [[nodiscard]] static std::size_t find_longest_chromosome(const ChromBuff& chroms) noexcept; [[nodiscard]] static std::size_t find_chromosome_with_longest_name( const ChromBuff& chroms) noexcept; + [[nodiscard]] static std::vector compute_size_prefix_sum( + const ChromBuff& chroms) noexcept; + void validate() const; }; } // namespace hictk diff --git a/test/scripts/hictk_load_4dn.sh b/test/scripts/hictk_load_4dn.sh index a81cf6af..d3b337a5 100755 --- a/test/scripts/hictk_load_4dn.sh +++ b/test/scripts/hictk_load_4dn.sh @@ -49,35 +49,23 @@ function compare_coolers { fi } -function shuffle { - if command -v shuf &> /dev/null; then - shuf - else - sort -R - fi -} - export function readlink_py shuffle status=0 -if [ $# -ne 2 ]; then - 2>&1 echo "Usage: $0 path_to_hictk [un]sorted" +if [ $# -ne 1 ]; then + 2>&1 echo "Usage: $0 path_to_hictk" status=1 fi hictk_bin="$1" -if [[ "$2" == 'sorted' ]]; then - sorted=true -else - sorted=false -fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" pairs="$data_dir/4DNFIKNWM36K.subset.pairs.xz" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler_fixed_bins="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" +ref_cooler_variable_bins="$data_dir/4DNFIKNWM36K.subset.variable-bins.cool" export PATH="$PATH:$script_dir" @@ -99,37 +87,43 @@ if [ $status -ne 0 ]; then exit $status fi -if ! check_files_exist "$ref_cooler"; then +if ! check_files_exist "$pairs" "$ref_cooler_fixed_bins" "$ref_cooler_variable_bins"; then exit 1 fi outdir="$(mktemp -d -t hictk-tmp-XXXXXXXXXX)" trap 'rm -rf -- "$outdir"' EXIT -cooler dump -t chroms "$ref_cooler" > "$outdir/chrom.sizes" - -if [[ "$sorted" == true ]]; then - xzcat "$pairs" | - "$hictk_bin" load \ - -f 4dn \ - --assume-sorted \ - --batch-size 1000000 \ - "$outdir/chrom.sizes" \ - 10000 \ - "$outdir/out.cool" -else - xzcat "$pairs" | - shuffle | - "$hictk_bin" load \ - -f 4dn \ - --assume-unsorted \ - --batch-size 1000000 \ - "$outdir/chrom.sizes" \ - 10000 \ - "$outdir/out.cool" +cooler dump -t chroms "$ref_cooler_fixed_bins" > "$outdir/chrom.sizes" + +# Test cooler with fixed bin size +xzcat "$pairs" | + "$hictk_bin" load \ + -f 4dn \ + --assume-sorted \ + --batch-size 1000000 \ + --bin-size 10000 \ + "$outdir/chrom.sizes" \ + "$outdir/out.cool" + +if ! compare_coolers "$outdir/out.cool" "$ref_cooler_fixed_bins"; then + status=1 fi -if ! compare_coolers "$outdir/out.cool" "$ref_cooler"; then +# Test cooler with variable bin size +cooler dump -t bins "$ref_cooler_variable_bins" > "$outdir/bins.bed" + +xzcat "$pairs" | + "$hictk_bin" load \ + -f 4dn \ + --assume-sorted \ + --batch-size 1000000 \ + --bin-table "$outdir/bins.bed" \ + --force \ + "$outdir/chrom.sizes" \ + "$outdir/out.cool" + +if ! compare_coolers "$outdir/out.cool" "$ref_cooler_variable_bins"; then status=1 fi diff --git a/test/scripts/hictk_load_bg2.sh b/test/scripts/hictk_load_bg2.sh index 3c1df19b..5bb7ea75 100755 --- a/test/scripts/hictk_load_bg2.sh +++ b/test/scripts/hictk_load_bg2.sh @@ -76,7 +76,7 @@ fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" export PATH="$PATH:$script_dir" @@ -108,8 +108,8 @@ if [[ "$sorted" == true ]]; then -f bg2 \ --assume-sorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" else cooler dump -t pixels --join "$ref_cooler" | @@ -118,8 +118,8 @@ else -f bg2 \ --assume-unsorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" fi diff --git a/test/scripts/hictk_load_coo.sh b/test/scripts/hictk_load_coo.sh index 6ab669d6..0d46f9c7 100755 --- a/test/scripts/hictk_load_coo.sh +++ b/test/scripts/hictk_load_coo.sh @@ -76,7 +76,7 @@ fi data_dir="$(readlink_py "$(dirname "$0")/../data/integration_tests")" script_dir="$(readlink_py "$(dirname "$0")")" -ref_cooler="$data_dir/4DNFIKNWM36K.subset.cool" +ref_cooler="$data_dir/4DNFIKNWM36K.subset.fixed-bins.cool" export PATH="$PATH:$script_dir" @@ -108,8 +108,8 @@ if [[ "$sorted" == true ]]; then -f coo \ --assume-sorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" else cooler dump -t pixels "$ref_cooler" | @@ -118,8 +118,8 @@ else -f coo \ --assume-unsorted \ --batch-size 1000000 \ + --bin-size 10000 \ "$outdir/chrom.sizes" \ - 10000 \ "$outdir/out.cool" fi diff --git a/test/scripts/segment_genome_random.py b/test/scripts/segment_genome_random.py new file mode 100755 index 00000000..30a4b7c8 --- /dev/null +++ b/test/scripts/segment_genome_random.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +import random +import argparse +import pathlib +import pandas as pd + + +def make_cli(): + def positive_int(arg): + if (n := int(arg)) > 0: + return n + + raise ValueError("Not a positive integer") + + cli = argparse.ArgumentParser() + + cli.add_argument( + "chrom-sizes", + type=pathlib.Path, + help="Path to a .chrom.sizes file.", + ) + + cli.add_argument( + "bin-size", + type=positive_int, + help="Average bin size.", + ) + + cli.add_argument( + "--seed", + type=positive_int, + default=2074288341, + help="Seed used to initialize the PRNG.", + ) + + return cli + + +def read_chrom_sizes(path_to_chrom_sizes: pathlib.Path): + return pd.read_table(path_to_chrom_sizes, names=["chrom", "length"]) + + +def segment_chromosome(chrom: str, size: int, bin_size: int): + start = 0 + end = min(start + int(random.gauss(bin_size, 0.1 * bin_size)), size) + + print(f"{chrom}\t{start}\t{end}") + + while end < size: + start = end + end = min(start + int(random.gauss(bin_size, 0.1 * bin_size)), size) + print(f"{chrom}\t{start}\t{end}") + + +def main(): + args = vars(make_cli().parse_args()) + random.seed(args["seed"]) + + chrom_sizes = read_chrom_sizes(args["chrom-sizes"]) + for _, (chrom, size) in chrom_sizes.iterrows(): + segment_chromosome(chrom, size, args["bin-size"]) + + +if __name__ == "__main__": + main() diff --git a/test/units/bin_table/CMakeLists.txt b/test/units/bin_table/CMakeLists.txt index afc87653..71223665 100644 --- a/test/units/bin_table/CMakeLists.txt +++ b/test/units/bin_table/CMakeLists.txt @@ -3,13 +3,17 @@ # SPDX-License-Identifier: MIT find_package(Filesystem REQUIRED) +find_package(fmt REQUIRED) include(CTest) include(Catch) add_executable(bin_table_tests) -target_sources(bin_table_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/bin_table_test.cpp") +target_sources( + bin_table_tests + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/bin_test.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/bin_table_test.cpp") target_link_libraries( bin_table_tests @@ -20,6 +24,7 @@ target_link_system_libraries( bin_table_tests PUBLIC Catch2::Catch2WithMain + fmt::fmt-header-only std::filesystem) file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/Testing/") diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index 4d92e35d..65e8dfb9 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -12,102 +12,10 @@ #include #include -#include "hictk/fmt/bin_table.hpp" - namespace hictk::test::bin_table { // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Bin", "[bin][short]") { - const Chromosome chrom1{0, "chr1", 50}; - const Chromosome chrom2{1, "chr2", 10}; - SECTION("Ctors") { - CHECK(Bin{chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, 0, chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, 0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); - } - - SECTION("Accessors") { - const Bin bin1{chrom1, 1, 2}; - const Bin bin2{10, 5, chrom1, 1, 2}; - - CHECK(bin1.id() == Bin::null_id); - CHECK(bin2.id() == 10); - CHECK(bin2.rel_id() == 5); - - CHECK(bin1.interval() == GenomicInterval{chrom1, 1, 2}); - - CHECK(bin2.chrom() == chrom1); - CHECK(bin2.start() == 1); - CHECK(bin2.end() == 2); - } - - SECTION("operators (wo/ id)") { - const Bin bin0{}; - const Bin bin1{chrom1, 1, 2}; - const Bin bin2{chrom1, 2, 3}; - - const Bin bin3{chrom2, 1, 2}; - - CHECK(!bin0); - CHECK(!!bin1); - - CHECK(bin1 != bin2); - CHECK(bin1 != bin3); - - CHECK(bin1 < bin2); - CHECK(bin1 < bin3); - - CHECK(bin1 <= bin2); - CHECK(bin1 <= bin3); - - CHECK(bin2 > bin1); - CHECK(bin3 > bin1); - - CHECK(bin2 >= bin1); - CHECK(bin3 >= bin1); - } - - SECTION("operators (w/ id)") { - const Bin bin1{0, 0, chrom1, 1, 2}; - const Bin bin2{1, 1, chrom1, 2, 3}; - - const Bin bin3{10, 10, chrom2, 1, 2}; - const Bin bin4{10, 10, chrom2, 10, 20}; - - CHECK(bin1 != bin2); - CHECK(bin1 != bin3); - - // This is true because they have the same ID. - // However, comparing bins with same ID and different interval is UB - CHECK(bin3 == bin4); - - CHECK(bin1 < bin2); - CHECK(bin1 < bin3); - - CHECK(bin1 <= bin2); - CHECK(bin1 <= bin3); - - CHECK(bin2 > bin1); - CHECK(bin3 > bin1); - - CHECK(bin2 >= bin1); - CHECK(bin3 >= bin1); - } - - SECTION("fmt") { - const Bin bin1{chrom1, 0, 100}; - const Bin bin2{123, 123, chrom1, 0, 100}; - - CHECK(fmt::format(FMT_STRING("{}"), bin1) == std::to_string(Bin::null_id)); - CHECK(fmt::format(FMT_STRING("{}"), bin2) == "123"); - CHECK(fmt::format(FMT_STRING("{:bed}"), bin1) == "chr1\t0\t100"); - CHECK(fmt::format(FMT_STRING("{:ucsc}"), bin1) == "chr1:0-100"); - CHECK(fmt::format(FMT_STRING("{:raw}"), bin2) == "123"); - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("BinTable", "[bin-table][short]") { +TEST_CASE("BinTable (fixed bins)", "[bin-table][short]") { constexpr std::uint32_t bin_size = 5000; // clang-format off const BinTable table({ @@ -130,10 +38,15 @@ TEST_CASE("BinTable", "[bin-table][short]") { CHECK(table.at(0) == Bin{chr1, 0, bin_size}); CHECK(table.at(10) == Bin{chr1, 50000, 50001}); - CHECK(table.at(11) == Bin{chr2, 0, bin_size}); + CHECK(table.at(chr1, bin_size - 1).id() == 0); + CHECK(table.at(chr1, 50000).id() == 10); + CHECK(table.at(chr2, 1).id() == 11); + CHECK_THROWS_AS(table.at(table.size()), std::out_of_range); + CHECK_THROWS_AS(table.at(chr1, 50001), std::out_of_range); + CHECK_THROWS_AS(table.at(chr2, 26000), std::out_of_range); } SECTION("coord to bin id") { @@ -150,7 +63,7 @@ TEST_CASE("BinTable", "[bin-table][short]") { } SECTION("subset") { - const BinTable expected{{Chromosome{1, "chr2", 25017}}, bin_size}; + const BinTableFixed expected{{Chromosome{1, "chr2", 25017}}, bin_size}; CHECK(table.subset(Chromosome{1, "chr2", 25017}) == expected); CHECK(table.subset("chr2") == expected); @@ -181,6 +94,19 @@ TEST_CASE("BinTable", "[bin-table][short]") { CHECK(std::distance(its.first, its.second) == std::distance(table1.begin(), table1.end())); } + SECTION("accessors") { + CHECK(table.has_fixed_bin_size()); + CHECK_NOTHROW(table.get()); + CHECK_THROWS(table.get>()); + } + + SECTION("operator==") { + CHECK(BinTable(table.chromosomes(), 10) == BinTable(table.chromosomes(), 10)); + CHECK(BinTable(table.chromosomes(), 10) != BinTable(table.chromosomes(), 20)); + CHECK(BinTable(Reference{table.chromosomes().begin(), table.chromosomes().end() - 1}, 10) != + BinTable(table.chromosomes(), 10)); + } + SECTION("iterators") { const auto& chr1 = table.chromosomes().at("chr1"); const auto& chr2 = table.chromosomes().at("chr2"); @@ -222,6 +148,9 @@ TEST_CASE("BinTable", "[bin-table][short]") { } CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(first_bin++, std::out_of_range); + CHECK_THROWS_AS(last_bin++, std::out_of_range); } SECTION("backward") { @@ -234,6 +163,9 @@ TEST_CASE("BinTable", "[bin-table][short]") { } CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(--first_bin, std::out_of_range); + CHECK_THROWS_AS(--last_bin, std::out_of_range); } SECTION("operator+") { @@ -244,6 +176,8 @@ TEST_CASE("BinTable", "[bin-table][short]") { for (std::size_t i = 0; i < expected.size() - 5; ++i) { CHECK(*(it + i) == expected[i + 5]); // NOLINT } + + CHECK_THROWS_AS(it + 100, std::out_of_range); } SECTION("operator-") { @@ -255,20 +189,245 @@ TEST_CASE("BinTable", "[bin-table][short]") { for (std::size_t i = 1; i < expected.size(); ++i) { CHECK(*(it1 - i) == *(it2 - i)); // NOLINT } + + CHECK_THROWS_AS(it1 - 100, std::out_of_range); + } + + SECTION("accessors") { + CHECK_NOTHROW(table.begin().get()); + CHECK_THROWS(table.begin().get::iterator>()); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("BinTable (variable bins)", "[bin-table][short]") { + const Chromosome chrom1{0, "chr1", 32}; + const Chromosome chrom2{1, "chr2", 32}; + + const std::vector start_pos{0, 8, 15, 23, 0, 5, 10, 26}; + const std::vector end_pos{8, 15, 23, 32, 5, 10, 26, 32}; + + const BinTable table({chrom1, chrom2}, start_pos, end_pos); + + SECTION("stats") { + CHECK(BinTable{}.empty()); + CHECK(table.size() == start_pos.size()); + CHECK(table.num_chromosomes() == 2); + CHECK(table.bin_size() == 0); + } + + SECTION("at") { + const auto& chr1 = table.chromosomes().at("chr1"); + const auto& chr2 = table.chromosomes().at("chr2"); + + CHECK(table.at(0) == Bin{chr1, 0, 8}); + CHECK(table.at(3) == Bin{chr1, 23, 32}); + CHECK(table.at(4) == Bin{chr2, 0, 5}); + + CHECK(table.at(chr1, 0).id() == 0); + CHECK(table.at(chr1, 7).id() == 0); + CHECK(table.at(chr1, 8).id() == 1); + + CHECK(table.at(chr1, 23).id() == 3); + CHECK(table.at(chr2, 4).id() == 4); + + CHECK_THROWS_AS(table.at(table.size()), std::out_of_range); + CHECK_THROWS_AS(table.at(chr1, 32), std::out_of_range); + CHECK_THROWS_AS(table.at(chr2, 32), std::out_of_range); + } + + SECTION("coord to bin id") { + const auto& chr2 = table.chromosomes().at("chr2"); + + CHECK(table.map_to_bin_id(0, 8) == 1); + CHECK(table.map_to_bin_id("chr1", 25) == 3); + CHECK(table.map_to_bin_id(chr2, 9) == 5); + + CHECK_THROWS_AS(table.map_to_bin_id("a", 0), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id("chr1", 33), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id(chr2, 50), std::out_of_range); + CHECK_THROWS_AS(table.map_to_bin_id(1, 50), std::out_of_range); + } + + SECTION("subset") { + const std::vector start_pos_{0, 5, 10, 26}; + const std::vector end_pos_{5, 10, 26, 32}; + const BinTableVariable expected{{Chromosome{1, "chr2", 32}}, start_pos_, end_pos_}; + + CHECK(table.subset(Chromosome{1, "chr2", 32}) == expected); + CHECK(table.subset("chr2") == expected); + CHECK(table.subset(1) == expected); + CHECK(table.subset("chr1") != expected); + + if constexpr (ndebug_not_defined()) { + CHECK_THROWS_AS(table.subset(Chromosome{4, "chr5", 1}), std::out_of_range); + } + CHECK_THROWS_AS(table.subset("a"), std::out_of_range); + CHECK_THROWS_AS(table.subset(10), std::out_of_range); + } + + SECTION("find overlap") { + const auto& chrom = *table.chromosomes().begin(); + + auto its = table.find_overlap({chrom, 8, 9}); + CHECK(std::distance(its.first, its.second) == 1); + + its = table.find_overlap({chrom, 8, 15 - 1}); + CHECK(std::distance(its.first, its.second) == 1); + + its = table.find_overlap({chrom, 14, 23}); + CHECK(std::distance(its.first, its.second) == 2); + + its = table.find_overlap({chrom, 0, chrom.size()}); + CHECK(std::distance(its.first, its.second) == 4); + } + + SECTION("accessors") { + CHECK_FALSE(table.has_fixed_bin_size()); + CHECK_NOTHROW(table.get>()); + CHECK_THROWS(table.get()); + } + + SECTION("invalid bins") { + SECTION("bins out of order") { + const std::vector start_pos1{0, 8, 7}; + const std::vector end_pos1{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("not sorted")); + + const std::vector start_pos2{0, 8, 15}; + const std::vector end_pos2{8, 15, 14}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos2, end_pos2), + Catch::Matchers::ContainsSubstring("not sorted")); + } + SECTION("gap between bins") { + const std::vector start_pos1{0, 8, 16}; + const std::vector end_pos1{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("gap between bins")); + + const std::vector start_pos2{1, 8, 16}; + const std::vector end_pos2{8, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos2, end_pos2), + Catch::Matchers::ContainsSubstring("does not start from zero")); + } + + SECTION("start pos >= end pos") { + const std::vector start_pos1{0, 8, 10, 15}; + const std::vector end_pos1{0, 10, 15, 23}; + + CHECK_THROWS_WITH(BinTable({chrom1, chrom2}, start_pos1, end_pos1), + Catch::Matchers::ContainsSubstring("start_pos >= end_pos")); + } + + SECTION("number of chromosome mismatch") { + const Chromosome chrom3{2, "chr3", 32}; + CHECK_THROWS_WITH(BinTable({chrom1, chrom2, chrom3}, start_pos, end_pos), + Catch::Matchers::ContainsSubstring("unexpected number of chromosomes")); } } - SECTION("concretize") { - const auto concrete_table = table.concretize(); + SECTION("operator==") { + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) == + BinTable(table.chromosomes(), start_pos, end_pos)); + + const std::vector start_pos1{0, 0}; + const std::vector end_pos1{32, 32}; + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) != + BinTable(table.chromosomes(), start_pos1, end_pos1)); + + const std::vector start_pos2{0}; + const std::vector end_pos2{32}; + CHECK(BinTable(Reference{table.chromosomes().begin(), table.chromosomes().end() - 1}, + start_pos2, end_pos2) != BinTable(table.chromosomes(), start_pos, end_pos)); + + CHECK(BinTable(table.chromosomes(), start_pos, end_pos) != BinTable(table.chromosomes(), 10)); + } + + SECTION("iterators") { + const auto& chr1 = table.chromosomes().at("chr1"); + const auto& chr2 = table.chromosomes().at("chr2"); + + // clang-format off + const std::array expected{ + Bin{0, 0, chr1, 0, 8}, + Bin{1, 1, chr1, 8, 15}, + Bin{2, 2, chr1, 15, 23}, + Bin{3, 3, chr1, 23, 32}, + Bin{4, 0, chr2, 0, 5}, + Bin{5, 1, chr2, 5, 10}, + Bin{6, 2, chr2, 10, 26}, + Bin{7, 3, chr2, 26, 32} + }; + // clang-format on - REQUIRE(concrete_table.chroms.size() == table.size()); + REQUIRE(table.size() == expected.size()); - std::size_t i = 0; - for (const auto& bin : table) { - CHECK(concrete_table.chroms[i] == bin.chrom()); - CHECK(concrete_table.bin_starts[i] == bin.start()); - CHECK(concrete_table.bin_ends[i++] == bin.end()); + SECTION("forward") { + auto first_bin = table.begin(); + auto last_bin = table.end(); + + // NOLINTNEXTLINE + for (std::size_t i = 0; i < expected.size(); ++i) { + CHECK(*first_bin++ == expected[i]); + } + + CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(first_bin++, std::out_of_range); + CHECK_THROWS_AS(last_bin++, std::out_of_range); + } + + SECTION("backward") { + auto first_bin = table.begin(); + auto last_bin = table.end(); + + // NOLINTNEXTLINE + for (std::size_t i = expected.size(); i != 0; --i) { + CHECK(*(--last_bin) == expected[i - 1]); + } + + CHECK(first_bin == last_bin); + + CHECK_THROWS_AS(--first_bin, std::out_of_range); + CHECK_THROWS_AS(--last_bin, std::out_of_range); + } + + SECTION("operator+") { + CHECK(table.begin() + 0 == table.begin()); + CHECK(*(table.begin() + 5) == expected[5]); + + auto it = table.begin() + 5; + for (std::size_t i = 0; i < expected.size() - 5; ++i) { + CHECK(*(it + i) == expected[i + 5]); // NOLINT + } + + CHECK_THROWS_AS(it + 100, std::out_of_range); + } + + SECTION("operator-") { + CHECK(table.begin() - 0 == table.begin()); + CHECK(*(table.end() - 5) == *(expected.end() - 5)); + + auto it1 = table.end(); + auto it2 = expected.end(); // NOLINT + for (std::size_t i = 1; i < expected.size(); ++i) { + CHECK(*(it1 - i) == *(it2 - i)); // NOLINT + } + + CHECK_THROWS_AS(it1 - 100, std::out_of_range); + } + + SECTION("accessors") { + CHECK_NOTHROW(table.begin().get::iterator>()); + CHECK_THROWS(table.begin().get()); } } } + } // namespace hictk::test::bin_table diff --git a/test/units/bin_table/bin_test.cpp b/test/units/bin_table/bin_test.cpp new file mode 100644 index 00000000..7d0889e8 --- /dev/null +++ b/test/units/bin_table/bin_test.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2022 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include "hictk/bin.hpp" + +#include + +#include +#include + +#include "hictk/fmt/bin.hpp" + +namespace hictk::test::bin_table { + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("Bin", "[bin][short]") { + const Chromosome chrom1{0, "chr1", 50}; + const Chromosome chrom2{1, "chr2", 10}; + SECTION("Ctors") { + CHECK(Bin{chrom1, 1, 2}.has_null_id()); + CHECK_FALSE(Bin{0, 0, chrom1, 1, 2}.has_null_id()); + CHECK_FALSE(Bin{0, 0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); + } + + SECTION("Accessors") { + const Bin bin1{chrom1, 1, 2}; + const Bin bin2{10, 5, chrom1, 1, 2}; + + CHECK(bin1.id() == Bin::null_id); + CHECK(bin2.id() == 10); + CHECK(bin2.rel_id() == 5); + + CHECK(bin1.interval() == GenomicInterval{chrom1, 1, 2}); + + CHECK(bin2.chrom() == chrom1); + CHECK(bin2.start() == 1); + CHECK(bin2.end() == 2); + } + + SECTION("operators (wo/ id)") { + const Bin bin0{}; + const Bin bin1{chrom1, 1, 2}; + const Bin bin2{chrom1, 2, 3}; + + const Bin bin3{chrom2, 1, 2}; + + CHECK(!bin0); + CHECK(!!bin1); + + CHECK(bin1 != bin2); + CHECK(bin1 != bin3); + + CHECK(bin1 < bin2); + CHECK(bin1 < bin3); + + CHECK(bin1 <= bin2); + CHECK(bin1 <= bin3); + + CHECK(bin2 > bin1); + CHECK(bin3 > bin1); + + CHECK(bin2 >= bin1); + CHECK(bin3 >= bin1); + } + + SECTION("operators (w/ id)") { + const Bin bin1{0, 0, chrom1, 1, 2}; + const Bin bin2{1, 1, chrom1, 2, 3}; + + const Bin bin3{10, 10, chrom2, 1, 2}; + const Bin bin4{10, 10, chrom2, 10, 20}; + + CHECK(bin1 != bin2); + CHECK(bin1 != bin3); + + // This is true because they have the same ID. + // However, comparing bins with same ID and different interval is UB + CHECK(bin3 == bin4); + + CHECK(bin1 < bin2); + CHECK(bin1 < bin3); + + CHECK(bin1 <= bin2); + CHECK(bin1 <= bin3); + + CHECK(bin2 > bin1); + CHECK(bin3 > bin1); + + CHECK(bin2 >= bin1); + CHECK(bin3 >= bin1); + } + + SECTION("fmt") { + const Bin bin1{chrom1, 0, 100}; + const Bin bin2{123, 123, chrom1, 0, 100}; + + CHECK(fmt::format(FMT_STRING("{}"), bin1) == fmt::to_string(Bin::null_id)); + CHECK(fmt::format(FMT_STRING("{}"), bin2) == "123"); + CHECK(fmt::format(FMT_STRING("{:bed}"), bin1) == "chr1\t0\t100"); + CHECK(fmt::format(FMT_STRING("{:ucsc}"), bin1) == "chr1:0-100"); + CHECK(fmt::format(FMT_STRING("{:raw}"), bin2) == "123"); + } +} +} // namespace hictk::test::bin_table diff --git a/test/units/cooler/file_ctors_test.cpp b/test/units/cooler/file_ctors_test.cpp index 470aab67..bd99cd95 100644 --- a/test/units/cooler/file_ctors_test.cpp +++ b/test/units/cooler/file_ctors_test.cpp @@ -18,11 +18,31 @@ namespace hictk::cooler::test::cooler_file { TEST_CASE("Cooler: init files", "[cooler][short]") { const Reference chroms{Chromosome{0, "chr1", 10000}, Chromosome{1, "chr2", 5000}}; - const auto path = testdir() / "test_init.cool"; - constexpr std::uint32_t bin_size = 1000; - std::ignore = File::create(path.string(), chroms, bin_size, true); - CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE - CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + SECTION("fixed bins") { + const auto path = testdir() / "test_init_fixed_bins.cool"; + constexpr std::uint32_t bin_size = 1000; + std::ignore = File::create(path.string(), chroms, bin_size, true); + CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE + CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + CHECK(File(path.string()).attributes().bin_type.value() == "fixed"); // NOLINT + } + + SECTION("variable bins") { + const auto path = testdir() / "test_init_variable_bins.cool"; + + const Chromosome chrom1{0, "chr1", 32}; + const Chromosome chrom2{1, "chr2", 32}; + + const std::vector start_pos{0, 8, 15, 23, 0, 5, 10, 26}; + const std::vector end_pos{8, 15, 23, 32, 5, 10, 26, 32}; + + const BinTable table({chrom1, chrom2}, start_pos, end_pos); + + std::ignore = File::create(path.string(), table, true); + CHECK(utils::is_cooler(path.string())); // NOLINTNEXTLINE + CHECK(File(path.string()).attributes().generated_by->find("hictk") == 0); + CHECK(File(path.string()).attributes().bin_type.value() == "variable"); // NOLINT + } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) @@ -62,7 +82,7 @@ TEST_CASE("Cooler: file ctors", "[cooler][short]") { f.append_pixels(pixels.begin(), pixels.end(), true); } } - SECTION("open .cool") { + SECTION("open .cool (fixed bin size)") { const auto path = datadir / "cooler_test_file.cool"; const File f(path.string()); @@ -74,6 +94,18 @@ TEST_CASE("Cooler: file ctors", "[cooler][short]") { CHECK(f.has_pixel_of_type()); } + SECTION("open .cool (variable bin size)") { + const auto path = datadir / "cooler_variable_bins_test_file.cool"; + const File f(path.string()); + + CHECK(f.path() == path); + CHECK(f.uri() == path); + CHECK(f.bin_size() == 0); + CHECK(f.chromosomes().size() == 2); + CHECK(f.bins().size() == 8); + CHECK(f.has_pixel_of_type()); + } + SECTION("open .scool") { const auto path = datadir / "single_cell_cooler_test_file.scool"; diff --git a/test/units/cooler/file_pixels_test.cpp b/test/units/cooler/file_pixels_test.cpp index 9d224836..6f9d3b1a 100644 --- a/test/units/cooler/file_pixels_test.cpp +++ b/test/units/cooler/file_pixels_test.cpp @@ -29,7 +29,7 @@ TEST_CASE("Cooler: read/write pixels", "[cooler][long]") { std::mt19937_64 rand_eng{rd()}; auto pixel_it = expected.begin(); - do { + do { // NOLINT(cppcoreguidelines-avoid-do-while) const auto diff = std::distance(pixel_it, expected.end()); // Write pixels in chunks of random size const auto offset = diff --git a/test/units/pixel/pixel_test.cpp b/test/units/pixel/pixel_test.cpp index 539cbe28..1f7efff3 100644 --- a/test/units/pixel/pixel_test.cpp +++ b/test/units/pixel/pixel_test.cpp @@ -138,12 +138,14 @@ TEST_CASE("ThinPixel: parsers", "[pixel][short]") { } SECTION("invalid") { - CHECK_THROWS_WITH(Pixel::from_coo(bins, ""), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, ""), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); - CHECK_THROWS_WITH(Pixel::from_coo(bins, "chr1\t0\t10\tchr1\t10\t20\t1"), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "chr1\t0\t10\tchr1\t10\t20\t1"), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); - CHECK_THROWS_WITH(Pixel::from_coo(bins, "0\t1\tchr"), + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "0\t1\tchr"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr\"")); + CHECK_THROWS_WITH(ThinPixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } } @@ -159,10 +161,11 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { const BinTable bins{chroms, bin_size}; using N = std::uint32_t; - const Pixel expected{{bins.at(0), bins.at(1)}, 1}; + const Pixel expected1{{bins.at(0), bins.at(1)}, 1}; + const Pixel expected2{{bins.at(24895642), bins.at(24895642)}, 1}; SECTION("coo") { - SECTION("valid") { CHECK(Pixel::from_coo(bins, "0\t1\t1") == expected); } + SECTION("valid") { CHECK(Pixel::from_coo(bins, "0\t1\t1") == expected1); } SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_coo(bins, ""), Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); @@ -170,13 +173,17 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected exactly 3 fields")); CHECK_THROWS_WITH(Pixel::from_coo(bins, "0\t1\tchr"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr\"")); + CHECK_THROWS_WITH(Pixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } SECTION("bg2") { SECTION("valid") { - CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1") == expected); - CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1\ta\tb\tc") == expected); + CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1") == expected1); + CHECK(Pixel::from_bg2(bins, "chr1\t248956421\t248956422\tchr1\t248956421\t248956422\t1") == + expected2); + CHECK(Pixel::from_bg2(bins, "chr1\t0\t10\tchr1\t10\t20\t1\ta\tb\tc") == expected1); } SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_bg2(bins, "chr999\t0\t10\tchr1\t0\t10\t1"), @@ -187,11 +194,18 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected 7 or more fields, found 1")); CHECK_THROWS_WITH(Pixel::from_bg2(bins, "chr1\ta\t10\tchr1\t10\t20\t1"), Catch::Matchers::ContainsSubstring("Unable to convert field \"a\"")); + CHECK_THROWS_WITH(Pixel::from_coo(bins, "9999999999\t9999999999\t1"), + Catch::Matchers::ContainsSubstring("out of range")); } } SECTION("validpair") { - SECTION("valid") { CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t5\t+\tchr1\t15\t-")); } + SECTION("valid") { + CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t5\t+\tchr1\t15\t-")); + CHECK(Pixel::from_validpair(bins, "read_id\tchr1\t248956421\t+\tchr1\t248956421\t-") == + expected2); + } + SECTION("invalid") { CHECK_THROWS_WITH(Pixel::from_validpair(bins, ""), Catch::Matchers::ContainsSubstring("expected 6 or more fields, found 0")); @@ -201,6 +215,9 @@ TEST_CASE("Pixel: parsers", "[pixel][short]") { Catch::Matchers::ContainsSubstring("expected 6 or more fields, found 5")); CHECK_THROWS_WITH(Pixel::from_validpair(bins, "read_id\tchr1\tchr1\t+\tchr1\t15\t-"), Catch::Matchers::ContainsSubstring("Unable to convert field \"chr1\"")); + CHECK_THROWS_WITH( + Pixel::from_validpair(bins, "read_id\tchr1\t248956423\t+\tchr1\t248956423\t-"), + Catch::Matchers::ContainsSubstring("position is greater than chromosome size")); } } } diff --git a/test/units/reference/reference_test.cpp b/test/units/reference/reference_test.cpp index 5dc740fb..e13bbb14 100644 --- a/test/units/reference/reference_test.cpp +++ b/test/units/reference/reference_test.cpp @@ -115,6 +115,13 @@ TEST_CASE("Reference", "[reference][short]") { CHECK(chroms2.chromosome_with_longest_name().name() == "chr123"); CHECK(chroms2.longest_chromosome().name() == "chr1"); + + const auto& prefix_sum = chroms2.chrom_size_prefix_sum(); + REQUIRE(prefix_sum.size() == 4); + CHECK(prefix_sum[0] == 0); + CHECK(prefix_sum[1] == 1000); + CHECK(prefix_sum[2] == 1005); + CHECK(prefix_sum[3] == 1006); } } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) } // namespace hictk::test::reference