diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index 0c1eb4b6..faaf1529 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -35,19 +35,25 @@ void Cli::make_load_subcommand() { ->check(CLI::ExistingFile) ->required(); - sc.add_option( - "bin-size", - c.bin_size, - "Bin size (bp).") - ->check(CLI::PositiveNumber) - ->required(); - sc.add_option( "output-uri", c.uri, "Path to output Cooler (URI syntax supported).") ->required(); + sc.add_option( + "-b,--bin-size", + c.bin_size, + "Bin size (bp).\n" + "Required when --bin-table is not used.") + ->check(CLI::PositiveNumber); + + sc.add_option( + "-t,--bin-table", + c.path_to_bin_table, + "Path to a BED3+ file with the bin table.") + ->check(CLI::ExistingFile); + sc.add_option( "-f,--format", c.format, @@ -93,6 +99,7 @@ void Cli::make_load_subcommand() { ->capture_default_str(); // clang-format on + sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table")); _config = std::monostate{}; } @@ -107,6 +114,12 @@ void Cli::validate_load_subcommand() const { FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri)); } + if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) { + assert(c.bin_size == 0); + errors.emplace_back( + "--bin-size is required when --bin-table is not specified."); + } + if (!errors.empty()) { throw std::runtime_error( fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI " diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 66b07928..bb775f48 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -103,7 +103,9 @@ struct LoadConfig { std::string uri{}; std::filesystem::path path_to_chrom_sizes{}; + std::filesystem::path path_to_bin_table{}; std::uint32_t bin_size{}; + std::string format{}; std::string assembly{"unknown"}; bool count_as_float{false}; diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 1770f38f..1575dbf8 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -37,7 +37,35 @@ namespace hictk::tools { -void ingest_pixels_sorted(const LoadConfig& c) { +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + std::uint32_t bin_size) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + return {chroms, bin_size}; +} + +[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes, + const std::filesystem::path& path_to_bin_table) { + auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes); + + std::ifstream ifs{}; + ifs.exceptions(std::ios::badbit); + ifs.open(path_to_bin_table); + + std::vector start_pos{}; + std::vector end_pos{}; + + std::string line{}; + GenomicInterval record{}; + while (std::getline(ifs, line)) { + record = GenomicInterval::parse_bed(chroms, line); + start_pos.push_back(record.start()); + end_pos.push_back(record.end()); + } + + return {chroms, start_pos, end_pos}; +} + +static void ingest_pixels_sorted(const LoadConfig& c) { assert(c.assume_sorted); auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); const auto format = format_from_string(c.format); @@ -50,9 +78,11 @@ void ingest_pixels_sorted(const LoadConfig& c) { format, c.batch_size, c.validate_pixels); } -void ingest_pixels_unsorted(const LoadConfig& c) { +static void ingest_pixels_unsorted(const LoadConfig& c) { assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -70,8 +100,7 @@ void ingest_pixels_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.front().count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, tmp_cooler_path); @@ -92,22 +121,25 @@ void ingest_pixels_unsorted(const LoadConfig& c) { std::filesystem::remove(tmp_cooler_path); } -void ingest_pairs_sorted(const LoadConfig& c) { +static void ingest_pairs_sorted(const LoadConfig& c) { assert(c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); - c.count_as_float ? ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), format, - c.batch_size, c.validate_pixels) - : ingest_pairs_sorted( - cooler::File::create(c.uri, chroms, c.bin_size, c.force), - format, c.batch_size, c.validate_pixels); + c.count_as_float + ? ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), format, + c.batch_size, c.validate_pixels) + : ingest_pairs_sorted(cooler::File::create(c.uri, bins, c.force), + format, c.batch_size, c.validate_pixels); } static void ingest_pairs_unsorted(const LoadConfig& c) { assert(!c.assume_sorted); - auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes); + auto bins = c.path_to_bin_table.empty() + ? init_bin_table(c.path_to_chrom_sizes, c.bin_size) + : init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table); const auto format = format_from_string(c.format); const auto tmp_cooler_path = c.uri + ".tmp"; @@ -125,8 +157,7 @@ static void ingest_pairs_unsorted(const LoadConfig& c) { [&](auto& buffer) { using N = decltype(buffer.begin()->count); { - auto tmp_clr = - cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force); + auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force); for (std::size_t i = 0; true; ++i) { SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp index bb5f1d62..5f51b4f3 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp @@ -84,12 +84,10 @@ inline File::File(RootGroup entrypoint, [[maybe_unused]] PixelT pixel, Attribute _groups = open_groups(_root_group); _datasets = open_datasets(_root_group, cache_size_bytes, w0); - _bins = std::make_shared( - import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), bin_size()); + _bins = std::make_shared(init_bin_table(_datasets, *_attrs.bin_type, _attrs.bin_size)); _index = std::make_shared(_bins); assert(std::holds_alternative(_pixel_variant)); - assert(bin_size() != 0); assert(!_bins->empty()); assert(!chromosomes().empty()); assert(!_index->empty()); diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp index f8b75d65..9c837e41 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/file_read_impl.hpp @@ -532,14 +532,13 @@ inline BinTable File::init_bin_table(const DatasetMap &dsets, std::string_view b std::uint32_t bin_size) { auto chroms = import_chroms(dsets.at("chroms/name"), dsets.at("chroms/length"), false); if (bin_type == "fixed") { - return {BinTableFixed{std::move(chroms), bin_size}}; + return {std::move(chroms), bin_size}; } assert(bin_type == "variable"); assert(bin_size == 0); - return {BinTableVariable{std::move(chroms), - dsets.at("bins/start").read_all>(), - dsets.at("bins/end").read_all>()}}; + return {std::move(chroms), dsets.at("bins/start").read_all>(), + dsets.at("bins/end").read_all>()}; } inline Index File::init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp index 4bd4848b..66dff5d7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp @@ -118,7 +118,8 @@ inline std::uint64_t Index::get_offset_by_row_idx(std::uint32_t chrom_id, } inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { - const auto expected_size = (chrom.size() + bin_size() - 1) / bin_size(); + const auto [fist_bin, last_bin] = _bins->find_overlap(chrom, 0, chrom.size()); + const auto expected_size = static_cast(std::distance(fist_bin, last_bin)); if (offsets.size() != expected_size) { throw std::runtime_error( fmt::format(FMT_STRING("expected index for {} to have size {}, found {}"), chrom, @@ -127,9 +128,12 @@ inline void Index::set(const Chromosome &chrom, OffsetVect offsets) { _idx.at(chrom) = std::move(offsets); } +inline void Index::set_offset_by_bin(const Bin &bin, std::uint64_t offset) { + set_offset_by_row_idx(bin.chrom().id(), bin.rel_id(), offset); +} + inline void Index::set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset) { - const auto &bin = _bins->at(bin_id); - set_offset_by_pos(bin.chrom(), bin.start(), offset); + set_offset_by_bin(_bins->at(bin_id), offset); } inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, @@ -139,8 +143,7 @@ inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos, inline void Index::set_offset_by_pos(std::uint32_t chrom_id, std::uint32_t pos, std::uint64_t offset) { - const auto row_idx = pos / bin_size(); - set_offset_by_row_idx(chrom_id, row_idx, offset); + set_offset_by_bin(_bins->at(chrom_id, pos), offset); } inline void Index::set_offset_by_row_idx(std::uint32_t chrom_id, std::size_t row_idx, diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp index 7d116960..5d6483f4 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/singlecell_cooler_impl.hpp @@ -25,6 +25,11 @@ namespace hictk::cooler { inline SingleCellAttributes SingleCellAttributes::init(std::uint32_t bin_size_) { SingleCellAttributes attrs{}; attrs.bin_size = bin_size_; + if (bin_size_ == 0) { + attrs.bin_type = "variable"; + } else { + attrs.bin_type = "fixed"; + } return attrs; } @@ -66,13 +71,19 @@ inline SingleCellFile::SingleCellFile(HighFive::File fp, BinTable bins, SingleCe _bins(std::make_shared(std::move(bins))) {} inline SingleCellFile::SingleCellFile(const std::filesystem::path& path, unsigned int mode) - : SingleCellFile(HighFive::File(path.string(), mode), read_bins(HighFive::File(path.string())), + : SingleCellFile(HighFive::File(path.string(), mode), + init_bin_table(HighFive::File(path.string())), read_standard_attributes(HighFive::File(path.string()), mode != HighFive::File::ReadOnly)) {} inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, bool force_overwrite) { + return SingleCellFile::create(path, {BinTableFixed{chroms, bin_size}}, force_overwrite); +} + +inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, BinTable bins, + bool force_overwrite) { if (!force_overwrite && std::filesystem::exists(path)) { throw std::runtime_error( fmt::format(FMT_STRING("unable to initialize file \"{}\": file already exists"), path)); @@ -82,12 +93,10 @@ inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, std::filesystem::remove(path); } - const BinTable bins(chroms, bin_size); - HighFive::File fp(path.string(), HighFive::File::Create); RootGroup root_grp{fp.getGroup("/")}; - auto attrs = SingleCellAttributes::init(bin_size); + auto attrs = SingleCellAttributes::init(bins.bin_size()); create_groups(root_grp); create_datasets(root_grp, bins); @@ -188,8 +197,7 @@ inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_ex tails.emplace_back(std::move(last)); } }); - utils::merge(heads, tails, chromosomes(), bins().bin_size(), uri, overwrite_if_exists, chunk_size, - update_frequency); + utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency); return File(uri); } @@ -240,14 +248,22 @@ SingleCellFile::read_standard_attributes(const HighFive::File& f, bool initializ } DISABLE_WARNING_POP -inline BinTable SingleCellFile::read_bins(const HighFive::File& f) { +inline BinTable SingleCellFile::init_bin_table(const HighFive::File& f) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT const RootGroup root_grp{f.getGroup("/")}; - const auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), - Dataset(root_grp, f.getDataSet("/chroms/length")), false); - const auto bin_size = Attribute::read(root_grp(), "bin-size"); + auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")), + Dataset(root_grp, f.getDataSet("/chroms/length")), false); + const auto bin_type = Attribute::read(root_grp(), "bin-type"); + if (bin_type == "fixed") { + const auto bin_size = Attribute::read(root_grp(), "bin-size"); + + return {std::move(chroms), bin_size}; + } - return {chroms, bin_size}; + assert(bin_type == "variable"); + return {std::move(chroms), + Dataset(root_grp, f.getDataSet("bins/start")).read_all>(), + Dataset(root_grp, f.getDataSet("bins/end")).read_all>()}; } inline phmap::btree_set SingleCellFile::read_cells(const HighFive::File& f) { diff --git a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp index 489e81af..c9627243 100644 --- a/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/impl/utils_merge_impl.hpp @@ -96,11 +96,8 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o } } - const cooler::File clr(clrs.front().uri); - const auto chroms = clr.chromosomes(); - const auto bin_size = clr.bin_size(); - merge(heads, tails, chroms, bin_size, dest_uri, overwrite_if_exists, chunk_size, - update_frequency); + merge(heads, tails, cooler::File(clrs.front().uri).bins(), dest_uri, overwrite_if_exists, + chunk_size, update_frequency); } catch (const std::exception& e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), std::distance(first_uri, last_uri), e.what())); @@ -109,15 +106,15 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o template inline void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists, std::size_t chunk_size, std::size_t update_frequency) { + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists, + std::size_t chunk_size, std::size_t update_frequency) { using N = remove_cvref_tcount)>; hictk::transformers::PixelMerger merger{heads, tails}; std::vector> buffer(chunk_size); buffer.clear(); - auto dest = File::create(dest_uri, chromosomes, bin_size, overwrite_if_exists); + auto dest = File::create(dest_uri, bins, overwrite_if_exists); std::size_t pixels_processed{}; auto t0 = std::chrono::steady_clock::now(); diff --git a/src/libhictk/cooler/include/hictk/cooler/index.hpp b/src/libhictk/cooler/include/hictk/cooler/index.hpp index 4aae8ee1..c35b14e9 100644 --- a/src/libhictk/cooler/include/hictk/cooler/index.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/index.hpp @@ -84,6 +84,7 @@ class Index { std::size_t row_idx) const; void set(const Chromosome& chrom, OffsetVect offsets); + void set_offset_by_bin(const Bin& bin, std::uint64_t offset); void set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset); void set_offset_by_pos(const Chromosome& chrom, std::uint32_t pos, std::uint64_t offset); diff --git a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp index 86228e5b..14a840ce 100644 --- a/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/singlecell_cooler.hpp @@ -62,6 +62,8 @@ class SingleCellFile { unsigned int mode = HighFive::File::ReadOnly); [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, const Reference& chroms, std::uint32_t bin_size, + bool force_overwrite); + [[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, BinTable bins, bool force_overwrite = false); [[nodiscard]] constexpr const phmap::btree_set& cells() const noexcept; @@ -83,7 +85,7 @@ class SingleCellFile { private: [[nodiscard]] static SingleCellAttributes read_standard_attributes(const HighFive::File& f, bool initialize_missing); - [[nodiscard]] static BinTable read_bins(const HighFive::File& f); + [[nodiscard]] static BinTable init_bin_table(const HighFive::File& f); [[nodiscard]] static phmap::btree_set read_cells(const HighFive::File& f); static void create_groups(RootGroup& root_grp); diff --git a/src/libhictk/cooler/include/hictk/cooler/utils.hpp b/src/libhictk/cooler/include/hictk/cooler/utils.hpp index fe2d6075..af26b6e7 100644 --- a/src/libhictk/cooler/include/hictk/cooler/utils.hpp +++ b/src/libhictk/cooler/include/hictk/cooler/utils.hpp @@ -23,9 +23,8 @@ void merge(Str first_file, Str last_file, std::string_view dest_uri, template void merge(const std::vector& heads, const std::vector& tails, - const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri, - bool overwrite_if_exists = false, std::size_t chunk_size = 500'000, - std::size_t update_frequency = 10'000'000); + const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists = false, + std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000); [[nodiscard]] bool equal(std::string_view uri1, std::string_view uri2, bool ignore_attributes = true); diff --git a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp index 05f49829..c5805937 100644 --- a/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp +++ b/src/libhictk/genomic_interval/include/hictk/genomic_interval.hpp @@ -25,8 +25,8 @@ class GenomicInterval { GenomicInterval(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; [[nodiscard]] static GenomicInterval parse(const Reference &chroms, std::string query, Type type = Type::UCSC); - [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string query); - [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view query, + [[nodiscard]] static GenomicInterval parse_ucsc(const Reference &chroms, std::string buffer); + [[nodiscard]] static GenomicInterval parse_bed(const Reference &chroms, std::string_view buffer, char sep = '\t'); [[nodiscard]] explicit operator bool() const noexcept; diff --git a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp index d5126183..f27762c5 100644 --- a/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp +++ b/src/libhictk/genomic_interval/include/hictk/impl/genomic_interval_impl.hpp @@ -104,69 +104,69 @@ inline GenomicInterval GenomicInterval::parse(const Reference &chroms, std::stri return GenomicInterval::parse_bed(chroms, query); } -inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string query) { - if (query.empty()) { +inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std::string buffer) { + if (buffer.empty()) { throw std::runtime_error("query is empty"); } - if (const auto match = chroms.find(query); match != chroms.end()) { + if (const auto match = chroms.find(buffer); match != chroms.end()) { return GenomicInterval{*match}; } - const auto p1 = query.find_last_of(':'); - auto p2 = query.find_last_of('-'); + const auto p1 = buffer.find_last_of(':'); + auto p2 = buffer.find_last_of('-'); if (p1 == std::string::npos && p2 == std::string::npos) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), query)); + fmt::format(FMT_STRING("invalid chromosome \"{0}\" in query \"{0}\""), buffer)); } if (p1 == std::string::npos || p2 == std::string::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), buffer)); } - if (query.find(',', p1) != std::string::npos) { - query.erase(std::remove(query.begin() + std::ptrdiff_t(p1), query.end(), ','), query.end()); - p2 = query.find_last_of('-'); + if (buffer.find(',', p1) != std::string::npos) { + buffer.erase(std::remove(buffer.begin() + std::ptrdiff_t(p1), buffer.end(), ','), buffer.end()); + p2 = buffer.find_last_of('-'); } - query[p1] = '\t'; - query[p2] = '\t'; + buffer[p1] = '\t'; + buffer[p2] = '\t'; - return GenomicInterval::parse_bed(chroms, query); + return GenomicInterval::parse_bed(chroms, buffer); } -inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view query, +inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std::string_view buffer, char sep) { - if (query.empty()) { - throw std::runtime_error("query is empty"); + if (buffer.empty()) { + throw std::runtime_error("interval is empty"); } - const auto p1 = query.find(sep); - const auto p2 = query.find(sep, p1 + 1); + const auto p1 = buffer.find(sep); + const auto p2 = buffer.find(sep, p1 + 1); if (p1 == std::string_view::npos || p2 == std::string_view::npos || p1 > p2) { - throw std::runtime_error(fmt::format(FMT_STRING("query \"{}\" is malformed"), query)); + throw std::runtime_error(fmt::format(FMT_STRING("interval \"{}\" is malformed"), buffer)); } - const auto chrom_name = query.substr(0, p1); - const auto start_pos_str = query.substr(p1 + 1, p2 - (p1 + 1)); - const auto end_pos_str = query.substr(p2 + 1); + const auto chrom_name = buffer.substr(0, p1); + const auto start_pos_str = buffer.substr(p1 + 1, p2 - (p1 + 1)); + const auto end_pos_str = buffer.substr(p2 + 1); const auto match = chroms.find(chrom_name); if (match == chroms.end()) { - throw std::runtime_error( - fmt::format(FMT_STRING("invalid chromosome \"{}\" in query \"{}\""), chrom_name, query)); + throw std::runtime_error(fmt::format(FMT_STRING("invalid chromosome \"{}\" in interval \"{}\""), + chrom_name, buffer)); } if (start_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing start position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing start position"), buffer)); } if (end_pos_str.empty()) { throw std::runtime_error( - fmt::format(FMT_STRING("query \"{}\" is malformed: missing end position"), query)); + fmt::format(FMT_STRING("interval \"{}\" is malformed: missing end position"), buffer)); } GenomicInterval gi{*match}; @@ -175,29 +175,29 @@ inline GenomicInterval GenomicInterval::parse_bed(const Reference &chroms, std:: internal::parse_numeric_or_throw(start_pos_str, gi._start); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid start position \"{}\" in query \"{}\": {}"), start_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid start position \"{}\" in interval \"{}\": {}"), + start_pos_str, buffer, e.what())); } try { internal::parse_numeric_or_throw(end_pos_str, gi._end); } catch (const std::exception &e) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{}\" in query \"{}\": {}"), end_pos_str, - query, e.what())); + fmt::format(FMT_STRING("invalid end position \"{}\" in interval \"{}\": {}"), end_pos_str, + buffer, e.what())); } if (gi._end > gi.chrom().size()) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid end position \"{0}\" in query \"{1}\": end position is " + fmt::format(FMT_STRING("invalid end position \"{0}\" in interval \"{1}\": end position is " "greater than the chromosome size ({0} > {2})"), - gi._end, query, gi.chrom().size())); + gi._end, buffer, gi.chrom().size())); } if (gi._start >= gi._end) { throw std::runtime_error( - fmt::format(FMT_STRING("invalid query \"{}\": query end position should be " + fmt::format(FMT_STRING("invalid interval \"{}\": query end position should be " "greater than the start position ({} >= {})"), - query, gi._start, gi._end)); + buffer, gi._start, gi._end)); } return gi;