Skip to content

Commit

Permalink
Update hictk load to support variable bin sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Dec 6, 2023
1 parent 98e671b commit ad8766b
Show file tree
Hide file tree
Showing 13 changed files with 155 additions and 94 deletions.
27 changes: 20 additions & 7 deletions src/hictk/cli/cli_load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,25 @@ void Cli::make_load_subcommand() {
->check(CLI::ExistingFile)
->required();

sc.add_option(
"bin-size",
c.bin_size,
"Bin size (bp).")
->check(CLI::PositiveNumber)
->required();

sc.add_option(
"output-uri",
c.uri,
"Path to output Cooler (URI syntax supported).")
->required();

sc.add_option(
"-b,--bin-size",
c.bin_size,
"Bin size (bp).\n"
"Required when --bin-table is not used.")
->check(CLI::PositiveNumber);

sc.add_option(
"-t,--bin-table",
c.path_to_bin_table,
"Path to a BED3+ file with the bin table.")
->check(CLI::ExistingFile);

sc.add_option(
"-f,--format",
c.format,
Expand Down Expand Up @@ -93,6 +99,7 @@ void Cli::make_load_subcommand() {
->capture_default_str();
// clang-format on

sc.get_option("--bin-size")->excludes(sc.get_option("--bin-table"));
_config = std::monostate{};
}

Expand All @@ -107,6 +114,12 @@ void Cli::validate_load_subcommand() const {
FMT_STRING("Refusing to overwrite file {}. Pass --force to overwrite."), c.uri));
}

if (c.path_to_bin_table.empty() && c.path_to_chrom_sizes.empty()) {
assert(c.bin_size == 0);
errors.emplace_back(
"--bin-size is required when --bin-table is not specified.");
}

if (!errors.empty()) {
throw std::runtime_error(
fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI "
Expand Down
2 changes: 2 additions & 0 deletions src/hictk/include/hictk/tools/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ struct LoadConfig {
std::string uri{};

std::filesystem::path path_to_chrom_sizes{};
std::filesystem::path path_to_bin_table{};
std::uint32_t bin_size{};

std::string format{};
std::string assembly{"unknown"};
bool count_as_float{false};
Expand Down
63 changes: 47 additions & 16 deletions src/hictk/load/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,35 @@

namespace hictk::tools {

void ingest_pixels_sorted(const LoadConfig& c) {
[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes,
std::uint32_t bin_size) {
auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes);
return {chroms, bin_size};
}

[[nodiscard]] static BinTable init_bin_table(const std::filesystem::path& path_to_chrom_sizes,
const std::filesystem::path& path_to_bin_table) {
auto chroms = Reference::from_chrom_sizes(path_to_chrom_sizes);

std::ifstream ifs{};
ifs.exceptions(std::ios::badbit);
ifs.open(path_to_bin_table);

std::vector<std::uint32_t> start_pos{};
std::vector<std::uint32_t> end_pos{};

std::string line{};
GenomicInterval record{};
while (std::getline(ifs, line)) {
record = GenomicInterval::parse_bed(chroms, line);
start_pos.push_back(record.start());
end_pos.push_back(record.end());
}

return {chroms, start_pos, end_pos};
}

static void ingest_pixels_sorted(const LoadConfig& c) {
assert(c.assume_sorted);
auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes);
const auto format = format_from_string(c.format);
Expand All @@ -50,9 +78,11 @@ void ingest_pixels_sorted(const LoadConfig& c) {
format, c.batch_size, c.validate_pixels);
}

void ingest_pixels_unsorted(const LoadConfig& c) {
static void ingest_pixels_unsorted(const LoadConfig& c) {
assert(!c.assume_sorted);
auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes);
auto bins = c.path_to_bin_table.empty()
? init_bin_table(c.path_to_chrom_sizes, c.bin_size)
: init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table);
const auto format = format_from_string(c.format);

const auto tmp_cooler_path = c.uri + ".tmp";
Expand All @@ -70,8 +100,7 @@ void ingest_pixels_unsorted(const LoadConfig& c) {
[&](auto& buffer) {
using N = decltype(buffer.front().count);
{
auto tmp_clr =
cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force);
auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force);
for (std::size_t i = 0; true; ++i) {
SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1,
tmp_cooler_path);
Expand All @@ -92,22 +121,25 @@ void ingest_pixels_unsorted(const LoadConfig& c) {
std::filesystem::remove(tmp_cooler_path);
}

void ingest_pairs_sorted(const LoadConfig& c) {
static void ingest_pairs_sorted(const LoadConfig& c) {
assert(c.assume_sorted);
auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes);
auto bins = c.path_to_bin_table.empty()
? init_bin_table(c.path_to_chrom_sizes, c.bin_size)
: init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table);
const auto format = format_from_string(c.format);

c.count_as_float ? ingest_pairs_sorted<double>(
cooler::File::create<double>(c.uri, chroms, c.bin_size, c.force), format,
c.batch_size, c.validate_pixels)
: ingest_pairs_sorted<std::int32_t>(
cooler::File::create<std::int32_t>(c.uri, chroms, c.bin_size, c.force),
format, c.batch_size, c.validate_pixels);
c.count_as_float
? ingest_pairs_sorted<double>(cooler::File::create<double>(c.uri, bins, c.force), format,
c.batch_size, c.validate_pixels)
: ingest_pairs_sorted<std::int32_t>(cooler::File::create<std::int32_t>(c.uri, bins, c.force),
format, c.batch_size, c.validate_pixels);
}

static void ingest_pairs_unsorted(const LoadConfig& c) {
assert(!c.assume_sorted);
auto chroms = Reference::from_chrom_sizes(c.path_to_chrom_sizes);
auto bins = c.path_to_bin_table.empty()
? init_bin_table(c.path_to_chrom_sizes, c.bin_size)
: init_bin_table(c.path_to_chrom_sizes, c.path_to_bin_table);
const auto format = format_from_string(c.format);

const auto tmp_cooler_path = c.uri + ".tmp";
Expand All @@ -125,8 +157,7 @@ static void ingest_pairs_unsorted(const LoadConfig& c) {
[&](auto& buffer) {
using N = decltype(buffer.begin()->count);
{
auto tmp_clr =
cooler::SingleCellFile::create(tmp_cooler_path, chroms, c.bin_size, c.force);
auto tmp_clr = cooler::SingleCellFile::create(tmp_cooler_path, bins, c.force);

for (std::size_t i = 0; true; ++i) {
SPDLOG_INFO(FMT_STRING("writing chunk #{} to intermediate file \"{}\"..."), i + 1,
Expand Down
4 changes: 1 addition & 3 deletions src/libhictk/cooler/include/hictk/cooler/impl/file_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,10 @@ inline File::File(RootGroup entrypoint, [[maybe_unused]] PixelT pixel, Attribute
_groups = open_groups(_root_group);
_datasets = open_datasets(_root_group, cache_size_bytes, w0);

_bins = std::make_shared<BinTable>(
import_chroms(_datasets.at("chroms/name"), _datasets.at("chroms/length"), false), bin_size());
_bins = std::make_shared<BinTable>(init_bin_table(_datasets, *_attrs.bin_type, _attrs.bin_size));
_index = std::make_shared<Index>(_bins);

assert(std::holds_alternative<PixelT>(_pixel_variant));
assert(bin_size() != 0);
assert(!_bins->empty());
assert(!chromosomes().empty());
assert(!_index->empty());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,14 +532,13 @@ inline BinTable File::init_bin_table(const DatasetMap &dsets, std::string_view b
std::uint32_t bin_size) {
auto chroms = import_chroms(dsets.at("chroms/name"), dsets.at("chroms/length"), false);
if (bin_type == "fixed") {
return {BinTableFixed{std::move(chroms), bin_size}};
return {std::move(chroms), bin_size};
}
assert(bin_type == "variable");
assert(bin_size == 0);

return {BinTableVariable{std::move(chroms),
dsets.at("bins/start").read_all<std::vector<std::uint32_t>>(),
dsets.at("bins/end").read_all<std::vector<std::uint32_t>>()}};
return {std::move(chroms), dsets.at("bins/start").read_all<std::vector<std::uint32_t>>(),
dsets.at("bins/end").read_all<std::vector<std::uint32_t>>()};
}

inline Index File::init_index(const Dataset &chrom_offset_dset, const Dataset &bin_offset_dset,
Expand Down
13 changes: 8 additions & 5 deletions src/libhictk/cooler/include/hictk/cooler/impl/index_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ inline std::uint64_t Index::get_offset_by_row_idx(std::uint32_t chrom_id,
}

inline void Index::set(const Chromosome &chrom, OffsetVect offsets) {
const auto expected_size = (chrom.size() + bin_size() - 1) / bin_size();
const auto [fist_bin, last_bin] = _bins->find_overlap(chrom, 0, chrom.size());
const auto expected_size = static_cast<std::size_t>(std::distance(fist_bin, last_bin));
if (offsets.size() != expected_size) {
throw std::runtime_error(
fmt::format(FMT_STRING("expected index for {} to have size {}, found {}"), chrom,
Expand All @@ -127,9 +128,12 @@ inline void Index::set(const Chromosome &chrom, OffsetVect offsets) {
_idx.at(chrom) = std::move(offsets);
}

inline void Index::set_offset_by_bin(const Bin &bin, std::uint64_t offset) {
set_offset_by_row_idx(bin.chrom().id(), bin.rel_id(), offset);
}

inline void Index::set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset) {
const auto &bin = _bins->at(bin_id);
set_offset_by_pos(bin.chrom(), bin.start(), offset);
set_offset_by_bin(_bins->at(bin_id), offset);
}

inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos,
Expand All @@ -139,8 +143,7 @@ inline void Index::set_offset_by_pos(const Chromosome &chrom, std::uint32_t pos,

inline void Index::set_offset_by_pos(std::uint32_t chrom_id, std::uint32_t pos,
std::uint64_t offset) {
const auto row_idx = pos / bin_size();
set_offset_by_row_idx(chrom_id, row_idx, offset);
set_offset_by_bin(_bins->at(chrom_id, pos), offset);
}

inline void Index::set_offset_by_row_idx(std::uint32_t chrom_id, std::size_t row_idx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ namespace hictk::cooler {
inline SingleCellAttributes SingleCellAttributes::init(std::uint32_t bin_size_) {
SingleCellAttributes attrs{};
attrs.bin_size = bin_size_;
if (bin_size_ == 0) {
attrs.bin_type = "variable";
} else {
attrs.bin_type = "fixed";
}
return attrs;
}

Expand Down Expand Up @@ -66,13 +71,19 @@ inline SingleCellFile::SingleCellFile(HighFive::File fp, BinTable bins, SingleCe
_bins(std::make_shared<const BinTable>(std::move(bins))) {}

inline SingleCellFile::SingleCellFile(const std::filesystem::path& path, unsigned int mode)
: SingleCellFile(HighFive::File(path.string(), mode), read_bins(HighFive::File(path.string())),
: SingleCellFile(HighFive::File(path.string(), mode),
init_bin_table(HighFive::File(path.string())),
read_standard_attributes(HighFive::File(path.string()),
mode != HighFive::File::ReadOnly)) {}

inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path,
const Reference& chroms, std::uint32_t bin_size,
bool force_overwrite) {
return SingleCellFile::create(path, {BinTableFixed{chroms, bin_size}}, force_overwrite);
}

inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path, BinTable bins,
bool force_overwrite) {
if (!force_overwrite && std::filesystem::exists(path)) {
throw std::runtime_error(
fmt::format(FMT_STRING("unable to initialize file \"{}\": file already exists"), path));
Expand All @@ -82,12 +93,10 @@ inline SingleCellFile SingleCellFile::create(const std::filesystem::path& path,
std::filesystem::remove(path);
}

const BinTable bins(chroms, bin_size);

HighFive::File fp(path.string(), HighFive::File::Create);
RootGroup root_grp{fp.getGroup("/")};

auto attrs = SingleCellAttributes::init(bin_size);
auto attrs = SingleCellAttributes::init(bins.bin_size());

create_groups(root_grp);
create_datasets(root_grp, bins);
Expand Down Expand Up @@ -188,8 +197,7 @@ inline File SingleCellFile::aggregate(std::string_view uri, bool overwrite_if_ex
tails.emplace_back(std::move(last));
}
});
utils::merge(heads, tails, chromosomes(), bins().bin_size(), uri, overwrite_if_exists, chunk_size,
update_frequency);
utils::merge(heads, tails, bins(), uri, overwrite_if_exists, chunk_size, update_frequency);

return File(uri);
}
Expand Down Expand Up @@ -240,14 +248,22 @@ SingleCellFile::read_standard_attributes(const HighFive::File& f, bool initializ
}
DISABLE_WARNING_POP

inline BinTable SingleCellFile::read_bins(const HighFive::File& f) {
inline BinTable SingleCellFile::init_bin_table(const HighFive::File& f) {
[[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT
const RootGroup root_grp{f.getGroup("/")};
const auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")),
Dataset(root_grp, f.getDataSet("/chroms/length")), false);
const auto bin_size = Attribute::read<std::uint32_t>(root_grp(), "bin-size");
auto chroms = File::import_chroms(Dataset(root_grp, f.getDataSet("/chroms/name")),
Dataset(root_grp, f.getDataSet("/chroms/length")), false);
const auto bin_type = Attribute::read<std::string>(root_grp(), "bin-type");
if (bin_type == "fixed") {
const auto bin_size = Attribute::read<std::uint32_t>(root_grp(), "bin-size");

return {std::move(chroms), bin_size};
}

return {chroms, bin_size};
assert(bin_type == "variable");
return {std::move(chroms),
Dataset(root_grp, f.getDataSet("bins/start")).read_all<std::vector<std::uint32_t>>(),
Dataset(root_grp, f.getDataSet("bins/end")).read_all<std::vector<std::uint32_t>>()};
}

inline phmap::btree_set<std::string> SingleCellFile::read_cells(const HighFive::File& f) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,8 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o
}
}

const cooler::File clr(clrs.front().uri);
const auto chroms = clr.chromosomes();
const auto bin_size = clr.bin_size();
merge(heads, tails, chroms, bin_size, dest_uri, overwrite_if_exists, chunk_size,
update_frequency);
merge(heads, tails, cooler::File(clrs.front().uri).bins(), dest_uri, overwrite_if_exists,
chunk_size, update_frequency);
} catch (const std::exception& e) {
throw std::runtime_error(fmt::format(FMT_STRING("failed to merge {} cooler files: {}"),
std::distance(first_uri, last_uri), e.what()));
Expand All @@ -109,15 +106,15 @@ inline void merge(Str first_uri, Str last_uri, std::string_view dest_uri, bool o

template <typename PixelIt>
inline void merge(const std::vector<PixelIt>& heads, const std::vector<PixelIt>& tails,
const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri,
bool overwrite_if_exists, std::size_t chunk_size, std::size_t update_frequency) {
const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists,
std::size_t chunk_size, std::size_t update_frequency) {
using N = remove_cvref_t<decltype(heads.front()->count)>;

hictk::transformers::PixelMerger merger{heads, tails};
std::vector<ThinPixel<N>> buffer(chunk_size);
buffer.clear();

auto dest = File::create<N>(dest_uri, chromosomes, bin_size, overwrite_if_exists);
auto dest = File::create<N>(dest_uri, bins, overwrite_if_exists);

std::size_t pixels_processed{};
auto t0 = std::chrono::steady_clock::now();
Expand Down
1 change: 1 addition & 0 deletions src/libhictk/cooler/include/hictk/cooler/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class Index {
std::size_t row_idx) const;

void set(const Chromosome& chrom, OffsetVect offsets);
void set_offset_by_bin(const Bin& bin, std::uint64_t offset);
void set_offset_by_bin_id(std::uint64_t bin_id, std::uint64_t offset);

void set_offset_by_pos(const Chromosome& chrom, std::uint32_t pos, std::uint64_t offset);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class SingleCellFile {
unsigned int mode = HighFive::File::ReadOnly);
[[nodiscard]] static SingleCellFile create(const std::filesystem::path& path,
const Reference& chroms, std::uint32_t bin_size,
bool force_overwrite);
[[nodiscard]] static SingleCellFile create(const std::filesystem::path& path, BinTable bins,
bool force_overwrite = false);

[[nodiscard]] constexpr const phmap::btree_set<std::string>& cells() const noexcept;
Expand All @@ -83,7 +85,7 @@ class SingleCellFile {
private:
[[nodiscard]] static SingleCellAttributes read_standard_attributes(const HighFive::File& f,
bool initialize_missing);
[[nodiscard]] static BinTable read_bins(const HighFive::File& f);
[[nodiscard]] static BinTable init_bin_table(const HighFive::File& f);
[[nodiscard]] static phmap::btree_set<std::string> read_cells(const HighFive::File& f);

static void create_groups(RootGroup& root_grp);
Expand Down
5 changes: 2 additions & 3 deletions src/libhictk/cooler/include/hictk/cooler/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ void merge(Str first_file, Str last_file, std::string_view dest_uri,

template <typename PixelIt>
void merge(const std::vector<PixelIt>& heads, const std::vector<PixelIt>& tails,
const Reference& chromosomes, std::uint32_t bin_size, std::string_view dest_uri,
bool overwrite_if_exists = false, std::size_t chunk_size = 500'000,
std::size_t update_frequency = 10'000'000);
const BinTable& bins, std::string_view dest_uri, bool overwrite_if_exists = false,
std::size_t chunk_size = 500'000, std::size_t update_frequency = 10'000'000);

[[nodiscard]] bool equal(std::string_view uri1, std::string_view uri2,
bool ignore_attributes = true);
Expand Down
Loading

0 comments on commit ad8766b

Please sign in to comment.