Skip to content

Commit

Permalink
Add --skip-all-vs-all flag to hictk load
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Jan 29, 2024
1 parent f091897 commit 7da082d
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 25 deletions.
3 changes: 3 additions & 0 deletions docs/cli_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,9 @@ hictk load
By default coordinates are assumed to be one-based for interactions in
4dn and validapairs formats and zero-based otherwise.
--count-as-float Interactions are floats.
--skip-all-vs-all,--no-skip-all-vs-all{false}
Do not generate All vs All matrix.
Has no effect when creating .cool files.
--assume-sorted,--assume-unsorted{false}
Assume input files are already sorted.
--chunk-size UINT [10000000]
Expand Down
7 changes: 7 additions & 0 deletions src/hictk/cli/cli_load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,13 @@ void Cli::make_load_subcommand() {
"Interactions are floats.")
->capture_default_str();

// Register a boolean flag bound to c.skip_all_vs_all_matrix.
// The first name sets the option; the '!'-prefixed second name is its negated
// spelling (passing --no-skip-all-vs-all stores false).
// The description states that the flag only matters for outputs other than .cool.
sc.add_flag(
"--skip-all-vs-all,!--no-skip-all-vs-all",
c.skip_all_vs_all_matrix,
"Do not generate All vs All matrix.\n"
"Has no effect when creating .cool files.")
// Record the current value of the bound variable as the default shown in the help text.
->capture_default_str();

sc.add_flag(
"--assume-sorted,!--assume-unsorted",
c.assume_sorted,
Expand Down
1 change: 1 addition & 0 deletions src/hictk/include/hictk/tools/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ struct LoadConfig {
bool assume_sorted{false};
bool force{false};
bool validate_pixels{true};
bool skip_all_vs_all_matrix{true};

std::string output_format{};

Expand Down
6 changes: 4 additions & 2 deletions src/hictk/load/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ static Stats ingest_pixels_hic(const LoadConfig& c) {

[[maybe_unused]] const internal::TmpDir tmpdir{c.tmp_dir};
return ingest_pixels_hic(c.output_path, c.tmp_dir, chroms, c.bin_size, c.assembly, c.offset,
format, c.threads, c.batch_size, c.compression_lvl, c.force);
c.skip_all_vs_all_matrix, format, c.threads, c.batch_size,
c.compression_lvl, c.force);
}

static Stats ingest_pixels_cooler(const LoadConfig& c) {
Expand Down Expand Up @@ -116,7 +117,8 @@ static Stats ingest_pairs_hic(const LoadConfig& c) {

[[maybe_unused]] const internal::TmpDir tmpdir{c.tmp_dir};
return ingest_pairs_hic(c.output_path, c.tmp_dir, chroms, c.bin_size, c.assembly, c.offset,
format, c.threads, c.batch_size, c.compression_lvl, c.force);
c.skip_all_vs_all_matrix, format, c.threads, c.batch_size,
c.compression_lvl, c.force);
}

static Stats ingest_pixels(const LoadConfig& c) {
Expand Down
16 changes: 8 additions & 8 deletions src/hictk/load/load_hic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,33 @@ namespace hictk::tools {

// Load pixels into a .hic file created at `uri` and return the Stats produced by
// ingest_pixels().
//
// NOTE(review): this span comes from a rendered diff, so the tail of the parameter
// list and the tail of the HiCFileWriter construction each appear twice: first
// without, then with, skip_all_vs_all_matrix. The second occurrence is the
// post-change version.
//
// Parameters (as used below):
//   uri                     path of the output file; removed first when `force` is set.
//   tmp_dir                 forwarded to the writer as its temporary directory.
//   chromosomes, assembly   forwarded to the writer unchanged.
//   bin_size                the single resolution handed to the writer ({bin_size}).
//   offset, format          forwarded to ingest_pixels().
//   skip_all_vs_all_matrix  forwarded to the writer (post-change version only).
//   threads                 forwarded to the writer as its thread count.
//   batch_size              used both as the writer's chunk size and as the number of
//                           ThinPixel<float> slots in the local write buffer.
//   compression_lvl         forwarded to the writer.
//   force                   when true, remove any existing file at `uri` beforehand.
static Stats ingest_pixels_hic(std::string_view uri, const std::filesystem::path& tmp_dir,
const Reference& chromosomes, std::uint32_t bin_size,
const std::string& assembly, std::int64_t offset, Format format,
std::size_t threads, std::size_t batch_size,
std::uint32_t compression_lvl, bool force) {
const std::string& assembly, std::int64_t offset,
bool skip_all_vs_all_matrix, Format format, std::size_t threads,
std::size_t batch_size, std::uint32_t compression_lvl, bool force) {
SPDLOG_INFO(FMT_STRING("begin loading pixels into a .hic file..."));

// Overwrite mode: delete a pre-existing output before the writer creates the file.
if (force) {
std::filesystem::remove(uri);
}

// Writer configured with exactly one resolution (bin_size).
hic::internal::HiCFileWriter hf(uri, chromosomes, {bin_size}, assembly, threads, batch_size,
tmp_dir, compression_lvl);
tmp_dir, compression_lvl, skip_all_vs_all_matrix);

// Buffer sized to hold batch_size pixels; the writer is moved into ingest_pixels().
std::vector<ThinPixel<float>> write_buffer(batch_size);
return ingest_pixels(std::move(hf), write_buffer, format, offset);
}

inline Stats ingest_pairs_hic(std::string_view uri, const std::filesystem::path& tmp_dir,
const Reference& chromosomes, std::uint32_t bin_size,
const std::string& assembly, std::int64_t offset, Format format,
std::size_t threads, std::size_t batch_size,
std::uint32_t compression_lvl, bool force) {
const std::string& assembly, std::int64_t offset,
bool skip_all_vs_all_matrix, Format format, std::size_t threads,
std::size_t batch_size, std::uint32_t compression_lvl, bool force) {
if (force) {
std::filesystem::remove(uri);
}

hic::internal::HiCFileWriter hf(uri, chromosomes, {bin_size}, assembly, threads, batch_size,
tmp_dir, compression_lvl);
tmp_dir, compression_lvl, skip_all_vs_all_matrix);

std::vector<ThinPixel<float>> buffer(batch_size);
return ingest_pairs(std::move(hf), buffer, format, offset);
Expand Down
10 changes: 7 additions & 3 deletions src/libhictk/hic/include/hictk/hic/file_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ class HiCFileWriter {

BS::thread_pool _tpool{};

bool _skip_all_vs_all_matrix{};

static constexpr std::uint32_t DEFAULT_CHROM_ALL_SCALE_FACTOR{1000};

public:
Expand All @@ -147,7 +149,8 @@ class HiCFileWriter {
std::vector<std::uint32_t> resolutions_, std::string_view assembly_ = "unknown",
std::size_t n_threads = 1, std::size_t chunk_size = 10'000'000,
const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(),
std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000);
std::uint32_t compression_lvl = 12, bool skip_all_vs_all_matrix = false,
std::size_t buffer_size = 32'000'000);

[[nodiscard]] std::string_view path() const noexcept;
[[nodiscard]] const Reference& chromosomes() const noexcept;
Expand Down Expand Up @@ -184,7 +187,8 @@ class HiCFileWriter {
[[nodiscard]] static HiCHeader read_header(filestream::FileStream& fs);
[[nodiscard]] static HiCHeader init_header(std::string_view path, Reference chromosomes,
std::vector<std::uint32_t> resolutions,
std::string_view assembly);
std::string_view assembly,
bool skip_all_vs_all_matrix);
[[nodiscard]] static auto init_bin_tables(const Reference& chromosomes,
const std::vector<std::uint32_t>& resolutions)
-> BinTables;
Expand All @@ -200,7 +204,7 @@ class HiCFileWriter {
void write_norm_vector_index();

// Write pixels
void write_pixels();
void write_pixels(bool skip_all_vs_all_matrix);
auto write_pixels(const Chromosome& chrom1, const Chromosome& chrom2) -> HiCSectionOffsets;
auto write_pixels(const Chromosome& chrom1, const Chromosome& chrom2, std::uint32_t resolution)
-> HiCSectionOffsets;
Expand Down
27 changes: 19 additions & 8 deletions src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,19 @@ inline HiCFileWriter::HiCFileWriter(std::string_view path_, Reference chromosome
std::vector<std::uint32_t> resolutions_,
std::string_view assembly_, std::size_t n_threads,
std::size_t chunk_size, const std::filesystem::path &tmpdir,
std::uint32_t compression_lvl, std::size_t buffer_size)
std::uint32_t compression_lvl, bool skip_all_vs_all_matrix,
std::size_t buffer_size)
: _fs(filestream::FileStream::create(std::string{path_})),
_tmpdir(tmpdir),
_header(init_header(path_, std::move(chromosomes_), std::move(resolutions_), assembly_)),
_header(init_header(path_, std::move(chromosomes_), std::move(resolutions_), assembly_,
skip_all_vs_all_matrix)),
_bin_tables(init_bin_tables(chromosomes(), resolutions())),
_block_mappers(init_interaction_block_mappers(_tmpdir, _bin_tables, chunk_size, 3)),
_compression_lvl(compression_lvl),
_compressor(libdeflate_alloc_compressor(static_cast<std::int32_t>(compression_lvl))),
_compression_buffer(buffer_size, '\0'),
_tpool(init_tpool(n_threads)) {
_tpool(init_tpool(n_threads)),
_skip_all_vs_all_matrix(skip_all_vs_all_matrix) {
if (!std::filesystem::exists(_tmpdir)) {
throw std::runtime_error(
fmt::format(FMT_STRING("temporary directory {} does not exist"), _tmpdir));
Expand Down Expand Up @@ -216,7 +219,7 @@ inline auto HiCFileWriter::stats(std::uint32_t resolution) const noexcept -> Sta
inline void HiCFileWriter::serialize() {
try {
write_header();
write_pixels();
write_pixels(_skip_all_vs_all_matrix);
finalize(true);
for (auto &[_, mapper] : _block_mappers) {
mapper.clear();
Expand Down Expand Up @@ -315,7 +318,7 @@ inline void HiCFileWriter::add_pixels(std::uint32_t resolution, PixelIt first_pi
}
}

inline void HiCFileWriter::write_pixels() {
inline void HiCFileWriter::write_pixels(bool skip_all_vs_all_matrix) {
SPDLOG_INFO(FMT_STRING("begin writing interaction blocks to file \"{}\"..."), path());
const auto &chrom_idx = _block_mappers.at(resolutions().front()).chromosome_index();
std::vector<std::pair<Chromosome, Chromosome>> chroms{chrom_idx.size()};
Expand All @@ -329,7 +332,10 @@ inline void HiCFileWriter::write_pixels() {
}
write_pixels(chrom1, chrom2);
}
write_all_matrix();

if (!skip_all_vs_all_matrix) {
write_all_matrix();
}
}

inline void HiCFileWriter::write_all_matrix(std::uint32_t target_num_bins) {
Expand Down Expand Up @@ -1041,8 +1047,13 @@ inline HiCHeader HiCFileWriter::read_header(filestream::FileStream &fs) {

inline HiCHeader HiCFileWriter::init_header(std::string_view path, Reference chromosomes,
std::vector<std::uint32_t> resolutions,
std::string_view assembly) {
chromosomes = chromosomes.add_ALL(DEFAULT_CHROM_ALL_SCALE_FACTOR);
std::string_view assembly,
bool skip_all_vs_all_matrix) {
if (skip_all_vs_all_matrix) {
chromosomes = chromosomes.remove_ALL();
} else {
chromosomes = chromosomes.add_ALL(DEFAULT_CHROM_ALL_SCALE_FACTOR);
}
return {
std::string{path}, // url
9, // version
Expand Down
11 changes: 7 additions & 4 deletions test/units/hic/file_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,11 +107,14 @@ static void hic_file_writer_compare_pixels(const std::vector<Pixel<float>>& expe
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
static void hic_file_writer_create_file_test(const std::string& path1, const std::string& path2,
const std::vector<std::uint32_t>& resolutions,
std::size_t num_threads) {
std::size_t num_threads, bool skip_all_vs_all_matrix) {
{
const auto chromosomes = hic::File(path1, resolutions.front()).chromosomes();
const auto tmpdir = testdir() / (path1 + ".tmp");
std::filesystem::create_directories(tmpdir);
std::filesystem::remove(path2);
HiCFileWriter w(path2, chromosomes, resolutions, "dm6", num_threads);
HiCFileWriter w(path2, chromosomes, resolutions, "dm6", num_threads, 99'999, tmpdir, 1,
skip_all_vs_all_matrix);
for (std::size_t i = 0; i < resolutions.size(); ++i) {
if (i % 2 == 0) {
const auto resolution = resolutions[i];
Expand Down Expand Up @@ -161,11 +164,11 @@ TEST_CASE("HiC: HiCFileWriter", "[hic][v9][long]") {

SECTION("create file (st)") {
const std::vector<std::uint32_t> resolutions{250'000, 500'000, 2'500'000};
hic_file_writer_create_file_test(path1, path2, resolutions, 1);
hic_file_writer_create_file_test(path1, path2, resolutions, 1, false);
}
SECTION("create file (mt)") {
const std::vector<std::uint32_t> resolutions{25'000, 1'000'000, 2'500'000};
hic_file_writer_create_file_test(path1, path2, resolutions, 3);
hic_file_writer_create_file_test(path1, path2, resolutions, 3, true);
}

SECTION("add weights") {
Expand Down

0 comments on commit 7da082d

Please sign in to comment.