Skip to content

Commit

Permalink
Merge pull request #62 from paulsengroup/improve-dump
Browse files Browse the repository at this point in the history
Update hictk dump to support dumping resolutions, cells and normalizations
  • Loading branch information
robomics authored Sep 27, 2023
2 parents 9a1551d + 89b2c6d commit cac5aa5
Show file tree
Hide file tree
Showing 22 changed files with 647 additions and 53 deletions.
1 change: 1 addition & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ Checks: >
-cppcoreguidelines-pro-bounds-array-to-pointer-decay,
-cppcoreguidelines-pro-bounds-constant-array-index,
-hicpp-no-array-decay,
-misc-no-recursion,
-modernize-use-trailing-return-type,
-readability-identifier-length,
-readability-magic-numbers
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,10 @@ jobs:
run: |
test/scripts/hictk_dump_chroms.sh build/src/hictk/hictk
test/scripts/hictk_dump_bins.sh build/src/hictk/hictk
test/scripts/hictk_dump_resolutions.sh build/src/hictk/hictk
test/scripts/hictk_dump_normalizations.sh build/src/hictk/hictk
test/scripts/hictk_dump_cells.sh build/src/hictk/hictk
test/scripts/hictk_dump_gw.sh build/src/hictk/hictk
test/scripts/hictk_dump_cis.sh build/src/hictk/hictk
test/scripts/hictk_dump_trans.sh build/src/hictk/hictk
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/macos-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,18 @@ jobs:
run: |
test/scripts/hictk_dump_bins.sh bin/hictk
- name: Test hictk dump resolutions
run: |
test/scripts/hictk_dump_resolutions.sh bin/hictk
- name: Test hictk dump normalizations
run: |
test/scripts/hictk_dump_normalizations.sh bin/hictk
- name: Test hictk dump cells
run: |
test/scripts/hictk_dump_cells.sh bin/hictk
- name: Test hictk dump genome-wide
run: |
test/scripts/hictk_dump_gw.sh bin/hictk
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/ubuntu-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,18 @@ jobs:
run: |
test/scripts/hictk_dump_bins.sh bin/hictk
- name: Test hictk dump resolutions
run: |
test/scripts/hictk_dump_resolutions.sh bin/hictk
- name: Test hictk dump normalizations
run: |
test/scripts/hictk_dump_normalizations.sh bin/hictk
- name: Test hictk dump cells
run: |
test/scripts/hictk_dump_cells.sh bin/hictk
- name: Test hictk dump genome-wide
run: |
test/scripts/hictk_dump_gw.sh bin/hictk
Expand Down
50 changes: 29 additions & 21 deletions src/hictk/cli/cli_dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ void Cli::make_dump_subcommand() {
"uri",
c.uri,
"Path to a .hic, .cool or .mcool file (Cooler URI syntax supported).")
->check(IsValidHiCFile | IsValidCoolerFile)
->check(IsValidHiCFile |
IsValidCoolerFile |
IsValidMultiresCoolerFile |
IsValidSingleCellCoolerFile)
->required();

sc.add_option(
Expand All @@ -57,7 +60,8 @@ void Cli::make_dump_subcommand() {
"-t,--table",
c.table,
"Name of the table to dump.\n")
->check(CLI::IsMember({"chroms", "bins", "pixels"}))
->check(CLI::IsMember({"chroms", "bins", "pixels", "normalizations",
"resolutions", "cells"}))
->capture_default_str();

sc.add_option(
Expand Down Expand Up @@ -106,6 +110,7 @@ void Cli::make_dump_subcommand() {

// clang-format on

sc.get_option("--range2")->needs(sc.get_option("--range"));
sc.get_option("--query-file")->excludes(sc.get_option("--range"));
sc.get_option("--query-file")->excludes(sc.get_option("--range2"));

Expand All @@ -119,38 +124,40 @@ void Cli::validate_dump_subcommand() const {
std::vector<std::string> errors;
const auto& c = std::get<DumpConfig>(_config);

if (!errors.empty()) {
throw std::runtime_error(
fmt::format(FMT_STRING("the following error(s) where encountered while validating CLI "
"arguments and input file(s):\n - {}"),
fmt::join(errors, "\n - ")));
}
const auto& subcmd = *_cli.get_subcommand("dump");

const auto is_hic = hic::utils::is_hic_file(c.uri);
const auto is_cooler = cooler::utils::is_cooler(c.uri);
const auto is_mcooler = cooler::utils::is_multires_file(c.uri);
const auto is_scool = cooler::utils::is_scool_file(c.uri);

if (is_hic && c.resolution == 0 && c.table != "chroms") {
errors.emplace_back("--resolution is mandatory when file is in .hic format.");
if ((is_hic || is_mcooler) && c.resolution == 0 && (c.table == "pixels" || c.table == "bins")) {
errors.emplace_back("--resolution is mandatory when file is in .hic or .mcool format.");
}

const auto resolution_parsed = !_cli.get_subcommand("dump")->get_option("--resolution")->empty();
const auto resolution_parsed = !subcmd.get_option("--resolution")->empty();

if ((is_cooler || is_mcooler) && resolution_parsed) {
warnings.emplace_back("--resolution is ignored when file is in .cool or .mcool format.");
if ((is_cooler || is_scool) && resolution_parsed) {
warnings.emplace_back("--resolution is ignored when file is in .[s]cool format.");
}

const auto weight_type_parsed =
!_cli.get_subcommand("dump")->get_option("--weight-type")->empty();
const auto weight_type_parsed = !subcmd.get_option("--weight-type")->empty();

if (is_hic && weight_type_parsed) {
warnings.emplace_back("--weight-type is ignored when file is in .hic format.");
}

const auto matrix_type_parsed =
!_cli.get_subcommand("dump")->get_option("--matrix-type")->empty();
const auto matrix_unit_parsed =
!_cli.get_subcommand("dump")->get_option("--matrix-unit")->empty();
const auto range_parsed = !subcmd.get_option("--range")->empty();
if (range_parsed && c.table != "bins" && c.table != "pixels") {
warnings.emplace_back("--range and --range2 are ignore when --table is not bins or pixels");
}
const auto query_file_parsed = !subcmd.get_option("--query-file")->empty();
if (query_file_parsed && c.table != "bins" && c.table != "pixels") {
warnings.emplace_back("--query-file is ignored when --table is not bins or pixels");
}

const auto matrix_type_parsed = !subcmd.get_option("--matrix-type")->empty();
const auto matrix_unit_parsed = !subcmd.get_option("--matrix-unit")->empty();

if (!is_hic && (matrix_type_parsed || matrix_unit_parsed)) {
warnings.emplace_back(
Expand Down Expand Up @@ -181,9 +188,10 @@ void Cli::transform_args_dump_subcommand() {
c.verbosity = static_cast<std::uint8_t>(spdlog::level::critical) - c.verbosity;

c.format = infer_input_format(c.uri);
if (c.format == "hic" && c.resolution == 0) {
assert(c.table == "chroms");
if (c.format == "hic" && c.resolution == 0 && c.table == "chroms") {
c.resolution = hic::utils::list_resolutions(c.uri).back();
} else if (c.format == "mcool" && c.resolution == 0 && c.table == "chroms") {
c.resolution = cooler::utils::list_resolutions(c.uri).back();
}

if (_cli.get_subcommand("dump")->get_option("--range2")->empty()) {
Expand Down
4 changes: 2 additions & 2 deletions src/hictk/cli/cli_validate.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// Created by roby on 7/13/23.
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>
#include <fmt/std.h>
Expand Down
166 changes: 145 additions & 21 deletions src/hictk/dump/dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <variant>

#include "hictk/balancing/methods.hpp"
#include "hictk/cooler.hpp"
#include "hictk/file.hpp"
#include "hictk/tools/config.hpp"
#include "hictk/transformers.hpp"
Expand All @@ -18,23 +19,6 @@ static void print(const ThinPixel<double>& pixel) {
fmt::print(FMT_COMPILE("{:d}\t{:d}\t{:.16g}\n"), pixel.bin1_id, pixel.bin2_id, pixel.count);
}

static void dump_chroms(const File& f, std::string_view range) {
if (range == "all") {
for (const Chromosome& chrom : f.chromosomes()) {
if (!chrom.is_all()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size());
}
}
return;
}

const auto coords = GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range});
auto it = f.chromosomes().find(coords.chrom());
if (it != f.chromosomes().end()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), it->name(), it->size());
}
}

template <typename File>
static void dump_bins(const File& f, std::string_view range) {
if (range == "all") {
Expand Down Expand Up @@ -127,9 +111,6 @@ static void dump_pixels(hic::File& f, std::string_view range1, std::string_view
static void process_query(File& f, std::string_view table, std::string_view range1,
std::string_view range2, std::string_view normalization, bool join,
bool sorted) {
if (table == "chroms") {
return dump_chroms(f, range1);
}
if (table == "bins") {
return dump_bins(f, range1);
}
Expand All @@ -139,7 +120,128 @@ static void process_query(File& f, std::string_view table, std::string_view rang
f.get());
}

int dump_subcmd(const DumpConfig& c) {
static int dump_chroms(std::string_view uri, std::string_view format, std::uint32_t resolution) {
Reference ref{};

if (format == "mcool") {
ref = cooler::MultiResFile{std::string{uri}}.chromosomes();
} else if (format == "scool") {
ref = cooler::SingleCellFile{std::string{uri}}.chromosomes();
} else {
ref = File{std::string{uri}, resolution}.chromosomes();
}

for (const Chromosome& chrom : ref) {
if (!chrom.is_all()) {
fmt::print(FMT_COMPILE("{:s}\t{:d}\n"), chrom.name(), chrom.size());
}
}
return 0;
}

// Collect the names of the normalizations available in a single-resolution view of a file.
//
// uri:        path (or Cooler URI) of the file to inspect.
// format:     one of "hic", "cool" or "scool". "mcool" is not accepted here: callers are
//             expected to invoke this function once per resolution using format "cool".
// resolution: resolution used to open the file. Must be non-zero when format is "hic".
//
// For .scool files the normalizations of the first cell are reported (an empty set is returned
// when the file contains no cells).
// Returns the normalization names as an ordered set of strings.
static phmap::btree_set<std::string> get_normalizations(std::string_view uri,
                                                        std::string_view format,
                                                        std::uint32_t resolution) {
  assert(format != "mcool");
  assert(format != "hic" || resolution != 0);
  if (format == "scool") {
    const auto cell_ids = cooler::SingleCellFile{uri}.cells();
    if (cell_ids.empty()) {
      return {};
    }

    // Inspect the first cell only, addressing it through its Cooler URI.
    const auto scool_uri = fmt::format(FMT_STRING("{}::/cells/{}"), uri, *cell_ids.begin());
    return get_normalizations(scool_uri, "cool", 0);
  }

  phmap::btree_set<std::string> norms{};
  // The original guard was `uri == "hic" && resolution == 0`: it compared the file path (not the
  // format) against "hic" and required a resolution that the assertion above forbids, so this
  // branch could never run. Select it on the file format instead.
  if (format == "hic") {
    const hic::File hf{std::string{uri}, resolution};

    for (const auto& norm : hf.avail_normalizations()) {
      norms.emplace(std::string{norm.to_string()});
    }
    return norms;
  }

  const auto norms_ = File{std::string{uri}, resolution}.avail_normalizations();
  std::transform(norms_.begin(), norms_.end(), std::inserter(norms, norms.begin()),
                 [](const auto& n) { return std::string{n.to_string()}; });

  return norms;
}

// Print the names of the normalizations available in the file at uri, one per line.
//
// For .mcool files, and for .hic files when resolution is 0, the normalizations found at each
// resolution listed by the file are merged into a single set. In every other case only the
// given format/resolution is inspected. Nothing is printed when no normalization (or no
// resolution) is found.
// Always returns 0.
static int dump_normalizations(std::string_view uri, std::string_view format,
                               std::uint32_t resolution) {
  const bool is_mcool = format == "mcool";
  const bool scan_all_hic_resolutions = format == "hic" && resolution == 0;

  phmap::btree_set<std::string> norms{};

  if (is_mcool || scan_all_hic_resolutions) {
    std::vector<std::uint32_t> resolutions{};
    if (is_mcool) {
      resolutions = cooler::MultiResFile{uri}.resolutions();
    } else {
      resolutions = hic::utils::list_resolutions(std::string{uri});
    }

    if (resolutions.empty()) {
      return 0;
    }

    // Each resolution of a .mcool file is queried as if it were a single-resolution cooler.
    format = format == "hic" ? "hic" : "cool";
    for (const auto res : resolutions) {
      norms.merge(get_normalizations(uri, format, res));
    }
  } else {
    norms = get_normalizations(uri, format, resolution);
  }

  if (norms.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(norms, "\n"));
  return 0;
}

// Print the resolution(s) available in the file at uri, one per line.
//
// - "mcool": every resolution reported by the multi-resolution file.
// - "scool": the bin size of the single-cell file.
// - "hic":   every resolution listed by the file when resolution is 0; otherwise only the
//            requested resolution, and only if the file lists it.
// - any other format is expected to be "cool": the bin size of the cooler file.
// Nothing is printed when the resulting list is empty. Always returns 0.
static int dump_resolutions(std::string_view uri, std::string_view format,
                            std::uint32_t resolution) {
  std::vector<std::uint32_t> available{};

  if (format == "mcool") {
    available = cooler::MultiResFile{uri}.resolutions();
  } else if (format == "scool") {
    available.push_back(cooler::SingleCellFile{uri}.bin_size());
  } else if (format == "hic") {
    available = hic::utils::list_resolutions(uri);
    if (resolution != 0) {
      // Narrow the listing down to the requested resolution, if the file has it.
      const bool has_requested =
          std::any_of(available.begin(), available.end(),
                      [&](const auto res) { return res == resolution; });
      available.clear();
      if (has_requested) {
        available.push_back(resolution);
      }
    }
  } else {
    assert(format == "cool");
    available.push_back(cooler::File{uri}.bin_size());
  }

  if (available.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(available, "\n"));
  return 0;
}

// Print the cells stored in the .scool file at uri, one per line.
//
// Throws std::runtime_error when format is not "scool". Nothing is printed when the file
// reports no cells. Returns 0 otherwise.
static int dump_cells(std::string_view uri, std::string_view format) {
  const bool is_scool = format == "scool";
  if (!is_scool) {
    throw std::runtime_error(fmt::format(FMT_STRING("\"{}\" is not a .scool file"), uri));
  }

  const auto cell_names = cooler::SingleCellFile{uri}.cells();
  if (cell_names.empty()) {
    return 0;
  }

  fmt::print(FMT_STRING("{}\n"), fmt::join(cell_names, "\n"));
  return 0;
}

static int dump_tables(const DumpConfig& c) {
hictk::File f{c.uri, c.resolution, c.matrix_type, c.matrix_unit};

if (c.query_file.empty()) {
Expand All @@ -164,4 +266,26 @@ int dump_subcmd(const DumpConfig& c) {

return 0;
}

// Entry point of the dump subcommand: dispatch on the requested table and forward the
// relevant configuration fields to the matching dump_* helper.
//
// "bins" and "pixels" are served by dump_tables(); "chroms", "resolutions" and
// "normalizations" by their dedicated helpers. Any other value is expected to be "cells".
// Returns the value returned by the selected helper.
int dump_subcmd(const DumpConfig& c) {
  const auto& table = c.table;

  const bool wants_matrix_table = table == "pixels" || table == "bins";
  if (wants_matrix_table) {
    return dump_tables(c);
  }

  if (table == "normalizations") {
    return dump_normalizations(c.uri, c.format, c.resolution);
  }

  if (table == "resolutions") {
    return dump_resolutions(c.uri, c.format, c.resolution);
  }

  if (table == "chroms") {
    return dump_chroms(c.uri, c.format, c.resolution);
  }

  // The CLI restricts --table to a fixed set of names, so only "cells" should be left here.
  assert(table == "cells");
  return dump_cells(c.uri, c.format);
}
} // namespace hictk::tools
Loading

0 comments on commit cac5aa5

Please sign in to comment.