Skip to content

Commit

Permalink
Initial support for adding normalization vectors to .hic files
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Jan 18, 2024
1 parent 1c06c12 commit 5ce289a
Show file tree
Hide file tree
Showing 11 changed files with 678 additions and 102 deletions.
7 changes: 7 additions & 0 deletions src/libhictk/hic/include/hictk/hic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ class File {
std::uint64_t first_bin2, std::uint64_t last_bin2,
balancing::Method norm = balancing::Method::NONE()) const;

[[nodiscard]] balancing::Weights normalization(balancing::Method norm,
const Chromosome &chrom) const;
[[nodiscard]] balancing::Weights normalization(std::string_view norm,
const Chromosome &chrom) const;
[[nodiscard]] balancing::Weights normalization(balancing::Method norm) const;
[[nodiscard]] balancing::Weights normalization(std::string_view norm) const;

[[nodiscard]] std::size_t num_cached_footers() const noexcept;
void purge_footer_cache();

Expand Down
6 changes: 6 additions & 0 deletions src/libhictk/hic/include/hictk/hic/binary_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ class BinaryBuffer {
// NOLINTNEXTLINE
template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
T read();
template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void read(T& buff);
template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void read(std::vector<T>& data);
void read(std::string& buff, std::size_t n);
void read(char* buff, std::size_t n);
std::string getline(char delim = '\n');
// NOLINTNEXTLINE
template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type* = nullptr>
void write(T data);
Expand Down
37 changes: 29 additions & 8 deletions src/libhictk/hic/include/hictk/hic/file_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <queue>
#include <string>

#include "hictk/balancing/weights.hpp"
#include "hictk/bin_table.hpp"
#include "hictk/default_delete.hpp"
#include "hictk/hash.hpp"
Expand Down Expand Up @@ -112,7 +113,7 @@ class HiCFileWriter {
BlockMappers _block_mappers{};

using StatsTank = phmap::flat_hash_map<std::uint32_t, Stats>;
using FooterTank = phmap::btree_map<std::pair<Chromosome, Chromosome>, FooterV5>;
using FooterTank = phmap::btree_map<std::pair<Chromosome, Chromosome>, FooterMasterIndex>;

MatrixBodyMetadataTank _matrix_metadata{};
FooterTank _footers{};
Expand All @@ -123,6 +124,9 @@ class HiCFileWriter {
std::unique_ptr<libdeflate_compressor> _compressor{};
std::string _compression_buffer{};

phmap::btree_set<NormalizedExpectedValuesBlock> _normalized_expected_values{};
phmap::btree_map<NormalizationVectorIndexBlock, std::vector<float>> _normalization_vectors{};

HiCSectionOffsets _header_section{};
HiCSectionOffsets _data_block_section{};
HiCSectionOffsets _body_metadata_section{};
Expand All @@ -137,12 +141,12 @@ class HiCFileWriter {

public:
HiCFileWriter() = default;
explicit HiCFileWriter(
std::string_view path_, Reference chromosomes_, std::vector<std::uint32_t> resolutions_,
std::string_view assembly_ = "unknown", std::size_t n_threads = 1,
std::size_t chunk_size = 10'000'000,
const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(),
std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000);
explicit HiCFileWriter(std::string_view path_);
HiCFileWriter(std::string_view path_, Reference chromosomes_,
std::vector<std::uint32_t> resolutions_, std::string_view assembly_ = "unknown",
std::size_t n_threads = 1, std::size_t chunk_size = 10'000'000,
const std::filesystem::path& tmpdir = std::filesystem::temp_directory_path(),
std::uint32_t compression_lvl = 12, std::size_t buffer_size = 32'000'000);

[[nodiscard]] std::string_view url() const noexcept;
[[nodiscard]] const Reference& chromosomes() const noexcept;
Expand Down Expand Up @@ -176,13 +180,26 @@ class HiCFileWriter {
// Write expected/normalization values
void compute_and_write_expected_values();

// Write normalization vectors
void add_norm_vector(const NormalizationVectorIndexBlock& blk, const std::vector<float>& weights);
void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit,
std::uint32_t bin_size, const std::vector<float>& weights,
std::size_t position = std::numeric_limits<std::size_t>::max(),
std::size_t n_bytes = std::numeric_limits<std::size_t>::max());
void add_norm_vector(const NormalizationVectorIndexBlock& blk, const balancing::Weights& weights);
void add_norm_vector(std::string_view type, const Chromosome& chrom, std::string_view unit,
std::uint32_t bin_size, const balancing::Weights& weights,
std::size_t position = std::numeric_limits<std::size_t>::max(),
std::size_t n_bytes = std::numeric_limits<std::size_t>::max());
void write_norm_vectors();

void write_empty_expected_values();
void write_empty_normalized_expected_values();
void write_empty_norm_vectors();

void finalize();

private:
[[nodiscard]] static HiCHeader read_header(std::string_view path);
[[nodiscard]] static HiCHeader init_header(std::string_view path, Reference chromosomes,
std::vector<std::uint32_t> resolutions,
std::string_view assembly);
Expand Down Expand Up @@ -211,6 +228,10 @@ class HiCFileWriter {
[[nodiscard]] std::size_t compute_num_bins(const Chromosome& chrom1, const Chromosome& chrom2,
std::uint32_t resolution);

void read_normalized_expected_values();
void read_norm_vectors();
[[nodiscard]] std::vector<float> read_norm_vector(const NormalizationVectorIndexBlock& blk);

// Methods to be called from worker threads
auto merge_and_compress_blocks_thr(
HiCInteractionToBlockMapper& mapper, std::mutex& mapper_mtx,
Expand Down
94 changes: 72 additions & 22 deletions src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <vector>

#include "hictk/hic/binary_buffer.hpp"
#include "hictk/hic/filestream.hpp"
#include "hictk/pixel.hpp"

namespace hictk::hic::internal {
Expand Down Expand Up @@ -124,7 +125,7 @@ struct MatrixInteractionBlock {
};

// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#master-index
struct MasterIndex {
struct FooterMasterIndex {
std::string key;
std::int64_t position;
std::int32_t size;
Expand All @@ -134,54 +135,103 @@ struct MasterIndex {
// One expected-values entry: a unit / bin-size pair together with a vector of values and
// two parallel-looking vectors (chrIndex, chrScaleFactor).
// Field names presumably mirror the "expected value vectors" section of the .hic v9 format
// specification — TODO confirm against the format document.
// NOTE(review): `nValues` and `nChrScaleFactors` each appear below both as a data member and
// as a const accessor. A class cannot declare both under the same name; this looks like the
// removed and added sides of the diff shown together — confirm which form is current.
struct ExpectedValuesBlock {
std::string unit{};
std::int32_t binSize{};
std::int64_t nValues{};
[[nodiscard]] std::int64_t nValues() const noexcept;
std::vector<float> value{};
std::int32_t nChrScaleFactors{};
[[nodiscard]] std::int32_t nChrScaleFactors() const noexcept;
std::vector<std::int32_t> chrIndex{};
std::vector<float> chrScaleFactor{};

ExpectedValuesBlock() = default;
// Builds a block from double-precision weights/scale factors and unsigned chromosome ids;
// the members above are stored as float / std::int32_t, so a narrowing conversion
// presumably happens in the definition (not visible here).
ExpectedValuesBlock(std::string_view unit_, std::uint32_t bin_size,
const std::vector<double>& weights,
const std::vector<std::uint32_t>& chrom_ids,
const std::vector<double>& scale_factors);

// Ordering comparison; the fields used as the sort key are not visible in this declaration.
[[nodiscard]] bool operator<(const ExpectedValuesBlock& other) const noexcept;

// Serializes this block using `buffer` and returns the result as a std::string.
// `clear` defaults to true — presumably resets `buffer` before writing; verify in the definition.
[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
// Static factory that reads a block from the given file stream.
[[nodiscard]] static ExpectedValuesBlock deserialize(filestream::FileStream& fs);
};

// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#expected-value-vectors
struct ExpectedValues {
std::int32_t nExpectedValueVectors = 0;
std::vector<ExpectedValuesBlock> expectedValues;
class ExpectedValues {
std::vector<ExpectedValuesBlock> _expected_values;

public:
[[nodiscard]] std::int32_t nExpectedValueVectors() const noexcept;
[[nodiscard]] const std::vector<ExpectedValuesBlock>& expectedValues() const noexcept;
void emplace_back(ExpectedValuesBlock evb);
[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
[[nodiscard]] static ExpectedValues deserialize(filestream::FileStream& fs);
};

// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalized-expected-value-vectors
struct NormalizedExpectedValues {
std::int32_t nNormExpectedValueVectors = 0;
struct NormalizedExpectedValuesBlock {
std::string type{};
std::string unit{};
std::int32_t binSize{};
[[nodiscard]] std::int64_t nValues() const noexcept;
std::vector<float> value{};
[[nodiscard]] std::int32_t nChrScaleFactors() const noexcept;
std::vector<std::int32_t> chrIndex{};
std::vector<float> chrScaleFactor{};

NormalizedExpectedValuesBlock() = default;
NormalizedExpectedValuesBlock(std::string_view type_, std::string_view unit_,
std::uint32_t bin_size, const std::vector<double>& weights,
const std::vector<std::uint32_t>& chrom_ids,
const std::vector<double>& scale_factors);

[[nodiscard]] bool operator<(const NormalizedExpectedValuesBlock& other) const noexcept;

[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
[[nodiscard]] static NormalizedExpectedValuesBlock deserialize(filestream::FileStream& fs);
};

// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-index
struct NormalizationVectorIndex {
std::int32_t nNormVectors = 0;
// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalized-expected-value-vectors
class NormalizedExpectedValues {
std::vector<NormalizedExpectedValuesBlock> _normalized_expected_values;

public:
[[nodiscard]] std::int32_t nNormExpectedValueVectors() const noexcept;
[[nodiscard]] const std::vector<NormalizedExpectedValuesBlock>& normExpectedValues()
const noexcept;
void emplace_back(NormalizedExpectedValuesBlock evb);
[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
[[nodiscard]] static NormalizedExpectedValues deserialize(filestream::FileStream& fs);
};

// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-arrays-1-per-normalization-vector
struct NormalizationVectorArray {
std::int64_t nValues = 0;
struct NormalizationVectorIndexBlock {
std::string type{};
std::int32_t chrIdx{};
std::string unit{};
std::int32_t binSize{};
std::int64_t position{};
std::int64_t nBytes{};

private:
public:
NormalizationVectorIndexBlock() = default;

Check warning on line 213 in src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp

View check run for this annotation

Codecov / codecov/patch

src/libhictk/hic/include/hictk/hic/file_writer_data_structures.hpp#L213

Added line #L213 was not covered by tests
NormalizationVectorIndexBlock(std::string type_, std::uint32_t chrom_idx, std::string unit_,
std::uint32_t bin_size, std::size_t position_, std::size_t n_bytes);

[[nodiscard]] bool operator<(const NormalizationVectorIndexBlock& other) const noexcept;

[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
[[nodiscard]] static NormalizationVectorIndexBlock deserialize(filestream::FileStream& fs);
};

struct FooterV5 {
MasterIndex masterIndex{};
// https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#normalization-vector-index
class NormalizationVectorIndex {
std::vector<NormalizationVectorIndexBlock> _norm_vect_idx{};

ExpectedValues expectedValues{};
NormalizedExpectedValues normExpectedValues{};
NormalizationVectorIndex normVectIndex{};
std::vector<NormalizationVectorArray> normVectArray{};
public:
[[nodiscard]] std::int32_t nNormVectors() const noexcept;
[[nodiscard]] const std::vector<NormalizationVectorIndexBlock> normalizationVectorIndex()
const noexcept;
void emplace_back(NormalizationVectorIndexBlock blk);

FooterV5() = default;
[[nodiscard]] std::string serialize(BinaryBuffer& buffer, bool clear = true) const;
[[nodiscard]] static NormalizationVectorIndex deserialize(filestream::FileStream& fs);
};

} // namespace hictk::hic::internal
Expand Down
2 changes: 1 addition & 1 deletion src/libhictk/hic/include/hictk/hic/filestream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class FileStream {

public:
FileStream() = default;
explicit FileStream(std::string path);
explicit FileStream(std::string path, std::ios::openmode mode = std::ios::in);
static FileStream create(std::string path);

[[nodiscard]] const std::string &path() const noexcept;
Expand Down
24 changes: 22 additions & 2 deletions src/libhictk/hic/include/hictk/hic/impl/binary_buffer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,34 @@ inline T BinaryBuffer::read() {
return x;
}

// Out-parameter flavour of read<T>(): decodes one value of fundamental type T via the
// value-returning overload and stores it in `buff`.
template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type *>
inline void BinaryBuffer::read(T &buff) {
  const T decoded = read<T>();
  buff = decoded;
}

template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type *>
inline void BinaryBuffer::read(std::vector<T> &buff) {
read(reinterpret_cast<char*>(buff.data()), sizeof(T) * buff.size());
}

// Resizes `buff` to n characters, then fills it through the raw char* overload.
inline void BinaryBuffer::read(std::string &buff, std::size_t n) {
  buff.resize(n);
  char *dest = buff.data();
  read(dest, n);
}

// Copies the next n bytes of the internal buffer into `buff` and advances the read cursor
// past them.
//
// buff: destination; must point to at least n writable bytes.
// n:    number of bytes to copy. A zero-length read is a no-op.
inline void BinaryBuffer::read(char *buff, std::size_t n) {
  static_assert(sizeof(char) == 1);
  if (n == 0) {
    // Nothing to copy; also avoids handing memcpy a possibly-null pointer
    // (e.g. data() of an empty vector).
    return;
  }
  // The whole requested range must lie inside the buffer, not just its first byte.
  assert(_i <= _buffer.size() && n <= _buffer.size() - _i);
  std::memcpy(static_cast<void *>(buff), _buffer.data() + _i, n * sizeof(char));
  // Consume all n bytes. Advancing by sizeof(char) would make the next read return
  // n - 1 bytes that were already handed out.
  _i += n;
}

// Returns the characters from the current read position up to (but not including) the next
// occurrence of `delim`, or up to the end of the buffer when no delimiter remains.
// The read cursor is moved past the returned text and, when present, past the delimiter,
// so consecutive calls yield consecutive lines.
inline std::string BinaryBuffer::getline(char delim) {
  const std::string_view view{_buffer};
  assert(_i <= view.size());
  // Search and slice relative to the cursor: `pos` is an offset into `remaining`,
  // not into the whole buffer.
  const auto remaining = view.substr(_i);
  const auto pos = remaining.find(delim);
  std::string line{remaining.substr(0, pos)};
  _i += line.size();
  if (pos != std::string_view::npos) {
    ++_i;  // skip the delimiter itself
  }
  return line;
}

template <typename T, typename std::enable_if<std::is_fundamental<T>::value>::type *>
inline void BinaryBuffer::write(T data) {
static_assert(sizeof(char) == 1);
Expand Down
Loading

0 comments on commit 5ce289a

Please sign in to comment.