Skip to content

Commit

Permalink
Improve resolving block codecs
Browse files Browse the repository at this point in the history
  • Loading branch information
elshize committed Apr 5, 2024
1 parent 599dc04 commit 173c74f
Show file tree
Hide file tree
Showing 15 changed files with 155 additions and 51 deletions.
24 changes: 22 additions & 2 deletions include/pisa/block_inverted_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "mappable/mappable_vector.hpp"
#include "memory_source.hpp"
#include "temporary_directory.hpp"
#include "util/block_profiler.hpp"

namespace pisa {

Expand All @@ -24,7 +25,12 @@ namespace index::block {
*/
class BlockInvertedIndexCursor {
public:
BlockInvertedIndexCursor(BlockCodec const* block_codec, std::uint8_t const* data, std::uint64_t universe)
BlockInvertedIndexCursor(
BlockCodec const* block_codec,
std::uint8_t const* data,
std::uint64_t universe,
std::optional<std::uint32_t> profile_term
)
: m_base(TightVariableByte::decode(data, &m_n, 1)),
m_blocks(ceil_div(m_n, block_codec->block_size())),
m_block_maxs(m_base),
Expand All @@ -38,6 +44,10 @@ class BlockInvertedIndexCursor {
&& concepts::SortedPostingCursor<BlockInvertedIndexCursor>)
);

if (profile_term.has_value()) {
m_profiler = block_profiler::open_list(*profile_term, m_blocks);
}

m_docs_buf.resize(m_block_size);
m_freqs_buf.resize(m_block_size);
reset();
Expand Down Expand Up @@ -218,6 +228,10 @@ class BlockInvertedIndexCursor {
m_pos_in_block = 0;
m_cur_docid = m_docs_buf[0];
m_freqs_decoded = false;

if (m_profiler != nullptr) {
++m_profiler[2 * m_cur_block];
}
}

void PISA_NOINLINE decode_freqs_block() {
Expand All @@ -226,6 +240,10 @@ class BlockInvertedIndexCursor {
);
intrinsics::prefetch(next_block);
m_freqs_decoded = true;

if (m_profiler != nullptr) {
++m_profiler[2 * m_cur_block + 1];
}
}

uint32_t m_n{0};
Expand All @@ -249,6 +267,7 @@ class BlockInvertedIndexCursor {
std::vector<uint32_t> m_freqs_buf;
BlockCodec const* m_block_codec;
std::size_t m_block_size;
block_profiler::counter_type* m_profiler = nullptr;
};

class BlockInvertedIndex {
Expand All @@ -259,6 +278,7 @@ class BlockInvertedIndex {
mapper::mappable_vector<std::uint8_t> m_lists;
MemorySource m_source;
std::unique_ptr<BlockCodec> m_block_codec;
bool m_profile;

void check_term_range(std::size_t term_id) const;

Expand All @@ -268,7 +288,7 @@ class BlockInvertedIndex {
public:
using document_enumerator = BlockInvertedIndexCursor;

explicit BlockInvertedIndex(MemorySource source, std::unique_ptr<BlockCodec> block_codec);
BlockInvertedIndex(MemorySource source, std::unique_ptr<BlockCodec> block_codec, bool profile = false);

template <typename Visitor>
void map(Visitor& visit) {
Expand Down
15 changes: 15 additions & 0 deletions include/pisa/codec/block_codec_registry.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

#include <memory>
#include <string_view>

#include <gsl/span>

#include "codec/block_codec.hpp"

namespace pisa {

/**
 * Creates a new block codec selected by its registered name.
 *
 * The name is compared for exact equality against the `name` constant of each
 * registered codec class (e.g. `"block_simdbp"`).
 *
 * \param name  Name of the codec to instantiate.
 * \returns     Owning pointer to a newly constructed codec.
 * \throws std::domain_error  if no registered codec has the given name.
 */
[[nodiscard]] auto get_block_codec(std::string_view name) -> std::unique_ptr<BlockCodec>;

/**
 * Returns the names of all registered block codecs.
 *
 * The span views a static array, so it stays valid for the lifetime of the
 * program. Any of the returned names can be passed to `get_block_codec`.
 */
[[nodiscard]] auto get_block_codec_names() -> gsl::span<std::string_view const>;

} // namespace pisa
3 changes: 3 additions & 0 deletions include/pisa/codec/interpolative.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <string_view>
#include <vector>

#include "codec/block_codec.hpp"
Expand All @@ -10,6 +11,8 @@ class InterpolativeBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_interpolative";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/maskedvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class MaskedVByteBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_overflow = 512;

public:
constexpr static std::string_view name = "block_maskedvbyte";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/optpfor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class OptPForBlockCodec: public BlockCodec {
static const uint64_t m_block_size = Codec::BlockSize;

public:
constexpr static std::string_view name = "block_optpfor";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/qmx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ class QmxBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_overflow = 512;

public:
constexpr static std::string_view name = "block_qmx";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/simdbp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class SimdBpBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_simdbp";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/simple16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class Simple16BlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_simple16";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/simple8b.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class Simple8bBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_simple8b";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
3 changes: 3 additions & 0 deletions include/pisa/codec/streamvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <array>
#include <cassert>
#include <cstdint>
#include <string_view>
#include <vector>

#include "codec/block_codec.hpp"
Expand Down Expand Up @@ -43,6 +44,8 @@ class StreamVByteBlockCodec: public BlockCodec {
pisa::streamvbyte_max_compressedbytes(m_block_size);

public:
constexpr static std::string_view name = "block_streamvbyte";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
3 changes: 3 additions & 0 deletions include/pisa/codec/varint_g8iu.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <string_view>
#include <vector>

#include "codec/block_codec.hpp"
Expand All @@ -10,6 +11,8 @@ class VarintG8IUBlockCodec: public BlockCodec {
static const uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_varintg8iu";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
2 changes: 2 additions & 0 deletions include/pisa/codec/varintgb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@ class VarintGbBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
constexpr static std::string_view name = "block_varintgb";

void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
Expand Down
14 changes: 11 additions & 3 deletions src/block_inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@

namespace pisa {

BlockInvertedIndex::BlockInvertedIndex(MemorySource source, std::unique_ptr<BlockCodec> block_codec)
: m_source(std::move(source)), m_block_codec(std::move(block_codec)) {
/**
 * Constructs an index over the bytes held by `source`.
 *
 * `source` and `block_codec` are moved into the index. `profile` is stored in
 * `m_profile`; when it is true, `operator[]` passes the term ID to the cursor it
 * creates, which is what enables block profiling for that posting list.
 */
BlockInvertedIndex::BlockInvertedIndex(
MemorySource source, std::unique_ptr<BlockCodec> block_codec, bool profile
)
: m_source(std::move(source)), m_block_codec(std::move(block_codec)), m_profile(profile) {
// Verifies that the index/cursor pair satisfies the SortedInvertedIndex concept
// (presumably a compile-time check -- TODO confirm against the macro definition).
PISA_ASSERT_CONCEPT((concepts::SortedInvertedIndex<BlockInvertedIndex, BlockInvertedIndexCursor>));
// Must run after m_source is initialized: the mapper reads from m_source.data().
// NOTE(review): the effect of the `warmup` flag is defined by the mapper and is
// not visible here.
mapper::map(*this, m_source.data(), mapper::map_flags::warmup);
}
Expand All @@ -14,7 +16,13 @@ auto BlockInvertedIndex::operator[](std::size_t term_id) const -> BlockInvertedI
check_term_range(term_id);
compact_elias_fano::enumerator endpoints(m_endpoints, 0, m_lists.size(), m_size, m_params);
auto endpoint = endpoints.move(term_id).second;
return BlockInvertedIndexCursor(m_block_codec.get(), m_lists.data() + endpoint, num_docs());
std::optional<std::uint32_t> profile_term = std::nullopt;
if (m_profile) {
profile_term = term_id;
}
return BlockInvertedIndexCursor(
m_block_codec.get(), m_lists.data() + endpoint, num_docs(), profile_term
);
}

void BlockInvertedIndex::check_term_range(std::size_t term_id) const {
Expand Down
69 changes: 69 additions & 0 deletions src/codec/block_codec_registry.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "codec/block_codec_registry.hpp"

#include <algorithm>
#include <array>
#include <cstddef>
#include <iterator>
#include <memory>
#include <stdexcept>
#include <string_view>

#include <fmt/format.h>
#include <gsl/span>

#include "codec/block_codec.hpp"
#include "codec/interpolative.hpp"
#include "codec/maskedvbyte.hpp"
#include "codec/optpfor.hpp"
#include "codec/qmx.hpp"
#include "codec/simdbp.hpp"
#include "codec/simple16.hpp"
#include "codec/simple8b.hpp"
#include "codec/streamvbyte.hpp"
#include "codec/varint_g8iu.hpp"
#include "codec/varintgb.hpp"

namespace pisa {

/**
 * Compile-time registry that resolves block codecs by name.
 *
 * Each type in `C...` must expose a static `name` convertible to
 * `std::string_view`, be default-constructible, and be convertible to
 * `BlockCodec` through `std::unique_ptr`.
 *
 * `names` and `constructors` are parallel arrays: `constructors[i]` creates the
 * codec whose name is `names[i]`, in the order the types appear in `C...`.
 */
template <typename... C>
struct BlockCodecRegistry {
    /** Pointer to a function that creates a default-constructed codec. */
    using BlockCodecConstructor = std::unique_ptr<BlockCodec> (*)();

    /** Names of the registered codecs. */
    constexpr static std::array<std::string_view, sizeof...(C)> names{C::name...};

    /** Factory functions, one per registered codec, parallel to `names`. */
    constexpr static std::array<BlockCodecConstructor, sizeof...(C)> constructors{
        []() -> std::unique_ptr<BlockCodec> { return std::make_unique<C>(); }...};

    /** Number of registered codecs. */
    constexpr static auto count() -> std::size_t { return sizeof...(C); }

    /**
     * Creates the codec registered under `name`.
     *
     * \param name  Exact name of the codec; lookup is a linear search over `names`.
     * \returns     Owning pointer to a newly constructed codec.
     * \throws std::domain_error  if `name` does not match any registered codec.
     */
    [[nodiscard]] static auto get(std::string_view name) -> std::unique_ptr<BlockCodec> {
        auto const pos = std::find(names.begin(), names.end(), name);
        if (pos == names.end()) {
            throw std::domain_error(fmt::format("invalid codec: {}", name));
        }
        // `pos` is known to lie within `names`, so the distance is non-negative;
        // the cast makes the signed-to-unsigned conversion of the index explicit.
        auto const index = static_cast<std::size_t>(std::distance(names.begin(), pos));
        return constructors[index]();
    }
};

/**
 * The set of block codecs that can be resolved by name.
 *
 * To make another codec resolvable, add its type to this list; the registry
 * reads the type's static `name` member and default-constructs it on lookup.
 */
using BlockCodecs = BlockCodecRegistry<
InterpolativeBlockCodec,
MaskedVByteBlockCodec,
OptPForBlockCodec,
QmxBlockCodec,
SimdBpBlockCodec,
Simple16BlockCodec,
Simple8bBlockCodec,
StreamVByteBlockCodec,
VarintG8IUBlockCodec,
VarintGbBlockCodec>;

/** Looks up `name` in the codec registry and returns a newly created codec. */
auto get_block_codec(std::string_view name) -> std::unique_ptr<BlockCodec> {
    std::unique_ptr<BlockCodec> codec = BlockCodecs::get(name);
    return codec;
}

/** Exposes the registry's static array of codec names as a read-only span. */
auto get_block_codec_names() -> gsl::span<std::string_view const> {
    auto const& registered = BlockCodecs::names;
    return gsl::make_span<std::string_view const>(registered.data(), BlockCodecs::count());
}

} // namespace pisa
Loading

0 comments on commit 173c74f

Please sign in to comment.