Skip to content

Commit

Permalink
Implement all dynamic-dispatch block codecs
Browse files Browse the repository at this point in the history
  • Loading branch information
elshize committed Mar 31, 2024
1 parent b75fcbe commit 599dc04
Show file tree
Hide file tree
Showing 20 changed files with 574 additions and 29 deletions.
17 changes: 17 additions & 0 deletions include/pisa/codec/block_codec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,33 @@

namespace pisa {

/**
 * Block codecs encode and decode an entire list. This is in contrast to a streaming codec,
 * which can encode and decode values one by one.
 *
 * This is an abstract interface: concrete codecs derive from it and are used through
 * dynamic dispatch.
 */
class BlockCodec {
  public:
    /**
     * Virtual destructor, so that a concrete codec can be safely destroyed through a pointer
     * (or smart pointer) to `BlockCodec`.
     */
    virtual ~BlockCodec() = default;

    /**
     * Encodes a list of `n` unsigned integers and appends them to the output buffer.
     *
     * @param in             Pointer to the first of the `n` values to encode.
     * @param sum_of_values  Sum of the encoded values; how (and whether) it is used is up to the
     *                       implementation.
     * @param n              Number of values to encode; at most `block_size()`.
     * @param out            Buffer the encoded bytes are appended to.
     */
    virtual void encode(
        std::uint32_t const* in, std::uint32_t sum_of_values, std::size_t n, std::vector<uint8_t>& out
    ) const = 0;

    /**
     * Decodes a list of `n` unsigned integers from a binary buffer and writes them to pre-allocated
     * memory.
     *
     * @param in             Pointer to the beginning of the encoded block.
     * @param out            Pre-allocated memory the `n` decoded values are written to.
     * @param sum_of_values  Sum of the encoded values; must be consistent with what was passed
     *                       to `encode`.
     * @param n              Number of values to decode; at most `block_size()`.
     * @return               Pointer to the first byte past the decoded block.
     */
    virtual std::uint8_t const* decode(
        std::uint8_t const* in, std::uint32_t* out, std::uint32_t sum_of_values, std::size_t n
    ) const = 0;

    /**
     * Returns the block size of the encoding.
     *
     * Block codecs write blocks of fixed size, e.g., 128 integers. Thus, it is only possible to
     * encode at most `block_size()` elements.
     */
    [[nodiscard]] virtual auto block_size() const noexcept -> std::size_t = 0;
};

Expand Down
18 changes: 18 additions & 0 deletions include/pisa/codec/interpolative.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <vector>

#include "codec/block_codec.hpp"

namespace pisa {

class InterpolativeBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
13 changes: 12 additions & 1 deletion include/pisa/codec/maskedvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

#include "MaskedVByte/include/varintdecode.h"
#include "MaskedVByte/include/varintencode.h"
#include "codec/block_codec.hpp"
#include "codec/block_codecs.hpp"
#include "util/util.hpp"

namespace pisa {
struct maskedvbyte_block {
Expand All @@ -31,4 +31,15 @@ struct maskedvbyte_block {
return in + read;
}
};

class MaskedVByteBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::uint64_t m_overflow = 512;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
54 changes: 54 additions & 0 deletions include/pisa/codec/optpfor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#pragma once

#include <vector>

#include "FastPFor/headers/optpfor.h"

#include "codec/block_codec.hpp"

namespace pisa {

class OptPForBlockCodec: public BlockCodec {
struct Codec: FastPForLib::OPTPFor<4, FastPForLib::Simple16<false>> {
uint8_t const* force_b{nullptr};

uint32_t findBestB(const uint32_t* in, uint32_t len) {
// trick to force the choice of b from a parameter
if (force_b != nullptr) {
return *force_b;
}

// this is mostly a cut&paste from FastPFor, but we stop the
// optimization early as the b to test becomes larger than maxb
uint32_t b = 0;
uint32_t bsize = std::numeric_limits<uint32_t>::max();
const uint32_t mb = FastPForLib::maxbits(in, in + len);
uint32_t i = 0;
while (mb > 28 + possLogs[i]) {
++i; // some schemes such as Simple16 don't code numbers greater than 28
}

for (; i < possLogs.size(); i++) {
if (possLogs[i] > mb && possLogs[i] >= mb) {
break;
}
const uint32_t csize = tryB(possLogs[i], in, len);

if (csize <= bsize) {
b = possLogs[i];
bsize = csize;
}
}
return b;
}
};

static const uint64_t m_block_size = Codec::BlockSize;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
12 changes: 12 additions & 0 deletions include/pisa/codec/qmx.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "QMX/qmx.hpp"
#include "codec/block_codec.hpp"
#include "codec/block_codecs.hpp"

namespace pisa {
Expand Down Expand Up @@ -47,4 +48,15 @@ struct qmx_block {
return in + enc_len;
}
};

class QmxBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::uint64_t m_overflow = 512;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
27 changes: 3 additions & 24 deletions include/pisa/codec/simple16.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,30 +39,9 @@ struct simple16_block {
/**
 * Dynamic-dispatch block codec backed by FastPFor's Simple16.
 *
 * NOTE(review): as displayed, `encode` and `decode` are both defined inline (before `public:`)
 * and declared again after `public:`. This looks like a diff view that merges removed and
 * added lines -- confirm which form is current before touching this class.
 */
class Simple16BlockCodec: public BlockCodec {
// Maximum number of values in a single block.
static constexpr std::uint64_t m_block_size = 128;

// Encodes `n` values with Simple16 into a thread-local scratch buffer and appends the result
// to `out`. The `sum_of_values` argument is unused.
void
encode(uint32_t const* in, uint32_t /* sum_of_values */, size_t n, std::vector<uint8_t>& out) const {
assert(n <= m_block_size);
thread_local FastPForLib::Simple16<false> codec;
thread_local std::array<std::uint8_t, 2 * 8 * m_block_size> buf{};
// Initialized to the scratch buffer size; presumably updated by `encodeArray` to the
// produced length -- TODO confirm against the FastPFor API.
size_t out_len = buf.size();
codec.encodeArray(in, n, reinterpret_cast<uint32_t*>(buf.data()), out_len);
// Scaled by 4 before being used as a byte count; presumably `encodeArray` reports the
// length in 32-bit words.
out_len *= 4;
out.insert(out.end(), buf.data(), buf.data() + out_len);
}

// Decodes into a local buffer of `2 * m_block_size` values, copies the first `n` of them to
// `out`, and returns the pointer produced by `decodeArray`, cast to a byte pointer.
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const {
assert(n <= m_block_size);
FastPForLib::Simple16<false> codec;
std::array<std::uint32_t, 2 * m_block_size> buf{};

auto const* ret = reinterpret_cast<uint8_t const*>(
codec.decodeArray(reinterpret_cast<uint32_t const*>(in), 8 * n, buf.data(), n)
);

std::copy(buf.begin(), std::next(buf.begin(), n), out);
return ret;
}

public:
// Out-of-line declarations of the same two member functions (see the note at the top).
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
// Returns the fixed block size (128).
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

Expand Down
12 changes: 12 additions & 0 deletions include/pisa/codec/simple8b.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include <array>

#include "codec/block_codec.hpp"

namespace pisa {

struct simple8b_block {
Expand All @@ -28,4 +30,14 @@ struct simple8b_block {
);
}
};

class Simple8bBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
13 changes: 13 additions & 0 deletions include/pisa/codec/streamvbyte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdint>
#include <vector>

#include "codec/block_codec.hpp"
#include "streamvbyte/include/streamvbyte.h"

namespace pisa {
Expand Down Expand Up @@ -35,4 +36,16 @@ struct streamvbyte_block {
return in + read;
}
};

class StreamVByteBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;
static constexpr std::size_t m_max_compressed_bytes =
pisa::streamvbyte_max_compressedbytes(m_block_size);

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
18 changes: 18 additions & 0 deletions include/pisa/codec/varint_g8iu.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#pragma once

#include <vector>

#include "codec/block_codec.hpp"

namespace pisa {

class VarintG8IUBlockCodec: public BlockCodec {
static const uint64_t m_block_size = 128;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
13 changes: 11 additions & 2 deletions include/pisa/codec/varintgb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
#include <cstring>
#include <vector>

#include "FastPFor/headers/common.h"

#include "codec/block_codec.hpp"
#include "codec/block_codecs.hpp"
#include "memory.hpp"

Expand Down Expand Up @@ -257,4 +256,14 @@ struct varintgb_block {
return read + in;
}
};

class VarintGbBlockCodec: public BlockCodec {
static constexpr std::uint64_t m_block_size = 128;

public:
void encode(uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out) const;
uint8_t const* decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const;
auto block_size() const noexcept -> std::size_t { return m_block_size; }
};

} // namespace pisa
54 changes: 54 additions & 0 deletions src/codec/interpolative.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#include <cassert>
#include <limits>

#include "codec/block_codecs.hpp"
#include "codec/interpolative.hpp"

namespace pisa {

void InterpolativeBlockCodec::encode(
uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out
) const {
assert(n <= m_block_size);
thread_local std::array<std::uint32_t, m_block_size> inbuf{};
thread_local std::vector<uint32_t> outbuf; // TODO: Can we use array? How long does it need
// to be?
inbuf[0] = *in;
for (size_t i = 1; i < n; ++i) {
inbuf[i] = inbuf[i - 1] + in[i];
}

if (sum_of_values == uint32_t(-1)) {
sum_of_values = inbuf[n - 1];
TightVariableByte::encode_single(sum_of_values, out);
}

bit_writer bw(outbuf);
bw.write_interpolative(inbuf.data(), n - 1, 0, sum_of_values);
auto const* bufptr = reinterpret_cast<uint8_t const*>(outbuf.data());
out.insert(out.end(), bufptr, bufptr + ceil_div(bw.size(), 8));
}

uint8_t const* InterpolativeBlockCodec::decode(
uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n
) const {
assert(n <= m_block_size);
if (sum_of_values == std::numeric_limits<std::uint32_t>::max()) {
in = TightVariableByte::decode(in, &sum_of_values, 1);
}

out[n - 1] = sum_of_values;
size_t read_interpolative = 0;
if (n > 1) {
bit_reader br(in);
br.read_interpolative(out, n - 1, 0, sum_of_values);
for (size_t i = n - 1; i > 0; --i) {
out[i] -= out[i - 1];
}
read_interpolative = ceil_div(br.position(), 8);
}

return in + read_interpolative;
}

} // namespace pisa
29 changes: 29 additions & 0 deletions src/codec/maskedvbyte.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "codec/maskedvbyte.hpp"

namespace pisa {

void MaskedVByteBlockCodec::encode(
uint32_t const* in, uint32_t sum_of_values, size_t n, std::vector<uint8_t>& out
) const {
assert(n <= m_block_size);
auto* src = const_cast<uint32_t*>(in);
if (n < m_block_size) {
interpolative_block::encode(src, sum_of_values, n, out);
return;
}
thread_local std::array<std::uint8_t, 2 * m_block_size * sizeof(std::uint32_t)> buf{};
size_t out_len = vbyte_encode(src, n, buf.data());
out.insert(out.end(), buf.data(), buf.data() + out_len);
}

/**
 * Decodes `n` values from `in` into `out` and returns a pointer past the bytes read.
 *
 * Mirrors `encode`: blocks shorter than `m_block_size` are delegated to
 * `interpolative_block`; other blocks are decoded with `masked_vbyte_decode`.
 */
uint8_t const*
MaskedVByteBlockCodec::decode(uint8_t const* in, uint32_t* out, uint32_t sum_of_values, size_t n) const {
    assert(n <= m_block_size);

    // Short blocks are marked as the unlikely path.
    if PISA_UNLIKELY (n < m_block_size) {
        return interpolative_block::decode(in, out, sum_of_values, n);
    }

    // The decoder's return value is used as the number of input bytes to skip.
    return in + masked_vbyte_decode(in, out, n);
}

} // namespace pisa
Loading

0 comments on commit 599dc04

Please sign in to comment.