diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md index 63cc358d..ef1e4931 100644 --- a/docs/src/SUMMARY.md +++ b/docs/src/SUMMARY.md @@ -46,3 +46,7 @@ - [`taily-stats`](cli/taily-stats.md) - [`taily-thresholds`](cli/taily-thresholds.md) - [`thresholds`](cli/thresholds.md) + +# Specifications + +- [Lookup Table](specs/lookup-table.md) diff --git a/docs/src/specs/lookup-table.md b/docs/src/specs/lookup-table.md new file mode 100644 index 00000000..6d7d57a7 --- /dev/null +++ b/docs/src/specs/lookup-table.md @@ -0,0 +1,112 @@ +# Lookup Table Format Specification + +A lookup table is a bidirectional mapping from an index, representing an +internal ID, to a binary payload, such as a string. E.g., an `N`-element +lookup table maps values `0...N-1` to their payloads. These tables are +used for things like mapping terms to term IDs and document IDs to +titles or URLs. + +The format of a lookup table is designed to operate without having to +parse the entire structure. Once the header is parsed, it is possible to +operate directly on the binary format to access the data. In fact, a +lookup table will typically be memory mapped. Therefore, it is possible +to perform a lookup (or reverse lookup) without loading the entire +structure into memory. + +The header always begins as follows: + +``` ++--------+--------+-------- -+ +| 0x87 | Ver. | ... | ++--------+--------+-------- -+ +``` + +The first byte is a constant identifier. When reading, we can verify +whether this byte is correct to make sure we are using the correct type +of data structure. + +The second byte is equal to the version of the format. + +The remainder of the format is defined separately for each version. The +version is introduced in order to be able to update the format in the +future but still be able to read old formats for backwards +compatibility. 
+ +## v1 + +``` ++--------+--------+--------+--------+--------+--------+--------+--------+ +| 0x87 | 0x01 | Flags | 0x00 | ++--------+--------+--------+--------+--------+--------+--------+--------+ +| Length | ++--------+--------+--------+--------+--------+--------+--------+--------+ +| | +| Offsets | +| | ++-----------------------------------------------------------------------+ +| | +| Payloads | +| | ++-----------------------------------------------------------------------+ +``` + +Immediately after the version byte, we have the flags byte. + +``` + MSB LSB ++---+---+---+---+---+---+---+---+ +| 0 | 0 | 0 | 0 | 0 | 0 | W | S | ++---+---+---+---+---+---+---+---+ +``` + +The first bit (`S`) indicates whether the payloads are sorted (1) or not +(0). The second bit (`W`) defines the width of offsets (see below): +32-bit (0) or 64-bit (1). In most use cases, the cumulative size of the +payloads will be small enough to address it by 32-bit offsets. For +example, if we store words that are 16 bytes long on average, we can +address over 200 million of them. For this many elements, reducing the +width of the offsets would save us over 700 MB. Still, we want to +support 64-bit addressing because some payloads may be much longer +(e.g., URLs). + +The rest of the bits in the flags byte are currently not used, but +should be set to 0 to make sure that if more flags are introduced, we +know what values to expect in the older iterations, and thus we can make +sure to keep it backwards-compatible. + +The following 5 bytes are padding with values of 0. This is to help with +byte alignment. When loaded to memory, it should be loaded with 8-byte +alignment. When memory mapped, it should be already correctly aligned by +the operating system (at least on Linux). + +Following the padding, there is a 64-bit unsigned integer encoding the +number of elements in the lexicon (`N`). 
+ +Given `N` and `W`, we can now calculate the byte range of all offsets, +and thus the address offset for the start of the payloads. The offsets +are `N+1` little-endian unsigned integers of size determined by `W` +(either 4 or 8 bytes). The offsets are associated with consecutive IDs +from 0 to `N-1`; the last of the `N+1` offsets points at the first byte +after the last payload. The offsets are relative to the beginning of the +first payload, therefore the first offset will always be 0. + +Payloads are arbitrary bytes, and must be interpreted by the software. +Although the typical use case is strings, this can be any binary +payload. Note that in case of strings, they will not be 0-terminated +unless they were specifically stored as such. Although this should be +clear by the fact a payload is simply a sequence of bytes, it is only +prudent to point it out. Thus, one must be extremely careful when using +C-style strings, as their use is contingent on correct values being inserted +and encoded in the first place, and assuming 0-terminated strings may +easily lead to undefined behavior. Thus, it is recommended to store +strings without terminating them, and then interpret them as string +views (such as `std::string_view`) instead of a C-style string. + +The boundaries of the k-th payload are defined by the values of k-th and +(k+1)-th offsets. Note that because of the additional offset that points +to immediately after the last payload, we can read offsets `k` and `k+1` +for any index `k < N` (recall that `N` is the number of elements). + +If the payloads are sorted (S), we can find an ID of a certain payload +with a binary search. This is crucial for any application that requires +mapping from payloads to their position in the table. 
diff --git a/include/pisa/io.hpp b/include/pisa/io.hpp index 60ba83a1..5236264f 100644 --- a/include/pisa/io.hpp +++ b/include/pisa/io.hpp @@ -36,7 +36,7 @@ template void for_each_line(std::istream& is, Function fn) { std::string line; while (std::getline(is, line)) { - fn(line); + fn(std::move(line)); } } diff --git a/include/pisa/lookup_table.hpp b/include/pisa/lookup_table.hpp new file mode 100644 index 00000000..4a6af1cc --- /dev/null +++ b/include/pisa/lookup_table.hpp @@ -0,0 +1,229 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace pisa::lt { + +namespace detail { + + class BaseLookupTable { + public: + virtual ~BaseLookupTable() = default; + [[nodiscard]] virtual auto size() const noexcept -> std::size_t = 0; + [[nodiscard]] virtual auto operator[](std::size_t idx) const + -> std::span = 0; + [[nodiscard]] virtual auto find(std::span value) const noexcept + -> std::optional = 0; + + [[nodiscard]] virtual auto clone() -> std::unique_ptr = 0; + }; + + class BaseLookupTableEncoder { + public: + virtual ~BaseLookupTableEncoder() = default; + void virtual insert(std::span payload) = 0; + void virtual encode(std::ostream& out) = 0; + }; + +} // namespace detail + +namespace v1 { + + class Flags { + private: + std::uint8_t flags = 0; + + public: + constexpr Flags() = default; + explicit constexpr Flags(std::uint8_t bitset) : flags(bitset) {} + + [[nodiscard]] auto sorted() const noexcept -> bool; + [[nodiscard]] auto wide_offsets() const noexcept -> bool; + [[nodiscard]] auto bits() const noexcept -> std::uint8_t; + }; + + namespace flags { + inline constexpr std::uint8_t SORTED = 0b001; + inline constexpr std::uint8_t WIDE_OFFSETS = 0b010; + } // namespace flags + +}; // namespace v1 + +} // namespace pisa::lt + +namespace pisa { + +/** + * Lookup table mapping integers from a range [0, N) to binary payloads. + * + * This table assigns each _unique_ value (duplicates are not allowed) to an index in [0, N), where + * N is the size of the table. Thus, this structure is equivalent to a sequence of binary values. + * The difference between `LookupTable` and, say, `std::vector` is that its encoding supports + * reading the values without fully parsing the entire binary representation of the table. As such, + * it supports quickly initializing the structure from an external device (with random access), + * e.g., via mmap, and performing a lookup without loading the entire structure to main memory. 
+ * This is especially useful for short-lived programs that must perform a lookup without the + * unnecessary overhead of loading it to memory. + * + * If the values are sorted, and the appropriate flag is toggled in the header, a quick binary + * search lookup can be performed to find an index of a value. If the values are not sorted, then a + * linear scan will be used; therefore, one should consider having values sorted if such lookups are + * important. Getting the value at a given index is a constant-time operation, though if using + * memory mapping, each such operation may need to load multiple pages to memory. + */ +class LookupTable { + private: + std::unique_ptr<::pisa::lt::detail::BaseLookupTable> m_impl; + + explicit LookupTable(std::unique_ptr<::pisa::lt::detail::BaseLookupTable> impl); + + [[nodiscard]] static auto v1(std::span bytes) -> LookupTable; + + public: + LookupTable(LookupTable const&); + LookupTable(LookupTable&&); + LookupTable& operator=(LookupTable const&); + LookupTable& operator=(LookupTable&&); + ~LookupTable(); + + /** + * The number of elements in the table. + */ + [[nodiscard]] auto size() const noexcept -> std::size_t; + + /** + * Retrieves the value at index `idx`. + * + * If `idx >= size()`, then `std::out_of_range` exception is thrown. See `at()` if you want to + * conveniently cast the memory span to another type. + */ + [[nodiscard]] auto operator[](std::size_t idx) const -> std::span; + + /** + * Returns the position of `value` in the table or `std::nullopt` if the value does not exist. + * + * See the templated version of this function if you want to automatically cast from another + * type to byte span. + */ + [[nodiscard]] auto find(std::span value) const noexcept + -> std::optional; + + /** + * Returns the value at index `idx` cast to type `T`. 
+ * + * The type `T` must define `T::value_type` that resolves to a byte-wide type, as well as a + * constructor that takes `T::value_type const*` (pointer to the first byte) and `std::size_t` + * (the total number of bytes). If `T::value_type` is longer than 1 byte, this operation results + * in **undefined behavior**. + * + * Examples of types that can be used are: `std::string_view` or `std::span`. + */ + template + [[nodiscard]] auto at(std::size_t idx) const -> T { + auto bytes = this->operator[](idx); + return T(reinterpret_cast(bytes.data()), bytes.size()); + } + + /** + * Returns the position of `value` in the table or `std::nullopt` if the value does not exist. + * + * The type `T` of the value must be such that `std::span` is + * constructible from `T`. + */ + template + requires(std::constructible_from, T>) + [[nodiscard]] auto find(T value) const noexcept -> std::optional { + return find(std::as_bytes(std::span(value))); + } + + /** + * Constructs a lookup table from the encoded sequence of bytes. + */ + [[nodiscard]] static auto from_bytes(std::span bytes) -> LookupTable; +}; + +/** + * Lookup table encoder. + * + * This class builds and encodes a sequence of values to the binary format of lookup table. + * See `LookupTable` for more details. + * + * Note that all encoded data is accumulated in memory and only flushed to the output stream when + * `encode()` member function is called. + */ +class LookupTableEncoder { + std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> m_impl; + + explicit LookupTableEncoder(std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> impl); + + public: + /** + * Constructs an encoder for a lookup table in v1 format, with the given flag options. + * + * If sorted flag is _not_ set, then an additional hash set will be produced to keep track of + * duplicates. This will increase the memory footprint at build time. + */ + static LookupTableEncoder v1(::pisa::lt::v1::Flags flags); + + /** + * Inserts payload. 
+ * + * If sorted flag was set at construction time, it will throw if the given payload is not + * lexicographically greater than the previously inserted payload. If sorted flag was _not_ set + * and the given payload has already been inserted, it will throw as well. + */ + auto insert(std::span payload) -> LookupTableEncoder&; + + /** + * Writes the encoded table to the output stream. + */ + auto encode(std::ostream& out) -> LookupTableEncoder&; + + /** + * Inserts a payload of type `Payload`. + * + * `std::span` must be constructible from `Payload`, which + * in turn will be cast as byte span before calling the non-templated version of `insert()`. + */ + template + requires(std::constructible_from, Payload>) + auto insert(Payload const& payload) -> LookupTableEncoder& { + insert(std::as_bytes(std::span(payload))); + return *this; + } + + /** + * Inserts all payloads in the given span. + * + * It calls `insert()` for each element in the span. See `insert()` for more details. + */ + template + auto insert_span(std::span payloads) -> LookupTableEncoder& { + for (auto const& payload: payloads) { + insert(payload); + } + return *this; + } +}; + +} // namespace pisa diff --git a/include/pisa/span.hpp b/include/pisa/span.hpp index 4eb0b103..3ce1bf4e 100644 --- a/include/pisa/span.hpp +++ b/include/pisa/span.hpp @@ -34,12 +34,47 @@ template return span[pos]; } -} // namespace pisa +template +[[nodiscard]] constexpr auto subspan_or_throw( + std::span const& span, + typename std::span::size_type offset, + typename std::span::size_type count, + std::string const& error_msg +) -> std::span { + if (offset > span.size() || (count != std::dynamic_extent && count > span.size() - offset)) { // overflow-safe bounds check; dynamic_extent means "rest of the span" + throw std::out_of_range(error_msg); + } + return span.subspan(offset, count); +} -namespace std { +template +[[nodiscard]] constexpr auto subspan_or_throw( + std::span const& span, + typename std::span::size_type offset, + typename 
std::span::size_type count +) -> std::span { + return subspan_or_throw(span, offset, count, "out of range subspan"); 
+} template +[[nodiscard]] auto lex_lt(std::span const& lhs, std::span const& rhs) -> bool { + auto lit = lhs.begin(); + auto rit = rhs.begin(); + while (lit != lhs.end() && rit != rhs.end()) { + if (*lit < *rit) { + return true; + } + if (*lit > *rit) { + return false; + } + ++lit; + ++rit; + } + return lit == lhs.end() && rit != rhs.end(); // equal common prefix: lhs < rhs only if lhs is a proper prefix of rhs +} + +template +[[nodiscard]] auto lex_eq(std::span const& lhs, std::span const& rhs) -> bool { if (lhs.size() != rhs.size()) { return false; } @@ -53,4 +88,13 @@ template return true; } +} // namespace pisa + +namespace std { + +template +[[nodiscard]] auto operator==(std::span const& lhs, std::span const& rhs) -> bool { + return ::pisa::lex_eq(lhs, rhs); +} + } // namespace std diff --git a/include/pisa/stream.hpp b/include/pisa/stream.hpp new file mode 100644 index 00000000..6768db49 --- /dev/null +++ b/include/pisa/stream.hpp @@ -0,0 +1,56 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +namespace pisa { + +class FileOpenError: public std::exception { + public: + explicit FileOpenError(std::string const& file); + [[nodiscard]] auto what() const noexcept -> char const*; + + private: + std::string m_message; +}; + +class WriteError: public std::exception { + public: + [[nodiscard]] auto what() const noexcept -> char const*; +}; + +auto open_file_w(std::string const& filename) -> std::ofstream; + +template +auto put(std::basic_ostream& stream, CharT ch) -> std::ostream& { + if (!stream.put(ch)) { + throw WriteError(); + } + return stream; +} + +template +auto write(std::basic_ostream& stream, CharT const* data, std::streamsize count) + -> std::basic_ostream& { + if (!stream.write(data, count)) { + throw WriteError(); + } + return stream; +} + +} // namespace pisa diff --git a/src/lookup_table.cpp b/src/lookup_table.cpp new file mode 100644 index 00000000..fd66425b --- /dev/null +++ b/src/lookup_table.cpp @@ -0,0 +1,308 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fmt/core.h" +#include "pisa/lookup_table.hpp" +#include "pisa/span.hpp" +#include "pisa/stream.hpp" + +namespace pisa::lt { + +constexpr std::byte VERIFICATION_BYTE = std::byte(0x87); +constexpr std::size_t PADDING_LENGTH = 5; +constexpr std::array PADDING = { + std::byte{0}, std::byte{0}, std::byte{0}, std::byte{0}, std::byte{0} +}; + +auto v1::Flags::sorted() const noexcept -> bool { + return (this->flags & 0b1) > 0; +} + +auto v1::Flags::wide_offsets() const noexcept -> bool { + return (this->flags & 0b10) > 0; +} + +auto v1::Flags::bits() const noexcept -> std::uint8_t { + return this->flags; +} + +} // namespace pisa::lt + +namespace pisa { + +LookupTable::LookupTable(std::unique_ptr<::pisa::lt::detail::BaseLookupTable> impl) + : m_impl(std::move(impl)) {} + +LookupTable::LookupTable(LookupTable const& other) : m_impl(other.m_impl->clone()) {} + +LookupTable::LookupTable(LookupTable&&) = default; + +LookupTable& LookupTable::operator=(LookupTable const& other) { + m_impl = other.m_impl->clone(); + return *this; +} + +LookupTable& LookupTable::operator=(LookupTable&&) = default; + +LookupTable::~LookupTable() = default; + +template + requires(std::unsigned_integral) +[[nodiscard]] auto +read(std::span bytes, std::size_t offset, std::string const& error_msg) -> T { + auto sub = pisa::subspan_or_throw(bytes, offset, sizeof(T), error_msg); + T value; + std::memcpy(&value, bytes.data() + offset, sizeof(T)); + return value; +} + +template + requires(std::unsigned_integral) +[[nodiscard]] auto read(std::span bytes, std::size_t offset) -> T { + return read(bytes, offset, "not enough bytes"); +} + +void validate_padding(std::span bytes) { + auto padding = read(bytes, 0, "not enough bytes for header"); + padding &= 0xFFFFFFFFFF000000; + if (padding != 0) { + throw std::domain_error(fmt::format( + "bytes 3-7 must be all 0 but are {:#2x} {:#2x} {:#2x} 
{:#2x} {:#2x}", + bytes[3], + bytes[4], + bytes[5], + bytes[6], + bytes[7] + )); + } +} + +template +class LookupTableV1: public ::pisa::lt::detail::BaseLookupTable { + std::span m_offsets; + std::span m_payloads; + std::size_t m_size; + bool m_sorted; + + [[nodiscard]] auto read_offset(std::size_t idx) const -> Offset { + return read(m_offsets, idx * sizeof(Offset)); + } + + [[nodiscard]] auto read_payload(std::size_t idx) const -> std::span { + auto offset = read_offset(idx); + auto count = read_offset(idx + 1) - offset; + return pisa::subspan_or_throw(m_payloads, offset, count, "not enough bytes for payload"); + } + + public: + LookupTableV1(std::span offsets, std::span payloads, bool sorted) + : m_offsets(offsets), + m_payloads(payloads), + m_size(m_offsets.size() / sizeof(Offset) - 1), + m_sorted(sorted) {} + + ~LookupTableV1() = default; + + [[nodiscard]] virtual auto clone() -> std::unique_ptr override { + return std::make_unique>(m_offsets, m_payloads, m_sorted); + } + + [[nodiscard]] virtual auto size() const noexcept -> std::size_t override { return m_size; } + + [[nodiscard]] virtual auto operator[](std::size_t idx) const + -> std::span override { + if (idx >= m_size) { + throw std::out_of_range( + fmt::format("accessing element {} in a table of size {}", idx, m_size) + ); + } + auto offset = read_offset(idx); + auto count = read_offset(idx + 1) - offset; + return pisa::subspan_or_throw(m_payloads, offset, count, "not enough bytes for payload"); + } + + [[nodiscard]] virtual auto find_sorted(std::span value) const noexcept + -> std::optional { + if (size() == 0) { + return std::nullopt; + } + std::size_t low = 0; + std::size_t high = size() - 1; + while (low < high) { + auto mid = std::midpoint(low, high); + auto midval = read_payload(mid); + if (lex_lt(midval, value)) { + low = mid + 1; + } else { + high = mid; + } + } + return lex_eq(read_payload(low), value) ? 
std::optional(low) : std::nullopt; // low == high: the only remaining candidate + } + + [[nodiscard]] virtual auto find_unsorted(std::span value) const noexcept + -> std::optional { + for 
(std::size_t pos = 0; pos < size(); ++pos) { + if (read_payload(pos) == value) { + return pos; + } + } + return std::nullopt; + } + + [[nodiscard]] virtual auto find(std::span value) const noexcept + -> std::optional override { + return m_sorted ? find_sorted(value) : find_unsorted(value); + } +}; + +template +auto construct_lookup_table_v1(std::span bytes, bool sorted) + -> std::unique_ptr<::pisa::lt::detail::BaseLookupTable> { + auto length = read(bytes, 8, "not enough bytes for table length"); + std::size_t offsets_bytes_length = (length + 1) * sizeof(Offset); + auto offsets = + pisa::subspan_or_throw(bytes, 16, offsets_bytes_length, "not enough bytes for offsets"); + auto payloads = pisa::subspan_or_throw(bytes, 16 + offsets_bytes_length, std::dynamic_extent); + return std::make_unique>(offsets, payloads, sorted); +} + +auto LookupTable::v1(std::span bytes) -> LookupTable { + validate_padding(bytes); + auto flags = lt::v1::Flags(static_cast(bytes[2])); + if (flags.wide_offsets()) { + return LookupTable(construct_lookup_table_v1(bytes, flags.sorted())); + } + return LookupTable(construct_lookup_table_v1(bytes, flags.sorted())); +} + +auto LookupTable::from_bytes(std::span bytes) -> LookupTable { + auto leading_bytes = pisa::subspan_or_throw(bytes, 0, 2, "header must be at least 2 bytes"); + auto verification_byte = leading_bytes[0]; + if (verification_byte != lt::VERIFICATION_BYTE) { + throw std::domain_error(fmt::format( + "lookup table verification byte invalid: must be {:#x} but {:#x} given", + lt::VERIFICATION_BYTE, + verification_byte + )); + } + + auto version = static_cast(leading_bytes[1]); + if (version != 1) { + throw std::domain_error(fmt::format("only version 1 is valid but {} given", version)); + } + + return LookupTable::v1(bytes); +} + +auto LookupTable::size() const noexcept -> std::size_t { + return m_impl->size(); +} +auto LookupTable::operator[](std::size_t idx) const -> std::span { + return m_impl->operator[](idx); +} + +auto 
LookupTable::find(std::span value) const noexcept + -> std::optional { + return m_impl->find(value); +} + +template +class LookupTableEncoderV1: public ::pisa::lt::detail::BaseLookupTableEncoder { + ::pisa::lt::v1::Flags m_flags; + std::vector m_offsets{0}; + std::vector m_payloads{}; + std::unordered_set m_inserted{}; + + void encode_header(std::ostream& out) { + auto flag_bits = m_flags.bits(); + pisa::put(out, static_cast(lt::VERIFICATION_BYTE)); + pisa::put(out, static_cast(1)); + pisa::put(out, static_cast(flag_bits)); + pisa::write( + out, reinterpret_cast(&::pisa::lt::PADDING), ::pisa::lt::PADDING_LENGTH + ); + } + + void write_offsets(std::ostream& out) { + for (auto const& offset: m_offsets) { + pisa::write(out, reinterpret_cast(&offset), sizeof(Offset)); + } + } + + public: + explicit LookupTableEncoderV1(::pisa::lt::v1::Flags flags) : m_flags(flags) {} + + virtual ~LookupTableEncoderV1() = default; + + void virtual insert(std::span payload) { + if (m_flags.sorted()) { + auto prev = std::span(m_payloads).subspan(m_offsets.size() > 1 ? 
m_offsets[m_offsets.size() - 2] : 0); // start of the previously inserted payload (m_offsets.back() is its end) + if (m_offsets.size() > 1 && !pisa::lex_lt(prev, payload)) { // strict order required: prev < payload; first payload is always accepted + throw std::invalid_argument("payloads not strictly sorted in sorted table"); + } + } else { + auto payload_as_str = + std::string_view(reinterpret_cast(payload.data()), payload.size()); + if (auto pos = m_inserted.find(payload_as_str); pos != m_inserted.end()) { + throw std::invalid_argument("payload duplicate"); + } + m_inserted.insert(payload_as_str); + } + m_offsets.push_back(m_offsets.back() + payload.size()); + m_payloads.insert(m_payloads.end(), payload.begin(), payload.end()); + } + + void virtual encode(std::ostream& out) { + encode_header(out); + std::uint64_t size = m_offsets.size() - 1; + pisa::write(out, reinterpret_cast(&size), sizeof(size)); + write_offsets(out); + pisa::write(out, reinterpret_cast(m_payloads.data()), m_payloads.size()); + } +}; + +LookupTableEncoder::LookupTableEncoder(std::unique_ptr<::pisa::lt::detail::BaseLookupTableEncoder> impl) + : 
m_impl(std::move(impl)) {} + +LookupTableEncoder LookupTableEncoder::v1(::pisa::lt::v1::Flags flags) { + if (flags.wide_offsets()) { + return LookupTableEncoder(std::make_unique>(flags)); + } + return LookupTableEncoder(std::make_unique>(flags)); +} + +auto LookupTableEncoder::insert(std::span payload) -> LookupTableEncoder& { + m_impl->insert(payload); + return *this; +} + +auto LookupTableEncoder::encode(std::ostream& out) -> LookupTableEncoder& { + m_impl->encode(out); + return *this; +} + +} // namespace pisa diff --git a/src/stream.cpp b/src/stream.cpp new file mode 100644 index 00000000..b9b14895 --- /dev/null +++ b/src/stream.cpp @@ -0,0 +1,41 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "fmt/core.h" +#include "pisa/stream.hpp" + +namespace pisa { + +FileOpenError::FileOpenError(std::string const& file) + : m_message(fmt::format("failed to open file: {}", file)) {} + +auto FileOpenError::what() const noexcept -> char const* { + return m_message.c_str(); +} + +auto WriteError::what() const noexcept -> char const* { + return "failed to write to stream"; +} + +auto open_file_w(std::string const& filename) -> std::ofstream { + auto stream = std::ofstream(filename); + if (!stream) { + throw FileOpenError(filename); + } + return stream; +} + +} // namespace pisa diff --git a/test/cli/run.sh b/test/cli/run.sh index fb61a5a8..6a837a23 100755 --- a/test/cli/run.sh +++ b/test/cli/run.sh @@ -5,3 +5,4 @@ bash "$DIR/setup.sh" bats "$DIR/test_taily_stats.sh" bats "$DIR/test_count_postings.sh" bats "$DIR/test_wand_data.sh" +bats "$DIR/test_lookup_table.sh" diff --git a/test/cli/test_lookup_table.sh b/test/cli/test_lookup_table.sh new file mode 100755 index 00000000..a594e929 --- /dev/null +++ b/test/cli/test_lookup_table.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bats + +set +x + +sorted_values=$(cat < "$input_file" + + # build + lookup-table build -o "$lt" < "$input_file" + + # verify size + count_expected_bytes "$input_file" > "$workdir/expected_bytes" + wc -c > "$workdir/actual_bytes" < "$lt" + diff "$workdir/expected_bytes" "$workdir/actual_bytes" + + # get by index + assert_eq "$(lookup-table get "$lt" 0)" adipiscing + assert_eq "$(lookup-table get "$lt" 10)" erat + assert_eq "$(lookup-table get "$lt" 15)" ipsum + assert_eq "$(lookup-table get "$lt" 16)" lorem + assert_eq "$(lookup-table get "$lt" 22)" ultricies + # out of bounds exits with a failure exit code and prints out error + run lookup-table get "$lt" 23 + (( status != 0 )) + assert_eq "${lines[0]}" 'error: accessing element 23 in a table of size 23' + + # find + assert_eq "$(lookup-table find "$lt" adipiscing)" 0 + assert_eq "$(lookup-table find "$lt" erat)" 10 + assert_eq 
"$(lookup-table find "$lt" ipsum)" 15 + assert_eq "$(lookup-table find "$lt" lorem)" 16 + assert_eq "$(lookup-table find "$lt" ultricies)" 22 + # no element found + run lookup-table find "$lt" zonk + (( status != 0 )) + assert_eq "${lines[0]}" "error: value 'zonk' not found" + + # print + + lookup-table print "$lt" > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 5 --to 17 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --from 5 --count 13 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --to 10 > "$workdir/printed" + diff "$workdir/printed" <(head -11 "$workdir/input") + + lookup-table print "$lt" --count 10 > "$workdir/printed" + diff "$workdir/printed" <(head -10 "$workdir/input") +} + +@test "build unsorted input" { + workdir=$(mktemp -d) + input_file="$workdir/input" + lt="$workdir/lt" + + echo "$workdir" + printf "%s\n" "$unsorted_values" > "$input_file" + + # build + cat "$input_file" | lookup-table build -o "$lt" + + # verify size + count_expected_bytes "$input_file" > "$workdir/expected_bytes" + cat "$lt" | wc -c > "$workdir/actual_bytes" + diff "$workdir/expected_bytes" "$workdir/actual_bytes" + + # get by index + assert_eq "$(lookup-table get "$lt" 0)" arcu + assert_eq "$(lookup-table get "$lt" 10)" elit + assert_eq "$(lookup-table get "$lt" 15)" bibendum + assert_eq "$(lookup-table get "$lt" 16)" odor + assert_eq "$(lookup-table get "$lt" 22)" 
ridiculus + # out of bounds exits with a failure exit code and prints out error + run lookup-table get "$lt" 23 + (( status != 0 )) + assert_eq "${lines[0]}" 'error: accessing element 23 in a table of size 23' + + # find + assert_eq "$(lookup-table find "$lt" arcu)" 0 + assert_eq "$(lookup-table find "$lt" elit)" 10 + assert_eq "$(lookup-table find "$lt" bibendum)" 15 + assert_eq "$(lookup-table find "$lt" odor)" 16 + assert_eq "$(lookup-table find "$lt" ridiculus)" 22 + # no element found + run lookup-table find "$lt" zonk + (( status != 0 )) + assert_eq "${lines[0]}" "error: value 'zonk' not found" + + # print + + lookup-table print "$lt" > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 0 --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --from 5 --to 17 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --from 5 --count 13 > "$workdir/printed" + diff "$workdir/printed" <(head -18 "$workdir/input" | tail -13) + + lookup-table print "$lt" --to 22 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --count 23 > "$workdir/printed" + diff "$workdir/printed" "$workdir/input" + + lookup-table print "$lt" --to 10 > "$workdir/printed" + diff "$workdir/printed" <(head -11 "$workdir/input") + + lookup-table print "$lt" --count 10 > "$workdir/printed" + diff "$workdir/printed" <(head -10 "$workdir/input") +} diff --git a/test/test_lookup_table.cpp b/test/test_lookup_table.cpp new file mode 100644 index 00000000..30a56f43 --- /dev/null +++ b/test/test_lookup_table.cpp @@ -0,0 +1,365 @@ +// Copyright 2024 PISA developers +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define CATCH_CONFIG_MAIN
+#include "catch2/catch.hpp"
+
+#include <sstream>  // NOTE(review): these four header names are inferred from usage -- confirm
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "pisa/lookup_table.hpp"
+#include "pisa/span.hpp"
+
+using namespace std::literals;
+
+TEST_CASE("flags") {  // accessors must mirror the bits passed to the constructor
+    SECTION("defaults") {
+        auto default_flags = pisa::lt::v1::Flags();
+        REQUIRE_FALSE(default_flags.sorted());
+        REQUIRE_FALSE(default_flags.wide_offsets());
+    }
+    SECTION("sorted") {
+        auto flags = pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED);
+        REQUIRE(flags.sorted());
+        REQUIRE_FALSE(flags.wide_offsets());
+    }
+    SECTION("wide_offsets") {
+        auto flags = pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS);
+        REQUIRE_FALSE(flags.sorted());
+        REQUIRE(flags.wide_offsets());
+    }
+    SECTION("sorted + wide_offsets") {
+        auto flags =
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS);
+        REQUIRE(flags.sorted());
+        REQUIRE(flags.wide_offsets());
+    }
+}
+
+TEST_CASE("LookupTable::from") {  // validation performed by from_bytes() on raw input
+    SECTION("wrong identifier") {
+        auto bytes = std::vector{std::byte(0), std::byte(0), std::byte(0), std::byte(0)};
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "lookup table verification byte invalid: must be 0x87 but 0x0 given"
+        );
+    }
+    SECTION("invalid version 0") {
+        auto bytes =
+            std::vector{std::byte(0x87), std::byte(0), std::byte(0), std::byte(0)};
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "only version 1 is valid but 0 given"
+        );
+    }
+    SECTION("invalid version 2") {
+        auto bytes =
+            std::vector{std::byte(0x87), std::byte(2), std::byte(0), std::byte(0)};
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "only version 1 is valid but 2 given"
+        );
+    }
+    SECTION("padding is invalid") {
+        auto bytes =
+            std::vector{std::byte(0x87), std::byte(1), std::byte(0), std::byte(0)};
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "not enough bytes for header"
+        );
+        bytes = std::vector{
+            std::byte(0x87),
+            std::byte(1),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(1)
+        };
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "bytes 3-7 must be all 0 but are 0x0 0x0 0x0 0x0 0x1"
+        );
+        bytes = std::vector{
+            std::byte(0x87),
+            std::byte(1),
+            std::byte(0),
+            std::byte(1),
+            std::byte(2),
+            std::byte(3),
+            std::byte(4),
+            std::byte(5)
+        };
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "bytes 3-7 must be all 0 but are 0x1 0x2 0x3 0x4 0x5"
+        );
+    }
+    SECTION("empty table narrow offsets") {
+        auto bytes =
+            std::vector{std::byte(0x87), std::byte(1), std::byte(0), std::byte(0),
+                        std::byte(0), std::byte(0), std::byte(0), std::byte(0),
+                        std::byte(0), std::byte(0), std::byte(0), std::byte(0),
+                        std::byte(0), std::byte(0), std::byte(0), std::byte(0),
+                        std::byte(0), std::byte(0), std::byte(0), std::byte(0)};
+        auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size()));
+        REQUIRE(lt.size() == 0);
+    }
+    SECTION("empty table wide offsets") {
+        auto bytes = std::vector{
+            std::byte(0x87), std::byte(1), std::byte(pisa::lt::v1::flags::WIDE_OFFSETS),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0),
+            std::byte(0), std::byte(0), std::byte(0)
+        };
+        auto lt = pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size()));
+        REQUIRE(lt.size() == 0);
+    }
+    SECTION("empty table must have a single offset") {
+        auto bytes = std::vector{
+            std::byte(0x87),
+            std::byte(1),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0)
+        };
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "not enough bytes for offsets"
+        );
+    }
+    SECTION("not enough bytes for offsets") {
+        auto bytes = std::vector{
+            std::byte(0x87),
+            std::byte(1),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(1),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0),
+            std::byte(0)
+        };
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(std::span(bytes.data(), bytes.size())),
+            "not enough bytes for offsets"
+        );
+    }
+    SECTION("12 bytes is not enough for 3 wide offsets") {
+        /* clang-format off */
+        auto bytes = std::vector<std::uint8_t>{
+            // header
+            0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, 0, 0, 0, 0, 0,
+            // size
+            2, 0, 0, 0, 0, 0, 0, 0,
+            // offsets
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0
+        };
+        /* clang-format on */
+        REQUIRE_THROWS_WITH(
+            pisa::LookupTable::from_bytes(
+                std::span(reinterpret_cast<std::byte const*>(bytes.data()), bytes.size())
+            ),
+            "not enough bytes for offsets"
+        );
+    }
+    SECTION("12 bytes is enough for 3 narrow offsets") {
+        /* clang-format off */
+        auto bytes = std::vector<std::uint8_t>{
+            // header
+            0x87, 1, 0, 0, 0, 0, 0, 0,
+            // size
+            2, 0, 0, 0, 0, 0, 0, 0,
+            // offsets
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0
+        };
+        /* clang-format on */
+        auto lt = pisa::LookupTable::from_bytes(
+            std::span(reinterpret_cast<std::byte const*>(bytes.data()), bytes.size())
+        );
+        REQUIRE(lt.size() == 2);
+    }
+    SECTION("[a, bcd, efgh] with wide offsets") {
+        /* clang-format off */
+        auto bytes = std::vector<std::uint8_t>{
+            // header
+            0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, 0, 0, 0, 0, 0,
+            // size
+            3, 0, 0, 0, 0, 0, 0, 0,
+            // offsets
+            0, 0, 0, 0, 0, 0, 0, 0,
+            1, 0, 0, 0, 0, 0, 0, 0,
+            4, 0, 0, 0, 0, 0, 0, 0,
+            8, 0, 0, 0, 0, 0, 0, 0,
+            // payloads
+            'a',
+            'b', 'c', 'd',
+            'e', 'f', 'g', 'h'
+        };
+        /* clang-format on */
+        auto lt = pisa::LookupTable::from_bytes(
+            std::span(reinterpret_cast<std::byte const*>(bytes.data()), bytes.size())
+        );
+        REQUIRE(lt.size() == 3);
+        REQUIRE(
+            lt[0]
+            == std::span(reinterpret_cast<std::byte const*>(bytes.data()) + 48, 1)
+        );
+        REQUIRE(
+            lt[1]
+            == std::span(reinterpret_cast<std::byte const*>(bytes.data()) + 49, 3)
+        );
+        REQUIRE(
+            lt[2]
+            == std::span(reinterpret_cast<std::byte const*>(bytes.data()) + 52, 4)
+        );
+    }
+}
+
+TEST_CASE("LookupTable v1") {  // encoder output and lookups on encoded tables
+    SECTION("encode [a, bcd, efgh]") {
+        /* clang-format off */
+        auto expected = std::vector<std::uint8_t>{
+            // header
+            0x87, 1, pisa::lt::v1::flags::WIDE_OFFSETS, 0, 0, 0, 0, 0,
+            // size
+            3, 0, 0, 0, 0, 0, 0, 0,
+            // offsets
+            0, 0, 0, 0, 0, 0, 0, 0,
+            1, 0, 0, 0, 0, 0, 0, 0,
+            4, 0, 0, 0, 0, 0, 0, 0,
+            8, 0, 0, 0, 0, 0, 0, 0,
+            // payloads
+            'a',
+            'b', 'c', 'd',
+            'e', 'f', 'g', 'h'
+        };
+        /* clang-format on */
+        std::ostringstream out;
+        auto encoder =
+            pisa::LookupTableEncoder::v1(pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS));
+        std::vector<std::string> payloads{"a", "bcd", "efgh"};
+        encoder.insert_span(std::span(payloads.data(), payloads.size()));
+        encoder.encode(out);
+        std::string bytes = out.str();
+        auto actual = std::as_bytes(std::span(bytes.data(), bytes.size()));
+        REQUIRE(actual == std::as_bytes(std::span(expected.data(), expected.size())));
+    }
+    SECTION("wrong order in sorted table") {
+        std::ostringstream out;
+        auto encoder = pisa::LookupTableEncoder::v1(pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED));
+        std::vector<std::string> payloads{"bcd", "a", "efgh"};
+        REQUIRE_THROWS_WITH(
+            encoder.insert_span(std::span(payloads.data(), payloads.size())),
+            "payloads not strictly sorted in sorted table"
+        );
+    }
+    SECTION("detects duplicates") {
+        auto flags = GENERATE(
+            pisa::lt::v1::Flags(),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS)
+        );
+        std::ostringstream out;
+        auto encoder = pisa::LookupTableEncoder::v1(flags);
+        std::vector<std::string> payloads{"a", "b", "b", "c"};
+        REQUIRE_THROWS_WITH(
+            encoder.insert_span(std::span(payloads.data(), payloads.size())),
+            flags.sorted() ? "payloads not strictly sorted in sorted table" : "payload duplicate"
+        );
+    }
+    SECTION("operator[]") {
+        auto flags = GENERATE(  // the section body runs once per generated value
+            pisa::lt::v1::Flags(),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS)
+        );
+        std::ostringstream out;
+        std::vector<std::string> payloads{"a", "bcd", "efgh"};
+        pisa::LookupTableEncoder::v1(flags)  // encode with the generated flags, not the defaults
+            .insert_span(std::span(payloads.data(), payloads.size()))
+            .encode(out);
+        std::string bytes = out.str();
+
+        auto lt = pisa::LookupTable::from_bytes(std::as_bytes(std::span(bytes)));
+
+        REQUIRE(lt.at<std::string_view>(0) == "a");
+        REQUIRE(lt.at<std::string_view>(1) == "bcd");
+        REQUIRE(lt.at<std::string_view>(2) == "efgh");
+
+        REQUIRE(lt.at<std::string>(0) == "a");
+        REQUIRE(lt.at<std::string>(1) == "bcd");
+        REQUIRE(lt.at<std::string>(2) == "efgh");
+
+        auto val = lt.at<std::span<const char>>(0);
+        REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'a'});
+        val = lt.at<std::span<const char>>(1);
+        REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'b', 'c', 'd'});
+        val = lt.at<std::span<const char>>(2);
+        REQUIRE(std::vector(val.begin(), val.end()) == std::vector{'e', 'f', 'g', 'h'});
+    }
+    SECTION("find()") {
+        auto flags = GENERATE(  // the section body runs once per generated value
+            pisa::lt::v1::Flags(),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::WIDE_OFFSETS),
+            pisa::lt::v1::Flags(pisa::lt::v1::flags::SORTED | pisa::lt::v1::flags::WIDE_OFFSETS)
+        );
+        std::ostringstream out;
+        std::vector<std::string> payloads{"a", "bcd", "efgh"};
+        pisa::LookupTableEncoder::v1(flags)  // encode with the generated flags, not the defaults
+            .insert_span(std::span(payloads.data(), payloads.size()))
+            .encode(out);
+        std::string bytes = out.str();
+
+        auto lt = pisa::LookupTable::from_bytes(std::as_bytes(std::span(bytes)));
+
+        REQUIRE_FALSE(lt.find(""sv).has_value());
+        REQUIRE(lt.find("a"sv) == 0);
+        REQUIRE_FALSE(lt.find("aa"sv).has_value());
+        REQUIRE(lt.find("bcd"sv) == 1);
+        REQUIRE_FALSE(lt.find("bcde"sv).has_value());
+        REQUIRE(lt.find("efgh"sv) == 2);
+        REQUIRE_FALSE(lt.find("efghi"sv).has_value());
+    }
+}
diff --git a/test/test_span.cpp b/test/test_span.cpp
index 4b0a5b9e..23c21576 100644
--- a/test/test_span.cpp
+++ b/test/test_span.cpp
@@ -1,6 +1,23 @@
+// Copyright 2024 PISA developers
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #define CATCH_CONFIG_MAIN
 #include "catch2/catch.hpp"
 
+#include <algorithm>  // NOTE(review): these two header names are inferred from usage -- confirm
+#include <string_view>
+
 #include "span.hpp"
 
 TEST_CASE("pisa::at", "[span]") {
@@ -13,6 +30,28 @@ TEST_CASE("pisa::at", "[span]") {
     REQUIRE_THROWS_AS(pisa::at(span, 4), std::out_of_range);
 }
 
+TEST_CASE("pisa::subspan", "[span]") {
+    std::vector vec{0, 1, 2, 3};
+    auto span = std::span{vec.data(), vec.size()};
+    REQUIRE(pisa::subspan_or_throw(span, 0, 0) == std::span(vec.data(), 0));
+    REQUIRE(pisa::subspan_or_throw(span, 0, 1) == std::span(vec.data(), 1));
+    REQUIRE(pisa::subspan_or_throw(span, 1, 0) == std::span(vec.data() + 1, 0));
+    REQUIRE(pisa::subspan_or_throw(span, 0, 4) == std::span(vec.data(), 4));
+    REQUIRE(pisa::subspan_or_throw(span, 1, 3) == std::span(vec.data() + 1, 3));
+    REQUIRE(pisa::subspan_or_throw(span, 0, 3) == std::span(vec.data(), 3));
+    REQUIRE(pisa::subspan_or_throw(span, 2, 2) == std::span(vec.data() + 2, 2));
+    REQUIRE(pisa::subspan_or_throw(span, 3, 1) == std::span(vec.data() + 3, 1));
+    REQUIRE(pisa::subspan_or_throw(span, 4, 0) == std::span(vec.data() + 4, 0));
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 0, 6), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 0, 5), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 1, 4), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 2, 3), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 3, 2), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 4, 1), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 5, 0), std::out_of_range);
+    REQUIRE_THROWS_AS(pisa::subspan_or_throw(span, 5, 1), std::out_of_range);
+}
+
 TEST_CASE("operator== for spans", "[span]") {
     std::vector vec1{0, 1, 2, 3};
     auto span1 = std::span(vec1.data(), vec1.size());
@@ -25,3 +64,35 @@ TEST_CASE("operator== for spans", "[span]") {
     REQUIRE(span2 != span3);
     REQUIRE(span1 == std::span(vec1.data(), vec1.size()));
 }
+
+TEST_CASE("lex_lt", "[span]") {
+    std::string_view aardvark = "aardvark";
+    std::string_view dog = "dog";
+    std::string_view zebra = "zebra";
+
+    REQUIRE_FALSE(pisa::lex_lt(std::span(aardvark), std::span(aardvark)));
+    REQUIRE(pisa::lex_lt(std::span(aardvark), std::span(dog)));
+    REQUIRE(pisa::lex_lt(std::span(aardvark), std::span(zebra)));
+
+    REQUIRE_FALSE(pisa::lex_lt(std::span(dog), std::span(dog)));
+    REQUIRE_FALSE(pisa::lex_lt(std::span(dog), std::span(aardvark)));
+    REQUIRE(pisa::lex_lt(std::span(dog), std::span(zebra)));
+
+    REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(zebra)));
+    REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(aardvark)));
+    REQUIRE_FALSE(pisa::lex_lt(std::span(zebra), std::span(dog)));
+}
+
+TEST_CASE("lex_lt sort", "[span]") {
+    std::vector<std::span<const char>> animals{
+        "aardvark", "dog", "zebra", "pelican", "goose", "geese", "cat"
+    };
+    std::sort(animals.begin(), animals.end(), pisa::lex_lt);
+    REQUIRE(animals[0] == std::span("aardvark"));
+    REQUIRE(animals[1] == std::span("cat"));
+    REQUIRE(animals[2] == std::span("dog"));
+    REQUIRE(animals[3] == std::span("geese"));
+    REQUIRE(animals[4] == std::span("goose"));
+    REQUIRE(animals[5] == std::span("pelican"));
+    REQUIRE(animals[6] == std::span("zebra"));
+}
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index b60dc377..c12894fb 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -32,3 +32,4 @@ add_tool(kth_threshold kth_threshold.cpp)
 add_tool(taily-stats taily_stats.cpp)
 add_tool(taily-thresholds taily_thresholds.cpp)
 add_tool(extract-maxscores extract_maxscores.cpp)
+add_tool(lookup-table lookup_table.cpp)
diff --git a/tools/lookup_table.cpp b/tools/lookup_table.cpp
new file mode 100644
index 00000000..40be555c
--- /dev/null
+++ b/tools/lookup_table.cpp
@@ -0,0 +1,197 @@
+// NOTE(review): the header names in the four angle-bracket includes below were lost when this
+// patch was extracted; they are inferred from the names used in this file (std::cin/std::cout,
+// CLI::App, fmt::format, mio::mmap_source) and should be confirmed against the repository.
+#include <iostream>
+
+#include <CLI/CLI.hpp>
+#include <fmt/format.h>
+#include <mio/mmap.hpp>
+
+#include "app.hpp"
+#include "pisa/io.hpp"
+#include "pisa/lookup_table.hpp"
+
+/// Command-line values shared by the subcommands; each subcommand binds only the fields it uses.
+struct Arguments {
+    /// Path of the lookup table file: output of `build`, input of `get`, `find`, and `print`.
+    std::string lexicon_file{};
+    /// Value to look up (`find`).
+    std::string value{};
+    /// Position to read (`get`) or first position to print (`print --from`).
+    std::size_t idx = 0;
+    /// Last position to print, inclusive (`print --to`).
+    std::optional<std::size_t> last{};
+    /// Number of values to print (`print --count`).
+    std::optional<std::size_t> count{};
+};
+
+/// Subcommand handles returned by `add_subcommand`; `main` checks them to pick the action.
+struct Commands {
+    CLI::App* build{};
+    CLI::App* get{};
+    CLI::App* find{};
+    CLI::App* print{};
+};
+
+/// Registers `build`, which requires an output path (`-o,--output`).
+auto build_cmd(CLI::App& app, Arguments& args) {
+    auto cmd = app.add_subcommand("build", "Builds a lookup table from stdin");
+    cmd->add_option("-o,--output", args.lexicon_file, "Binary output file")->required();
+    return cmd;
+}
+
+/// Registers `get`, which takes a table path and a position.
+auto get_cmd(CLI::App& app, Arguments& args) {
+    auto cmd = app.add_subcommand("get", "Retrieves the value at the given position");
+    cmd->add_option("table", args.lexicon_file, "Path to lookup table")->required();
+    cmd->add_option("position", args.idx, "Position")->required();
+    return cmd;
+}
+
+/// Registers `find`, which takes a table path and the value to search for.
+auto find_cmd(CLI::App& app, Arguments& args) {
+    auto cmd = app.add_subcommand("find", "Finds the given value and returns its position");
+    cmd->add_option("table", args.lexicon_file, "Path to lookup table")->required();
+    cmd->add_option("value", args.value, "Value to find")->required();
+    return cmd;
+}
+
+/// Registers `print`; `--to` and `--count` are mutually exclusive ways to bound the range.
+auto print_cmd(CLI::App& app, Arguments& args) {
+    auto cmd = app.add_subcommand("print", "Prints values");
+    cmd->add_option("table", args.lexicon_file, "Path to lookup table")->required();
+    cmd->add_option("--from", args.idx, "Starting position");
+    auto to = cmd->add_option("--to", args.last, "Last position");
+    cmd->add_option("--count", args.count, "Number of values to print")->excludes(to);
+    return cmd;
+}
+
+/// Reads one value per line from stdin and writes them as a v1 lookup table to
+/// `args.lexicon_file`.
+///
+/// SORTED is set only if every value is strictly greater than the one before it; WIDE_OFFSETS
+/// is set once the combined payload size reaches 2^32 - 1 bytes.
+///
+/// \throws std::runtime_error if the output file cannot be opened.
+void build(Arguments const& args) {
+    std::vector<std::string> values;
+    std::size_t payload_size = 0;
+    bool sorted = true;
+    pisa::io::for_each_line(std::cin, [&values, &payload_size, &sorted](std::string&& value) {
+        payload_size += value.size();
+        // Compare with the previous value *before* `value` is moved from; comparing afterwards
+        // would read a moved-from string and test the new element against itself.
+        if (sorted && !values.empty() && value <= values.back()) {
+            sorted = false;
+        }
+        values.push_back(std::move(value));
+    });
+    std::uint8_t flags = 0;
+    if (sorted) {
+        flags |= ::pisa::lt::v1::flags::SORTED;
+    }
+    // Use an explicitly 64-bit constant: `1UL << 32` is undefined where `unsigned long` is 32 bits.
+    if (payload_size >= (std::uint64_t{1} << 32) - 1) {
+        flags |= ::pisa::lt::v1::flags::WIDE_OFFSETS;
+    }
+    auto encoder = ::pisa::LookupTableEncoder::v1(::pisa::lt::v1::Flags(flags));
+    for (auto& value: values) {
+        encoder.insert(value);
+    }
+    // The table is a binary format, so the stream must not translate line endings.
+    std::ofstream out(args.lexicon_file, std::ios::binary);
+    if (!out) {
+        throw std::runtime_error(fmt::format("cannot open '{}' for writing", args.lexicon_file));
+    }
+    encoder.encode(out);
+}
+
+/// Prints the value stored at position `idx` (without a trailing newline).
+void get(pisa::LookupTable const& table, std::size_t idx) {
+    auto value = table.at<std::string_view>(idx);
+    std::cout << value;
+}
+
+/// Prints the position of `value`.
+///
+/// \throws std::runtime_error if the table does not contain `value`.
+void find(pisa::LookupTable const& table, std::string const& value) {
+    auto idx = table.find(value);
+    if (idx.has_value()) {
+        std::cout << *idx;
+    } else {
+        throw std::runtime_error(fmt::format("value '{}' not found", value));
+    }
+}
+
+/// Prints the values in positions `args.idx` through the last requested position, one per line.
+///
+/// The last position is `args.last` if given, else `args.idx + *args.count - 1` if a count is
+/// given, else the final position in the table.
+///
+/// \throws std::runtime_error if the first or last position is out of bounds.
+void print(pisa::LookupTable const& table, Arguments const& args) {
+    if (args.idx >= table.size()) {
+        throw std::runtime_error(fmt::format("starting position {} is out of bounds", args.idx));
+    }
+    std::size_t last = table.size() - 1;
+    if (args.last.has_value()) {
+        if (*args.last >= table.size()) {
+            throw std::runtime_error(fmt::format("last position {} is out of bounds", *args.last));
+        }
+        last = *args.last;
+    }
+    if (args.count.has_value()) {
+        if (*args.count == 0) {
+            // Nothing to print; without this, `idx + count - 1` wraps around when `idx` is 0.
+            return;
+        }
+        // `args.idx < table.size()` was checked above, so this subtraction cannot wrap.
+        if (*args.count > table.size() - args.idx) {
+            throw std::runtime_error(
+                fmt::format("last position {} is out of bounds", args.idx + *args.count - 1)
+            );
+        }
+        last = args.idx + *args.count - 1;
+    }
+    for (auto pos = args.idx; pos <= last; ++pos) {
+        std::cout << table.at<std::string_view>(pos) << '\n';
+    }
+}
+
+/// Parses the command line and runs the selected subcommand; a `std::exception` is reported on
+/// stderr as `error: <message>` and turned into a failure exit code.
+int main(int argc, char** argv) {
+    Arguments args;
+    Commands cmds;
+
+    pisa::App app{"Builds, prints, or queries lookup table"};
+    app.require_subcommand();
+    cmds.build = build_cmd(app, args);
+    cmds.get = get_cmd(app, args);
+    cmds.find = find_cmd(app, args);
+    cmds.print = print_cmd(app, args);
+    CLI11_PARSE(app, argc, argv);
+
+    try {
+        if (*cmds.build) {
+            build(args);
+        } else {
+            mio::mmap_source mem(args.lexicon_file.c_str());
+            auto table = pisa::LookupTable::from_bytes(
+                std::span(reinterpret_cast<std::byte const*>(mem.data()), mem.size())
+            );
+            if (*cmds.get) {
+                get(table, args.idx);
+            } else if (*cmds.find) {
+                find(table, args.value);
+            } else if (*cmds.print) {
+                print(table, args);
+            }
+        }
+    } catch (std::exception const& err) {
+        std::cerr << "error: " << err.what() << '\n';
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}