From a94d962bda8719b6c7abc5e2d803ec19ec7ca6cf Mon Sep 17 00:00:00 2001 From: Dylon Edwards Date: Wed, 14 Feb 2024 13:24:13 -0500 Subject: [PATCH] Moves distance metrics into their own namespace, liblevenshtein::distance, to promote their independence from the core library; removes workaround for absl linking for protobuf; updates the README.md --- CMakeLists.txt | 2 +- README.md | 116 +++++++++++++----- example/CMakeLists.txt | 2 - proto/CMakeLists.txt | 10 +- src/liblevenshtein/distance/distance.h | 2 +- .../distance/memoized_distance.cpp | 5 +- .../distance/memoized_distance.h | 28 ++--- .../distance/merge_and_split_distance.cpp | 4 +- .../distance/merge_and_split_distance.h | 12 +- .../distance/standard_distance.cpp | 4 +- .../distance/standard_distance.h | 12 +- .../distance/symmetric_pair.cpp | 8 +- src/liblevenshtein/distance/symmetric_pair.h | 35 +++--- .../distance/transposition_distance.cpp | 5 +- .../distance/transposition_distance.h | 13 +- .../test_merge_and_split_distance.cpp | 3 +- .../distance/test_standard_distance.cpp | 3 +- .../distance/test_symmetric_pair.cpp | 10 +- .../distance/test_transposition_distance.cpp | 3 +- .../transducer/test_transducer.cpp | 9 +- 20 files changed, 168 insertions(+), 118 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eaa997b..2645c3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ set(CMAKE_VERBOSE_MAKEFILE ON) include(GNUInstallDirs) -option(BUILD_BASELINE_METRICS "Builds baseline distance metrics for validation" ON) +option(BUILD_BASELINE_METRICS "Builds baseline distance metrics for validation" OFF) option(BUILD_TESTS "Build liblevenshtein testing suite" OFF) option(ENABLE_TEST_COVERAGE "Generate test coverage report" OFF) option(ENABLE_LINTING "Enables the source code linter" OFF) diff --git a/README.md b/README.md index c1ffba0..67f0201 100644 --- a/README.md +++ b/README.md @@ -10,21 +10,22 @@ A library for generating Finite State Transducers based on Levenshtein Automata. 
NOTE: This library is currently in rc phase. I'll have it production ready as -soon as possible. Currently, the top-level components have >90% test coverage -and the library is usable as described below. +soon as possible. Currently, there is >90% test coverage over the sources and +the library is usable as described below. -To make my life easier, this library takes advantage of C++20 features. If you -need compatibility with an older standard, please either submit a pull request -or create an issue stating the standard you need compatibility with and I'll get -around to adding its support when I get time. +Due to limited resources on my part, this library requires C++20 features (or +whichever is the latest standard). If you need compatibility with an older +standard, please either submit a pull request (preferably) or create an issue +stating the standard you need compatibility with and I will comply if I can. For a demonstration, please reference the [example app](example/). ## Initialization To ease dependency management during development, -[Anaconda](https://www.anaconda.com/) is used. If you do not have a working -installation, I recommend the +[Anaconda](https://www.anaconda.com/) is used but should not be required if you +have the necessary libraries installed. If you do not have a working +[Anaconda](https://www.anaconda.com/) installation, I recommend the [Mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html) variant: @@ -40,6 +41,8 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge bash Miniforge3-MacOSX-arm64.sh -b ``` +TODO: Add instructions for Windows. + Initialize the `base` environment: ```bash @@ -74,7 +77,7 @@ conda activate ll-cpp ```shell mkdir build cd build -cmake -D CMAKE_BUILD_TYPE=Debug -D CMAKE_INSTALL_PREFIX=/usr/local .. +cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX .. 
make make install ``` @@ -135,36 +138,36 @@ ${CMAKE_INSTALL_PREFIX} 11 directories, 37 files ``` -### Disabling tests +### Enabling tests -If you want to build the library without tests, use the same instructions but -add the CMake option `BUILD_TESTS=OFF`, as described below: +If you want to build the library with tests, use the same instructions but +add the CMake option `BUILD_TESTS=ON`, as described below: ```shell -# ... -cmake -D CMAKE_BUILD_TYPE=Debug \ - -D CMAKE_INSTALL_PREFIX=/usr/local \ - -D BUILD_TESTS=OFF \ +cmake -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DBUILD_TESTS=ON \ .. -# ... ``` -### Disabling baseline metrics +### Enabling baseline metrics -If you want to disable the baseline metrics used for validation, you need to -disable both tests and the metrics. If you disable the metrics but enable tests -then they will be built anyway because they are required for the tests. +If you want to enable the baseline metrics for validation, you must pass +`-DBUILD_BASELINE_METRICS=ON` to CMake: ```shell -# ... -cmake -D CMAKE_BUILD_TYPE=Debug \ - -D CMAKE_INSTALL_PREFIX=/usr/local \ - -D BUILD_BASELINE_METRICS=OFF \ - -D BUILD_TESTS=OFF \ +cmake -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DBUILD_BASELINE_METRICS=ON \ .. -# ... ``` +The baseline metrics are intended for validation of the search results but might +be useful if you need to compute edit distances between individual pairs of terms. + +NOTE: The baseline metrics are required by the tests, so they are built +automatically whenever tests are enabled, even if not explicitly enabled. + ## Usage ### Algorithms @@ -219,7 +222,47 @@ operation is an edit operation that errs in a penalty of 1 unit. ### Example +```cmake +# file: CMakeLists.txt + +cmake_minimum_required(VERSION 3.20 FATAL_ERROR) + +project(liblevenshtein-demo + VERSION 1.0.0 + DESCRIPTION "Demonstrates how to use liblevenshtein-cpp." 
+ HOMEPAGE_URL "https://github.com/universal-automata/liblevenshtein-cpp" + LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +SET(CMAKE_CXX_FLAGS_DEBUG "-g -O0") +SET(CMAKE_C_FLAGS_DEBUG "-g -O0") + +set(CMAKE_COMPILE_WARNING_AS_ERROR ON) + +set(CMAKE_VERBOSE_MAKEFILE ON) + +include(GNUInstallDirs) + +find_package(Protobuf REQUIRED) +find_package(liblevenshtein REQUIRED) + +add_executable(${PROJECT_NAME} + "command_line.cpp" + "main.cpp") + +target_link_libraries(${PROJECT_NAME} + PRIVATE + protobuf::libprotobuf + levenshtein) + +``` + ```c++ +// file: main.cpp + #include #include #include @@ -289,7 +332,7 @@ int main(int argc, char *argv[]) { */ // save the dictionary for reuse - serialize_protobuf(dawg, serialization_path); + ll::serialize_protobuf(dawg, serialization_path); delete dawg; @@ -308,6 +351,17 @@ int main(int argc, char *argv[]) { ``` ### Dependencies -1. [Google Test](https://github.com/google/googletest) -2. [RapidCheck](https://github.com/emil-e/rapidcheck) -3. [yaml-cpp](https://github.com/jbeder/yaml-cpp) +1. [CMake](https://cmake.org/) +2. [Make](https://www.gnu.org/software/make/) +3. C++ Compiler + - Linux + - [g++](https://gcc.gnu.org/) + - [clang++](https://clang.llvm.org/) + - MacOS + - [clang++](https://clang.llvm.org/) + - Windows + - [vc++](https://visualstudio.microsoft.com/) +4. [Protocol Buffers](https://protobuf.dev/) +5. [Google Test](https://github.com/google/googletest) +6. [RapidCheck](https://github.com/emil-e/rapidcheck) +7. 
[yaml-cpp](https://github.com/jbeder/yaml-cpp) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 91e022b..52609e8 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -19,7 +19,6 @@ set(CMAKE_VERBOSE_MAKEFILE ON) include(GNUInstallDirs) -find_package(absl REQUIRED) # workaround for protobuf linking bug find_package(Protobuf REQUIRED) find_package(liblevenshtein REQUIRED) @@ -29,6 +28,5 @@ add_executable(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME} PRIVATE - absl::log_internal_check_op # workaround for protobuf linking bug protobuf::libprotobuf levenshtein) diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 60d9de6..4a3123f 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -12,20 +12,14 @@ target_sources(levenshtein find_package(Protobuf REQUIRED) -find_package(absl REQUIRED) # workaround for protobuf linking bug - target_include_directories(levenshtein PUBLIC "${Protobuf_INCLUDE_DIRS}" $ - $ -) + $) target_link_libraries(levenshtein PUBLIC - protobuf::libprotobuf - PRIVATE - absl::log_internal_check_op # workaround for protobuf linking bug -) + protobuf::libprotobuf) protobuf_generate( TARGET levenshtein diff --git a/src/liblevenshtein/distance/distance.h b/src/liblevenshtein/distance/distance.h index bbff75e..1d14f45 100644 --- a/src/liblevenshtein/distance/distance.h +++ b/src/liblevenshtein/distance/distance.h @@ -3,7 +3,7 @@ #include -namespace liblevenshtein { +namespace liblevenshtein::distance { class Distance { public: diff --git a/src/liblevenshtein/distance/memoized_distance.cpp b/src/liblevenshtein/distance/memoized_distance.cpp index a03567b..1d429db 100644 --- a/src/liblevenshtein/distance/memoized_distance.cpp +++ b/src/liblevenshtein/distance/memoized_distance.cpp @@ -4,8 +4,7 @@ using namespace std::literals; - -namespace liblevenshtein { +namespace liblevenshtein::distance { auto MemoizedDistance::operator()(const std::string &v, const std::string &w) -> std::size_t { @@ -36,4 +35,4 
@@ auto MemoizedDistance::f(const std::string &u, std::size_t const t) -> std::stri return ""; } -} // namespace liblevenshtein +} // namespace liblevenshtein::distance diff --git a/src/liblevenshtein/distance/memoized_distance.h b/src/liblevenshtein/distance/memoized_distance.h index d3c159e..f88c7b1 100644 --- a/src/liblevenshtein/distance/memoized_distance.h +++ b/src/liblevenshtein/distance/memoized_distance.h @@ -8,23 +8,23 @@ #include "liblevenshtein/distance/distance.h" #include "liblevenshtein/distance/symmetric_pair.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { - class MemoizedDistance : public Distance { - public: - auto operator()(const std::string &v, const std::string &w) -> std::size_t override; +class MemoizedDistance : public Distance { +public: + auto operator()(const std::string &v, const std::string &w) -> std::size_t override; - protected: - auto get(const SymmetricPair &key, std::size_t &distance) -> bool; - auto set(const SymmetricPair &key, const std::size_t &distance) - -> std::size_t; - static auto f(const std::string &u, std::size_t t) -> std::string; +protected: + auto get(const SymmetricPair &key, std::size_t &distance) -> bool; + auto set(const SymmetricPair &key, const std::size_t &distance) + -> std::size_t; + static auto f(const std::string &u, std::size_t t) -> std::string; - private: - std::unordered_map memo; - mutable std::shared_mutex mutex; - }; +private: + std::unordered_map memo; + mutable std::shared_mutex mutex; +}; -} // namespace liblevenshtein +} // namespace liblevenshtein::distance #endif // LIBLEVENSHTEIN_DISTANCE_MEMOIZED_DISTANCE_H diff --git a/src/liblevenshtein/distance/merge_and_split_distance.cpp b/src/liblevenshtein/distance/merge_and_split_distance.cpp index b66a7f3..cf6a5bb 100644 --- a/src/liblevenshtein/distance/merge_and_split_distance.cpp +++ b/src/liblevenshtein/distance/merge_and_split_distance.cpp @@ -1,7 +1,7 @@ #include "liblevenshtein/distance/merge_and_split_distance.h" 
#include "liblevenshtein/distance/symmetric_pair.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { auto MergeAndSplitDistance::between(std::string v, std::string w) -> std::size_t { @@ -95,4 +95,4 @@ auto MergeAndSplitDistance::between(std::string v, std::string w) return set(key, 1 + min_distance); } -} // namespace liblevenshtein +} // namespace liblevenshtein::distance diff --git a/src/liblevenshtein/distance/merge_and_split_distance.h b/src/liblevenshtein/distance/merge_and_split_distance.h index 302d1fc..9f1d934 100644 --- a/src/liblevenshtein/distance/merge_and_split_distance.h +++ b/src/liblevenshtein/distance/merge_and_split_distance.h @@ -5,13 +5,13 @@ #include "liblevenshtein/distance/memoized_distance.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { - class MergeAndSplitDistance : public MemoizedDistance { - public: - auto between(std::string v, std::string w) -> std::size_t override; - }; +class MergeAndSplitDistance : public MemoizedDistance { +public: + auto between(std::string v, std::string w) -> std::size_t override; +}; -} // namespace liblevenshtein +} // namespace liblevenshtein::distance #endif // LIBLEVENSHTEIN_DISTANCE_MERGE_AND_SPLIT_DISTANCE_H diff --git a/src/liblevenshtein/distance/standard_distance.cpp b/src/liblevenshtein/distance/standard_distance.cpp index c63b85d..7d31aef 100644 --- a/src/liblevenshtein/distance/standard_distance.cpp +++ b/src/liblevenshtein/distance/standard_distance.cpp @@ -1,7 +1,7 @@ #include "liblevenshtein/distance/standard_distance.h" #include "liblevenshtein/distance/symmetric_pair.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { auto StandardDistance::between(std::string v, std::string w) -> std::size_t { const SymmetricPair key(v, w); @@ -64,4 +64,4 @@ auto StandardDistance::between(std::string v, std::string w) -> std::size_t { return set(key, 1 + min_distance); } -} // namespace liblevenshtein +} // namespace liblevenshtein::distance diff --git 
a/src/liblevenshtein/distance/standard_distance.h b/src/liblevenshtein/distance/standard_distance.h index 26a65e0..50b6293 100644 --- a/src/liblevenshtein/distance/standard_distance.h +++ b/src/liblevenshtein/distance/standard_distance.h @@ -5,13 +5,13 @@ #include "liblevenshtein/distance/memoized_distance.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { - class StandardDistance : public MemoizedDistance { - public: - auto between(std::string v, std::string w) -> std::size_t override; - }; +class StandardDistance : public MemoizedDistance { +public: + auto between(std::string v, std::string w) -> std::size_t override; +}; -} // namespace liblevenshtein +} // namespace liblevenshtein::distance #endif // LIBLEVENSHTEIN_DISTANCE_STANDARD_DISTANCE_H diff --git a/src/liblevenshtein/distance/symmetric_pair.cpp b/src/liblevenshtein/distance/symmetric_pair.cpp index 5f81dac..a186645 100644 --- a/src/liblevenshtein/distance/symmetric_pair.cpp +++ b/src/liblevenshtein/distance/symmetric_pair.cpp @@ -4,7 +4,7 @@ #include "liblevenshtein/distance/symmetric_pair.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { SymmetricPair::SymmetricPair(const std::string &first, const std::string &second) { if (first.compare(second) < 0) { @@ -30,10 +30,10 @@ auto operator<<(std::ostream &out, const SymmetricPair &pair) return out; } -} // namespace liblevenshtein +} // namespace liblevenshtein::distance -auto std::hash::operator()( - const liblevenshtein::SymmetricPair &pair) const -> std::size_t { +auto std::hash::operator()( + const liblevenshtein::distance::SymmetricPair &pair) const -> std::size_t { std::uint64_t hash_code = 0xDEADBEEF; hash_code = MurmurHash64A(pair.first.c_str(), (int) pair.first.length(), hash_code); return MurmurHash64A(pair.second.c_str(), (int) pair.second.length(), hash_code); diff --git a/src/liblevenshtein/distance/symmetric_pair.h b/src/liblevenshtein/distance/symmetric_pair.h index 9945fdf..c74f0a8 100644 --- 
a/src/liblevenshtein/distance/symmetric_pair.h +++ b/src/liblevenshtein/distance/symmetric_pair.h @@ -4,32 +4,31 @@ #include #include +namespace liblevenshtein::distance { -namespace liblevenshtein { +class SymmetricPair { +public: + SymmetricPair(const std::string &first, const std::string &second); + auto operator==(const SymmetricPair &other) const -> bool; + auto operator!=(const SymmetricPair &other) const -> bool; - class SymmetricPair { - public: - SymmetricPair(const std::string &first, const std::string &second); - auto operator==(const SymmetricPair &other) const -> bool; - auto operator!=(const SymmetricPair &other) const -> bool; + friend class std::hash; + friend auto operator<<(std::ostream &out, const SymmetricPair &pair) + -> std::ostream &; - friend class std::hash; - friend auto operator<<(std::ostream &out, const SymmetricPair &pair) - -> std::ostream &; - - private: - std::string first; - std::string second; - }; -} // namespace liblevenshtein +private: + std::string first; + std::string second; +}; +} // namespace liblevenshtein::distance namespace std { template <> - struct hash { - auto operator()(const liblevenshtein::SymmetricPair &pair) const - -> std::size_t; + struct hash { + auto operator()(const liblevenshtein::distance::SymmetricPair &pair) const + -> std::size_t; }; } // namespace std diff --git a/src/liblevenshtein/distance/transposition_distance.cpp b/src/liblevenshtein/distance/transposition_distance.cpp index 5bf7633..1ce68a9 100644 --- a/src/liblevenshtein/distance/transposition_distance.cpp +++ b/src/liblevenshtein/distance/transposition_distance.cpp @@ -1,8 +1,7 @@ #include "liblevenshtein/distance/symmetric_pair.h" #include "liblevenshtein/distance/transposition_distance.h" - -namespace liblevenshtein { +namespace liblevenshtein::distance { auto TranspositionDistance::between(std::string v, std::string w) -> std::size_t { @@ -82,4 +81,4 @@ auto TranspositionDistance::between(std::string v, std::string w) return set(key, 1 + 
min_distance); } -} // namespace liblevenshtein +} // namespace liblevenshtein::distance diff --git a/src/liblevenshtein/distance/transposition_distance.h b/src/liblevenshtein/distance/transposition_distance.h index 9544c14..a832d42 100644 --- a/src/liblevenshtein/distance/transposition_distance.h +++ b/src/liblevenshtein/distance/transposition_distance.h @@ -5,12 +5,13 @@ #include "liblevenshtein/distance/memoized_distance.h" -namespace liblevenshtein { +namespace liblevenshtein::distance { - class TranspositionDistance : public MemoizedDistance { - public: - auto between(std::string v, std::string w) -> std::size_t override; - }; -} // namespace liblevenshtein +class TranspositionDistance : public MemoizedDistance { +public: + auto between(std::string v, std::string w) -> std::size_t override; +}; + +} // namespace liblevenshtein::distance #endif // LIBLEVENSHTEIN_DISTANCE_TRANSPOSITION_DISTANCE_H diff --git a/test/liblevenshtein/distance/test_merge_and_split_distance.cpp b/test/liblevenshtein/distance/test_merge_and_split_distance.cpp index 262e4b1..29628a6 100644 --- a/test/liblevenshtein/distance/test_merge_and_split_distance.cpp +++ b/test/liblevenshtein/distance/test_merge_and_split_distance.cpp @@ -4,8 +4,9 @@ #include "liblevenshtein/distance/merge_and_split_distance.h" +namespace lld = liblevenshtein::distance; -static liblevenshtein::MergeAndSplitDistance d; +static lld::MergeAndSplitDistance d; RC_GTEST_PROP(MergeAndSplitDistance, satisfies_non_negativity, (const std::string &x, const std::string &y)) { diff --git a/test/liblevenshtein/distance/test_standard_distance.cpp b/test/liblevenshtein/distance/test_standard_distance.cpp index e0f1dee..a848c23 100644 --- a/test/liblevenshtein/distance/test_standard_distance.cpp +++ b/test/liblevenshtein/distance/test_standard_distance.cpp @@ -4,8 +4,9 @@ #include "liblevenshtein/distance/standard_distance.h" +namespace lld = liblevenshtein::distance; -static liblevenshtein::StandardDistance d; +static 
lld::StandardDistance d; RC_GTEST_PROP(StandardDistance, satisfies_non_negativity, (const std::string &x, const std::string &y)) { diff --git a/test/liblevenshtein/distance/test_symmetric_pair.cpp b/test/liblevenshtein/distance/test_symmetric_pair.cpp index 49ee0da..8c6d275 100644 --- a/test/liblevenshtein/distance/test_symmetric_pair.cpp +++ b/test/liblevenshtein/distance/test_symmetric_pair.cpp @@ -6,14 +6,16 @@ #include "liblevenshtein/distance/symmetric_pair.h" +namespace lld = liblevenshtein::distance; + RC_GTEST_PROP(SymmetricPair, is_comparable, (const std::string &foo, const std::string &bar, const std::string &baz)) { - std::hash hash; + std::hash hash; - liblevenshtein::SymmetricPair pair1(foo, bar); - liblevenshtein::SymmetricPair pair2(bar, foo); - liblevenshtein::SymmetricPair pair3(foo, baz); + lld::SymmetricPair pair1(foo, bar); + lld::SymmetricPair pair2(bar, foo); + lld::SymmetricPair pair3(foo, baz); RC_ASSERT(pair1 == pair1); RC_ASSERT(pair2 == pair2); diff --git a/test/liblevenshtein/distance/test_transposition_distance.cpp b/test/liblevenshtein/distance/test_transposition_distance.cpp index 013228f..7314c23 100644 --- a/test/liblevenshtein/distance/test_transposition_distance.cpp +++ b/test/liblevenshtein/distance/test_transposition_distance.cpp @@ -4,8 +4,9 @@ #include "liblevenshtein/distance/transposition_distance.h" +namespace lld = liblevenshtein::distance; -static liblevenshtein::TranspositionDistance d; +static lld::TranspositionDistance d; RC_GTEST_PROP(TranspositionDistance, satisfies_non_negativity, (const std::string &x, const std::string &y)) { diff --git a/test/liblevenshtein/transducer/test_transducer.cpp b/test/liblevenshtein/transducer/test_transducer.cpp index df0241b..f0fb7e2 100644 --- a/test/liblevenshtein/transducer/test_transducer.cpp +++ b/test/liblevenshtein/transducer/test_transducer.cpp @@ -18,17 +18,18 @@ #include "liblevenshtein/test/utils/comparators.h" namespace ll = liblevenshtein; +namespace lld = 
liblevenshtein::distance; namespace llt = liblevenshtein::test; -static ll::StandardDistance standard_distance; -static ll::TranspositionDistance transposition_distance; -static ll::MergeAndSplitDistance merge_and_split_distance; +static lld::StandardDistance standard_distance; +static lld::TranspositionDistance transposition_distance; +static lld::MergeAndSplitDistance merge_and_split_distance; template // NOLINTBEGIN(bugprone-easily-swappable-parameters) void test_transducer(const std::set &dictionary_terms, const std::set &query_terms, - std::size_t max_distance, ll::Distance &d) { + std::size_t max_distance, lld::Distance &d) { // NOLINTEND(bugprone-easily-swappable-parameters) std::unordered_map>