diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5020505..2efd4aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks -exclude: '^.subtrees/(jarowinkler|rapidfuzz)/' +exclude: '^.subtrees/rapidfuzz/' repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. diff --git a/.subtrees/jarowinkler/.github/FUNDING.yml b/.subtrees/jarowinkler/.github/FUNDING.yml deleted file mode 100644 index 1db8bdd..0000000 --- a/.subtrees/jarowinkler/.github/FUNDING.yml +++ /dev/null @@ -1 +0,0 @@ -github: maxbachmann diff --git a/.subtrees/jarowinkler/.github/workflows/branchbuild.yml b/.subtrees/jarowinkler/.github/workflows/branchbuild.yml deleted file mode 100644 index 22e24b7..0000000 --- a/.subtrees/jarowinkler/.github/workflows/branchbuild.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Build - -on: - push: - branches-ignore: - - main - -jobs: - tests: - name: "Python ${{ matrix.python-version }}" - runs-on: "ubuntu-latest" - strategy: - matrix: - python-version: ["3.6", "3.9", "3.10"] - os: [ubuntu-latest, windows-latest, macos-latest] - - steps: - - uses: "actions/checkout@v2" - with: - submodules: 'true' - - uses: "actions/setup-python@v2" - with: - python-version: "${{ matrix.python-version }}" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest hypothesis mypy - - - name: build - run: | - pip install build; python -m build --sdist - # test whether tarball contains all files required for compiling - pip install dist/jarowinkler-*.tar.gz - - - name: Test type stubs - run: | - python -m mypy.stubtest jarowinkler --ignore-missing-stub - - - name: Test with pytest - run: | - pytest tests diff --git a/.subtrees/jarowinkler/.github/workflows/releasebuild.yml b/.subtrees/jarowinkler/.github/workflows/releasebuild.yml deleted file mode 100644 index 77f1c44..0000000 --- a/.subtrees/jarowinkler/.github/workflows/releasebuild.yml +++ /dev/null @@ -1,223 +0,0 @@ -name: Build - -on: - push: - branches: - - main - release: - types: - - published - -jobs: - build_wheels_windows: - name: Build wheel on windows-latest/${{matrix.arch}}/${{matrix.python_tag}} - runs-on: windows-latest - strategy: - fail-fast: false - matrix: - arch: [auto32, auto64] - python_tag: ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*", "pp37-*", "pp38-*", "pp39-*"] - exclude: - # PyPy only supports x86_64 on Windows - - arch: auto32 - python_tag: "pp37-*" - - arch: auto32 - python_tag: "pp38-*" - - arch: auto32 - python_tag: "pp39-*" - - # PyPy Windows is currently broken in scikit-build - - arch: auto64 - python_tag: "pp37-*" - - arch: auto64 - python_tag: "pp38-*" - - arch: auto64 - python_tag: "pp39-*" - env: - CIBW_BUILD: ${{matrix.python_tag}} - CIBW_ARCHS: ${{matrix.arch}} - CIBW_TEST_REQUIRES: pytest hypothesis - CIBW_TEST_COMMAND: pytest {package}/tests - CIBW_BUILD_VERBOSITY: 3 - - steps: - - uses: actions/checkout@v2 - with: - submodules: 'true' - - - uses: actions/setup-python@v2 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.7.0 - with: - output-dir: wheelhouse - - - name: Upload wheels - uses: actions/upload-artifact@v2 - with: - path: ./wheelhouse/*.whl - - build_wheels_macos: - name: Build wheel on macos-latest/${{matrix.arch}}/${{matrix.python_tag}} - runs-on: macos-latest - strategy: - fail-fast: false - matrix: - arch: [x86_64, arm64, universal2] - python_tag: ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*", "pp37-*", "pp38-*", "pp39-*"] - exclude: - # MacOS Arm only supported since Python 3.8 - - arch: arm64 - python_tag: "cp36-*" - - arch: arm64 - python_tag: "cp37-*" - - arch: universal2 - python_tag: "cp36-*" - - arch: universal2 - python_tag: "cp37-*" - - # PyPy not supported on MacOS Arm - - arch: arm64 - python_tag: "pp37-*" - - arch: arm64 - python_tag: "pp38-*" - - arch: arm64 - python_tag: "pp39-*" - - arch: universal2 - python_tag: "pp37-*" - - arch: universal2 - python_tag: "pp38-*" - - arch: universal2 - python_tag: "pp39-*" - env: - CIBW_BUILD: ${{matrix.python_tag}} - CIBW_ARCHS: ${{matrix.arch}} - CIBW_TEST_SKIP: "*-macosx_{universal2,arm64}" - CIBW_TEST_REQUIRES: pytest hypothesis - CIBW_TEST_COMMAND: pytest {package}/tests - CIBW_BUILD_VERBOSITY: 3 - - steps: - - uses: actions/checkout@v2 - with: - submodules: 'true' - - - uses: actions/setup-python@v2 - - - name: Build wheels - uses: pypa/cibuildwheel@v2.7.0 - with: - output-dir: wheelhouse - - - name: Upload wheels - uses: actions/upload-artifact@v2 - with: - path: ./wheelhouse/*.whl - - build_wheels_linux: - name: Build wheels on ubuntu-latest/${{matrix.arch}}/${{matrix.python_tag}} - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - arch: [auto, aarch64, ppc64le, s390x] - python_tag: [ "cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*", "pp37-*", "pp38-*", "pp39-*"] - exclude: - # PyPy builds not available for these platforms - - arch: ppc64le - python_tag: "pp37-*" - - arch: ppc64le - python_tag: "pp38-*" - - arch: ppc64le - python_tag: "pp39-*" - - arch: s390x - python_tag: "pp37-*" - - arch: s390x - python_tag: "pp38-*" - - arch: s390x - python_tag: "pp39-*" - env: - CIBW_ARCHS_LINUX: ${{matrix.arch}} - CIBW_BUILD: ${{matrix.python_tag}} - CIBW_TEST_SKIP: "{*_{aarch64,ppc64le,s390x},*musllinux_*}" - CIBW_TEST_REQUIRES: pytest hypothesis - CIBW_TEST_COMMAND: pytest {package}/tests - CIBW_BUILD_VERBOSITY: 3 - - steps: - - uses: actions/checkout@v2 - with: - submodules: 'true' - - - uses: actions/setup-python@v2 - - - uses: docker/setup-qemu-action@v1 - name: Set up QEMU - - - name: Build wheel - uses: pypa/cibuildwheel@v2.7.0 - with: - output-dir: wheelhouse - - - name: Upload wheels - uses: actions/upload-artifact@v2 - with: - path: ./wheelhouse/*.whl - - build_sdist: - name: Build source distribution - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - submodules: 'true' - - - uses: actions/setup-python@v2 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest hypothesis mypy rapidfuzz_capi Cython==3.0.0a11 - - # The cythonized files allow installation from the sdist without cython - - name: Generate cython - run: | - chmod +x ./src/jarowinkler/generate.sh - ./src/jarowinkler/generate.sh - - - name: Build sdist - run: | - git apply ./tools/sdist.patch - pip install build; python -m build --sdist - # test whether tarball contains all files required for compiling - pip install dist/jarowinkler-*.tar.gz - - - name: Test type stubs - run: | - python -m mypy.stubtest jarowinkler --ignore-missing-stub - - - name: Test with pytest - run: | - pytest tests - python -m pytest tests - - - uses: actions/upload-artifact@v2 - with: - path: dist/*.tar.gz - - deploy-wheels: - if: github.event_name == 'release' && github.event.action == 'published' - needs: [build_wheels_windows, build_wheels_macos, build_wheels_linux, build_sdist] - name: deploy wheels to pypi - runs-on: ubuntu-18.04 - - steps: - - uses: actions/download-artifact@v2 - with: - name: artifact - path: dist - - - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} diff --git a/.subtrees/jarowinkler/.gitignore b/.subtrees/jarowinkler/.gitignore deleted file mode 100644 index 27d0b7a..0000000 --- a/.subtrees/jarowinkler/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -.vscode/ -__pycache__/ -.idea/ -build/ -_skbuild/ -*.egg-info/ -dist/ -*.data -*.so -*.o -*.out -test.py -src/*.html -.coverage -coverage.xml -.venv/ -.coveragerc - -# Sphinx documentation -site/ - -# benchmark results -bench_results/ - -# Hypothesis results -.hypothesis/ - -# Cython -*.cxx - diff --git a/.subtrees/jarowinkler/.gitmodules b/.subtrees/jarowinkler/.gitmodules deleted file mode 100644 index ae1cad3..0000000 --- a/.subtrees/jarowinkler/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "extern/jarowinkler-cpp"] - path = extern/jarowinkler-cpp - url = https://github.com/maxbachmann/jarowinkler-cpp.git diff --git a/.subtrees/jarowinkler/CHANGELOG.md b/.subtrees/jarowinkler/CHANGELOG.md deleted file mode 100644 index 9308c04..0000000 --- a/.subtrees/jarowinkler/CHANGELOG.md +++ /dev/null @@ -1,49 +0,0 @@ -## Changelog - -### [1.2.0] - 2022-07-19 -#### Changed -- added in-tree build backend to install cmake and ninja only when it is not installed yet - and only when wheels are available - -### [1.1.2] - 2022-07-11 -#### Fixed -- remove incorrect module import - -### [1.1.1] - 2022-07-09 -#### Fixed -- fix missing type stubs - -### [1.1.0] - 2022-07-04 -#### Changed -- change src layout to make package import from root directory possible -- added pure python fallback for all implementations with the following exceptions: - - no support for sequences of hashables. Only strings supported so far - -#### Fixed -- fixed type hints of jarowinkler_similarity - -### [1.0.5] - 2022-06-29 -#### Fixed -- treat hash for -1 and -2 as different - -### [1.0.4] - 2022-06-23 -#### Changed -- add fallback implementations of `jarowinkler-cpp` back to wheel, - since some package building systems like piwheels can't clone sources - -## [1.0.3] - 2022-06-11 -#### Added -- add wheels for PyPy3.9 -- added tests to sdist - -#### Changed -- Allow installation from system installed version of jarowinkler-cpp -- use system version of cmake on arm platforms, since the cmake package fails to compile - -## [1.0.2] - 2022-03-13 -#### Fixed -- only depend on cython when it is actually required - -## [1.0.1] - 2022-03-06 -#### Fixed -- type hints are now correctly packaged in the wheels diff --git a/.subtrees/jarowinkler/CMakeLists.txt b/.subtrees/jarowinkler/CMakeLists.txt deleted file mode 100644 index 709c81f..0000000 --- a/.subtrees/jarowinkler/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -cmake_minimum_required(VERSION 3.12.0) - -cmake_policy(SET CMP0054 NEW) -set(SKBUILD_LINK_LIBRARIES_KEYWORD PRIVATE) - -set(THREADS_PREFER_PTHREAD_FLAG ON) -if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - set(CMAKE_OSX_DEPLOYMENT_TARGET "10.9" CACHE STRING "Minimum OS X deployment version") -endif() - -project(jarowinkler LANGUAGES C CXX) - -find_package(PythonExtensions REQUIRED) -find_package(Python COMPONENTS Interpreter Development) -include(FetchContent) - -set(JW_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) - -find_package(jaro_winkler 1.0.2 QUIET) -if (jaro_winkler_FOUND) - message("Using system supplied version of jaro_winkler") -else() - message("Using packaged version of jaro_winkler") - add_subdirectory(extern/jarowinkler-cpp) -endif() - -add_subdirectory(src/jarowinkler) diff --git a/.subtrees/jarowinkler/LICENSE b/.subtrees/jarowinkler/LICENSE deleted file mode 100644 index 5b55a39..0000000 --- a/.subtrees/jarowinkler/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -Copyright © 2020 maxbachmann -Copyright © 2011 Adam Cohen - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/.subtrees/jarowinkler/MANIFEST.in b/.subtrees/jarowinkler/MANIFEST.in deleted file mode 100644 index 10c95b2..0000000 --- a/.subtrees/jarowinkler/MANIFEST.in +++ /dev/null @@ -1,16 +0,0 @@ -include MANIFEST.in -include setup.py -include CMakeLists.txt -include README.md -include LICENSE -include pyproject.toml -include _custom_build/backend.py -include src/jarowinkler/py.typed - -recursive-include src/jarowinkler CMakeLists.txt -recursive-include src/jarowinkler *.hpp *.pyx *.pxd *.cxx *.pyi -recursive-include tests * - -include extern/jarowinkler-cpp/LICENSE -include extern/jarowinkler-cpp/CMakeLists.txt -recursive-include extern/jarowinkler-cpp/jaro_winkler *.hpp diff --git a/.subtrees/jarowinkler/README.md b/.subtrees/jarowinkler/README.md deleted file mode 100644 index c4ffae1..0000000 --- a/.subtrees/jarowinkler/README.md +++ /dev/null @@ -1,117 +0,0 @@ - -

- JaroWinkler -

-

- - Continous Integration - - - PyPI package version - - - Python versions -
- - GitHub license - -

- -

JaroWinkler is a library to calculate the Jaro and Jaro-Winkler similarity. It is easy to use, is far more performant than all alternatives and is designed to integrate seemingless with RapidFuzz.

- - - -## :zap: Quickstart -```python ->>> from jarowinkler import * - ->>> jaro_similarity("Johnathan", "Jonathan") -0.8796296296296297 - ->>> jarowinkler_similarity("Johnathan", "Jonathan") -0.9037037037037037 -``` - -## 🚀 Benchmarks -The implementation is based on a novel approach to calculate the Jaro-Winkler similarity using bitparallelism. This is significantly faster than the original approach used in other libraries. The following benchmark shows the performance difference to jellyfish and python-Levenshtein. - -

-Benchmark JaroWinkler -

- -## ⚙️ Installation - -You can install this library from [PyPI](https://pypi.org/project/jarowinkler/) with pip: -``` -pip install jarowinkler -``` -JaroWinkler provides binary wheels for all common platforms. - -### Source builds - -For a source build (for example from a SDist packaged) you only require a C++14 compatible compiler. You can install directly from GitHub if you would like. -``` -pip install git+https://github.com/maxbachmann/JaroWinkler.git@main -``` - -## 📖 Usage - -Any algorithms in JaroWinkler can not only be used with strings, but with any arbitary sequences of hashable objects: -```python -from jarowinkler import jarowinkler_similarity - - -jarowinkler_similarity("this is an example".split(), ["this", "is", "a", "example"]) -# 0.8666666666666667 -``` - -So as long as two objects have the same hash they are treated as similar. You can provide a `__hash__` method for your own object instances. - -```python -class MyObject: - def __init__(self, hash): - self.hash = hash - - def __hash__(self): - return self.hash - -jarowinkler_similarity([MyObject(1), MyObject(2)], [MyObject(1), MyObject(2), MyObject(3)]) -# 0.9111111111111111 -``` - -All algorithms provide a `score_cutoff` parameter. This parameter can be used to filter out bad matches. Internally this allows JaroWinkler to select faster implementations in some places: - -```python -jaro_similarity("Johnathan", "Jonathan", score_cutoff=0.9) -# 0.0 - -jaro_similarity("Johnathan", "Jonathan", score_cutoff=0.85) -# 0.8796296296296297 -``` - -JaroWinkler can be used with RapidFuzz, which provides multiple methods to compute string metrics on collections of inputs. JaroWinkler implements the RapidFuzz C-API which allows RapidFuzz to call the functions without any of the usual overhead of python, which makes this even faster. - -```python -from rapidfuzz import process - -process.cdist(["Johnathan", "Jonathan"], ["Johnathan", "Jonathan"], scorer=jarowinkler_similarity) -array([[1. , 0.9037037], - [0.9037037, 1. ]], dtype=float32) -``` - -## 👍 Contributing - -PRs are welcome! -- Found a bug? Report it in form of an [issue](https://github.com/maxbachmann/JaroWinkler/issues) or even better fix it! -- Can make something faster? Great! Just avoid external dependencies and remember that existing functionality should still work. -- Something else that do you think is good? Do it! Just make sure that CI passes and everything from the README is still applicable (interface, features, and so on). -- Have no time to code? Tell your friends and subscribers about JaroWinkler. More users, more contributions, more amazing features. - -Thank you :heart: - -## ⚠️ License -Copyright 2021 - present [maxbachmann](https://github.com/maxbachmann). `JaroWinkler` is free and open-source software licensed under the [MIT License](https://github.com/maxbachmann/JaroWinkler/blob/main/LICENSE). diff --git a/.subtrees/jarowinkler/_custom_build/backend.py b/.subtrees/jarowinkler/_custom_build/backend.py deleted file mode 100644 index 5cd1ba5..0000000 --- a/.subtrees/jarowinkler/_custom_build/backend.py +++ /dev/null @@ -1,90 +0,0 @@ -from setuptools import build_meta as _orig -from packaging import version as _version -from packaging.tags import sys_tags as _sys_tags -from skbuild.exceptions import SKBuildError as _SKBuildError -from skbuild.cmaker import get_cmake_version as _get_cmake_version -import subprocess as _subprocess -import platform as _platform - -prepare_metadata_for_build_wheel = _orig.prepare_metadata_for_build_wheel -build_wheel = _orig.build_wheel -build_sdist = _orig.build_sdist -get_requires_for_build_sdist = _orig.get_requires_for_build_sdist - -cmake_wheels = { - "win_amd64", - "win32", - "musllinux_1_1_x86_64", - "musllinux_1_1_s390x", - "musllinux_1_1_ppc64le", - "musllinux_1_1_i686", - "musllinux_1_1_aarch64", - "manylinux_2_17_s390x", - "manylinux_2_17_ppc64le", - "manylinux_2_17_aarch64", - "manylinux_2_17_x86_64", - "manylinux_2_17_i686", - "manylinux_2_5_x86_64", - "manylinux_2_5_i686", - "macosx_10_10_universal2", -} - -ninja_wheels = { - "win_amd64", - "win32.whl", - "musllinux_1_1_x86_64", - "musllinux_1_1_s390x", - "musllinux_1_1_ppc64le", - "musllinux_1_1_i686", - "musllinux_1_1_aarch64", - "manylinux_2_17_s390x", - "manylinux_2_17_ppc64le", - "manylinux_2_17_aarch64", - "manylinux_2_5_x86_64", - "manylinux_2_5_i686", - "macosx_10_9_universal2", -} - -def _cmake_required(): - try: - if _version.parse(_get_cmake_version()) >= _version.parse("3.12"): - print("Using System version of cmake") - return False - except _SKBuildError: - pass - - for tag in _sys_tags(): - if tag.platform in cmake_wheels: - return True - - print("No Cmake wheel available on platform") - return False - -def _ninja_required(): - if _platform.system() == "Windows": - print("Ninja is part of the MSVC installation on Windows") - return False - - for generator in ("ninja", "make"): - try: - _subprocess.check_output([generator, '--version']) - print(f"Using System version of {generator}") - return False - except (OSError, _subprocess.CalledProcessError): - pass - - for tag in _sys_tags(): - if tag.platform in ninja_wheels: - return True - - print("No Ninja wheel available on platform") - return False - -def get_requires_for_build_wheel(self, config_settings=None): - packages = [] - if _cmake_required(): - packages.append('cmake') - if _ninja_required(): - packages.append('ninja') - - return _orig.get_requires_for_build_wheel(config_settings) + packages diff --git a/.subtrees/jarowinkler/bench/benchmark_jaro_winkler.py b/.subtrees/jarowinkler/bench/benchmark_jaro_winkler.py deleted file mode 100644 index 56907d6..0000000 --- a/.subtrees/jarowinkler/bench/benchmark_jaro_winkler.py +++ /dev/null @@ -1,52 +0,0 @@ -# todo combine benchmarks of scorers into common code base -import timeit -import pandas -import numpy as np - -def benchmark(name, func, setup, lengths, count): - print(f"starting {name}") - start = timeit.default_timer() - results = [] - for length in lengths: - test = timeit.Timer(func, setup=setup.format(length, count)) - results.append(min(test.timeit(number=1) for _ in range(7)) / count) - stop = timeit.default_timer() - print(f"finished {name}, Runtime: ", stop - start) - return results - -setup =""" -from jarowinkler import JaroWinkler -import jellyfish -import Levenshtein -import string -import random -random.seed(18) -characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation -a = ''.join(random.choice(characters) for _ in range({0})) -b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})] -""" - -lengths = list(range(1,512,4)) -count = 4000 - -time_jarowinkler = benchmark("jarowinkler", - '[JaroWinkler.similarity(a, b) for b in b_list]', - setup, lengths, count) - -# this gets very slow, so only benchmark it for smaller values -time_jellyfish = benchmark("jellyfish", - '[jellyfish.jaro_winkler(a, b) for b in b_list]', - setup, list(range(1,128,4)), count) + [np.NaN] * 96 - -time_python_levenshtein = benchmark("python-Levenshtein", - '[Levenshtein.jaro_winkler(a, b) for b in b_list]', - setup, list(range(1,256,4)), count) + [np.NaN] * 64 - -df = pandas.DataFrame(data={ - "length": lengths, - "jarowinkler": time_jarowinkler, - "jellyfish": time_jellyfish, - "python-Levenshtein": time_python_levenshtein -}) - -df.to_csv("results/jaro_winkler.csv", sep=',',index=False) diff --git a/.subtrees/jarowinkler/bench/benchmark_visualize.py b/.subtrees/jarowinkler/bench/benchmark_visualize.py deleted file mode 100644 index 089d2e6..0000000 --- a/.subtrees/jarowinkler/bench/benchmark_visualize.py +++ /dev/null @@ -1,22 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt - -df=pd.read_csv("results/jaro_winkler.csv") - -df *= 1000 * 1000 -df["length"] /= 1000 * 1000 - - -ax=df.plot(x="length") - -plt.xticks(list(range(0, 513, 64))) - -plt.title("Performance comparision of the \nJaro-Winkler similarity in different libraries") -plt.xlabel("string length [in characters]") -plt.ylabel("runtime [μs]") -ax.set_xlim(xmin=0) -ax.set_ylim(bottom=0) -plt.grid() -plt.show() - - diff --git a/.subtrees/jarowinkler/bench/results/JaroWinkler.svg b/.subtrees/jarowinkler/bench/results/JaroWinkler.svg deleted file mode 100644 index 1b840f2..0000000 --- a/.subtrees/jarowinkler/bench/results/JaroWinkler.svg +++ /dev/null @@ -1,1587 +0,0 @@ - - - - - - - - 2022-01-08T13:47:59.158581 - image/svg+xml - - - Matplotlib v3.5.1, https://matplotlib.org/ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/.subtrees/jarowinkler/bench/results/jaro_winkler.csv b/.subtrees/jarowinkler/bench/results/jaro_winkler.csv deleted file mode 100644 index bb53364..0000000 --- a/.subtrees/jarowinkler/bench/results/jaro_winkler.csv +++ /dev/null @@ -1,129 +0,0 @@ -length,jarowinkler,jellyfish,python-Levenshtein -1,1.0242649997849184e-07,2.1970424999722126e-06,1.3773950001905178e-07 -5,1.604507500019281e-07,2.2241952499939545e-06,1.7559025002356065e-07 -9,1.723850000132643e-07,2.262624000024971e-06,2.1795374999555862e-07 -13,1.8368225002518557e-07,2.374377749958967e-06,2.877064999893264e-07 -17,1.924025000050733e-07,2.5461917500138043e-06,3.8818949997221353e-07 -21,2.0168775000684037e-07,2.744041250025475e-06,5.088235000130226e-07 -25,2.1474375000707368e-07,3.0330870000057074e-06,6.272479999438474e-07 -29,2.2143399999663415e-07,3.3364647500206955e-06,7.628200000340257e-07 -33,2.312827500077219e-07,3.6793797499967697e-06,9.097077499973238e-07 -37,2.425382499779971e-07,4.034564999983559e-06,1.1261084999887317e-06 -41,2.527402500049902e-07,4.3864649999818535e-06,1.257153750032103e-06 -45,2.6626899997950205e-07,4.817342999956509e-06,1.474185999995825e-06 -49,2.7837799999019806e-07,5.273319750017435e-06,1.6941224999982296e-06 -53,2.916282500109446e-07,5.771962249980334e-06,1.9098797499736977e-06 -57,3.0220974997519077e-07,6.267949749997115e-06,2.1186800000236873e-06 -61,3.139089999990574e-07,6.878165500040723e-06,2.3666472499712653e-06 -65,9.577017499964314e-07,7.4521482500244925e-06,2.59658500004889e-06 -69,1.02525724997804e-06,8.081028749984397e-06,2.834802999984731e-06 -73,1.0829539999974712e-06,8.788844749972213e-06,3.129796999985501e-06 -77,1.2351502499825528e-06,9.51282099998707e-06,3.3736514999986865e-06 -81,1.2876245000086329e-06,1.024160924998796e-05,3.654816749985912e-06 -85,1.3457082499996887e-06,1.1041964499952429e-05,3.954588499993861e-06 -89,1.4153912499921263e-06,1.1803611000004821e-05,4.2408792500054916e-06 -93,1.45458750000671e-06,1.2555690750048142e-05,4.554326999993919e-06 -97,1.5235607500017068e-06,1.3453278249983214e-05,4.865760500024408e-06 -101,1.5784222499917177e-06,1.4144099250017917e-05,5.180090250007652e-06 -105,1.6366962500171667e-06,1.5205634249980448e-05,5.503461500040885e-06 -109,1.71132099998772e-06,1.5863534250001975e-05,5.833295749994249e-06 -113,1.783151500006852e-06,1.6970079249972512e-05,6.203173500011871e-06 -117,1.8469757499985918e-06,1.781968174998383e-05,6.561579749984503e-06 -121,1.8918727500079059e-06,1.84527600000024e-05,6.9218770000247784e-06 -125,1.9274875000121483e-06,1.9624498999974093e-05,7.310718749977241e-06 -129,2.2159554999916508e-06,,7.5819754999884025e-06 -133,2.2954330000004573e-06,,7.969897750001564e-06 -137,2.3665685000082704e-06,,8.366741500026364e-06 -141,2.4165595000056327e-06,,8.765191750001123e-06 -145,2.4681579999992208e-06,,9.184628750006141e-06 -149,2.5228457500077185e-06,,9.602934249983264e-06 -153,2.5684847499860553e-06,,1.0019458500039491e-05 -157,2.6178302499886286e-06,,1.0451530250009e-05 -161,2.6563587499879304e-06,,1.0880893499972898e-05 -165,2.708121250009299e-06,,1.1332152499960557e-05 -169,2.758271999994122e-06,,1.1775580749997517e-05 -173,2.8185682500065923e-06,,1.2239040999986628e-05 -177,2.860736249999718e-06,,1.2703667250036687e-05 -181,2.9110177500228927e-06,,1.3161808250004014e-05 -185,2.9579790000013874e-06,,1.3631287750001775e-05 -189,3.0070002499940073e-06,,1.4119006749979234e-05 -193,3.2074334999947496e-06,,1.4624213249987861e-05 -197,3.290538500010598e-06,,1.5134493499999735e-05 -201,3.3515060000013364e-06,,1.5658886499977598e-05 -205,3.412287250000645e-06,,1.6156184999999822e-05 -209,3.459391749998986e-06,,1.668725200005383e-05 -213,3.50881224997579e-06,,1.7207881249987622e-05 -217,3.5567982500026573e-06,,1.7732209249970764e-05 -221,3.591963499985695e-06,,1.82662612500053e-05 -225,3.6219195000057878e-06,,1.88169255000048e-05 -229,3.662653750012623e-06,,1.9391362500016384e-05 -233,3.703763749996369e-06,,1.995370349999348e-05 -237,3.755599750007832e-06,,2.05113547499991e-05 -241,3.767430000010563e-06,,2.1101369500001966e-05 -245,3.8017152500060546e-06,,2.1704576999979962e-05 -249,3.850303250004572e-06,,2.2287643250024303e-05 -253,3.896969250007487e-06,,2.2908058750033435e-05 -257,4.140366999990874e-06,, -261,4.22011474998385e-06,, -265,4.2900737500133345e-06,, -269,4.329841250012123e-06,, -273,4.39095650000354e-06,, -277,4.449307999976781e-06,, -281,4.502229749988373e-06,, -285,4.560061000006499e-06,, -289,4.669512500015571e-06,, -293,4.663764999975229e-06,, -297,4.724741250015541e-06,, -301,4.7662102500112265e-06,, -305,4.83254349995832e-06,, -309,4.873275249963171e-06,, -313,4.932502999963617e-06,, -317,4.978358999949251e-06,, -321,5.282012499947086e-06,, -325,5.36726349997707e-06,, -329,5.430304250012341e-06,, -333,5.472211750031874e-06,, -337,5.5281757499869855e-06,, -341,5.582512750038404e-06,, -345,5.626182500009236e-06,, -349,5.66335949997665e-06,, -353,5.646911749977335e-06,, -357,5.685996249951586e-06,, -361,5.725557999994635e-06,, -365,5.744895250018089e-06,, -369,5.78226274996041e-06,, -373,5.792189250030333e-06,, -377,5.839759249965937e-06,, -381,5.896312250001756e-06,, -385,6.1217295000233206e-06,, -389,6.182883750000201e-06,, -393,6.2012852499719885e-06,, -397,6.2434312499703995e-06,, -401,6.280881999998655e-06,, -405,6.341878749992702e-06,, -409,6.389777749973291e-06,, -413,6.441894000033699e-06,, -417,6.478261499978544e-06,, -421,6.519561999994039e-06,, -425,6.581959500010726e-06,, -429,6.62881125003878e-06,, -433,6.693353749994912e-06,, -437,6.717007500014915e-06,, -441,6.778519750014311e-06,, -445,6.792850250008087e-06,, -449,7.10868499999151e-06,, -453,7.1913857499907865e-06,, -457,7.262821499978145e-06,, -461,7.324349999976221e-06,, -465,7.3540757499586105e-06,, -469,7.43951474998994e-06,, -473,7.4477272499962055e-06,, -477,7.495162249995246e-06,, -481,7.522579749945635e-06,, -485,7.55803875000538e-06,, -489,7.57794699995884e-06,, -493,7.66633949996276e-06,, -497,7.68676900003129e-06,, -501,7.716344000016307e-06,, -505,7.767418250011816e-06,, -509,7.779927999990831e-06,, diff --git a/.subtrees/jarowinkler/extern/jarowinkler-cpp b/.subtrees/jarowinkler/extern/jarowinkler-cpp deleted file mode 160000 index 655c259..0000000 --- a/.subtrees/jarowinkler/extern/jarowinkler-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 655c25926250f6a73a9380a862a603c70e016aec diff --git a/.subtrees/jarowinkler/pyproject.toml b/.subtrees/jarowinkler/pyproject.toml deleted file mode 100644 index 2ea8ca1..0000000 --- a/.subtrees/jarowinkler/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[build-system] -requires = [ - "setuptools>=42", - "scikit-build>=0.13.0", - "Cython==3.0.0a11", - "rapidfuzz_capi==1.0.5" -] -build-backend = "backend" -backend-path = ["_custom_build"] diff --git a/.subtrees/jarowinkler/setup.py b/.subtrees/jarowinkler/setup.py deleted file mode 100644 index 45d2009..0000000 --- a/.subtrees/jarowinkler/setup.py +++ /dev/null @@ -1,77 +0,0 @@ -import os - -def show_message(*lines): - print("=" * 74) - for line in lines: - print(line) - print("=" * 74) - -with open('README.md', 'rt', encoding="utf8") as f: - readme = f.read() - -setup_args = { - "name": "jarowinkler", - "version": "1.2.0", - "url": "https://github.com/maxbachmann/JaroWinkler", - "author": "Max Bachmann", - "author_email": "pypi@maxbachmann.de", - "description": "library for fast approximate string matching using Jaro and Jaro-Winkler similarity", - "long_description": readme, - "long_description_content_type": "text/markdown", - - "license": "MIT", - "classifiers": [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "License :: OSI Approved :: MIT License" - ], - - "packages": ["jarowinkler"], - "package_dir": {'': 'src'}, - "package_data": {"jarowinkler": ["*.pyi", "py.typed"]}, - "python_requires": ">=3.6" -} - -def run_setup(with_binary): - if with_binary: - from skbuild import setup - import rapidfuzz_capi - - setup( - **setup_args, - cmake_args=[ - f'-DRF_CAPI_PATH:STRING={rapidfuzz_capi.get_include()}' - ] - ) - else: - from setuptools import setup - setup(**setup_args) - -# when packaging only build wheels which include the C extension -packaging = "1" in { - os.environ.get("CIBUILDWHEEL", "0"), - os.environ.get("CONDA_BUILD", "0"), - os.environ.get("JAROWINKLER_BUILD_EXTENSION", "0") -} -if packaging: - run_setup(True) -else: - try: - run_setup(True) - except: - show_message( - "WARNING: The C extension could not be compiled, speedups" - " are not enabled.", - "Failure information, if any, is above.", - "Retrying the build without the C extension now.", - ) - run_setup(False) - show_message( - "WARNING: The C extension could not be compiled, speedups" - " are not enabled.", - "Plain-Python build succeeded.", - ) diff --git a/.subtrees/jarowinkler/src/jarowinkler/CMakeLists.txt b/.subtrees/jarowinkler/src/jarowinkler/CMakeLists.txt deleted file mode 100644 index 8caf71a..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -function(create_cython_target _name) - if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_name}.cxx) - set(${_name} ${CMAKE_CURRENT_LIST_DIR}/${_name}.cxx PARENT_SCOPE) - else() - find_package(Cython REQUIRED) - # should use target_include_directories once this is supported by scikit-build - include_directories(${JW_BASE_DIR}/jarowinkler) - add_cython_target(${_name} CXX) - set(${_name} ${_name} PARENT_SCOPE) - endif() -endfunction(create_cython_target) - -create_cython_target(_initialize_cpp) -add_library(_initialize_cpp MODULE ${_initialize_cpp}) -target_compile_features(_initialize_cpp PUBLIC cxx_std_14) -target_include_directories(_initialize_cpp PRIVATE ${RF_CAPI_PATH} ${JW_BASE_DIR}/jarowinkler) -target_link_libraries(_initialize_cpp PRIVATE jaro_winkler::jaro_winkler) -python_extension_module(_initialize_cpp) -install(TARGETS _initialize_cpp LIBRARY DESTINATION src/jarowinkler) diff --git a/.subtrees/jarowinkler/src/jarowinkler/__init__.py b/.subtrees/jarowinkler/src/jarowinkler/__init__.py deleted file mode 100644 index de157f6..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -__author__: str = "Max Bachmann" -__license__: str = "MIT" -__version__: str = "1.2.0" - -def _fallback_import(module: str, name: str): - import importlib - import os - - impl = os.environ.get("JAROWINKLER_IMPLEMENTATION") - - if impl == "cpp": - mod = importlib.import_module(module + "_cpp") - elif impl == "python": - mod = importlib.import_module(module + "_py") - else: - try: - mod = importlib.import_module(module + "_cpp") - except ModuleNotFoundError: - mod = importlib.import_module(module + "_py") - - func = getattr(mod, name) - if not func: - raise ImportError( - f"cannot import name '{name}' from '{mod.__name}' ({mod.__file__})" - ) - return func - -jaro_similarity = _fallback_import("jarowinkler._initialize", "jaro_similarity") -jarowinkler_similarity = _fallback_import("jarowinkler._initialize", "jarowinkler_similarity") \ No newline at end of file diff --git a/.subtrees/jarowinkler/src/jarowinkler/__init__.pyi b/.subtrees/jarowinkler/src/jarowinkler/__init__.pyi deleted file mode 100644 index a9a2ccf..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/__init__.pyi +++ /dev/null @@ -1,20 +0,0 @@ -from typing import Callable, Hashable, Sequence, Optional, Union, TypeVar - -__author__: str -__license__: str -__version__: str - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -def jaro_similarity( - s1: _S1, s2: _S2, *, - processor: Optional[Callable[[Union[_S1, _S2]], _StringType]] = None, - score_cutoff: Optional[float] = 0) -> float: ... - -def jarowinkler_similarity( - s1: _S1, s2: _S2, *, - prefix_weight: float = 0.1, - processor: Optional[Callable[[Union[_S1, _S2]], _StringType]] = None, - score_cutoff: Optional[float] = 0) -> float: ... diff --git a/.subtrees/jarowinkler/src/jarowinkler/_initialize_cpp.pyx b/.subtrees/jarowinkler/src/jarowinkler/_initialize_cpp.pyx deleted file mode 100644 index 013ede5..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/_initialize_cpp.pyx +++ /dev/null @@ -1,165 +0,0 @@ -# distutils: language=c++ -# cython: language_level=3, binding=True, linetrace=True - -from array import array - -from rapidfuzz_capi cimport ( - RF_String, RF_Scorer, RF_Kwargs, RF_ScorerFunc, RF_Preprocess, RF_KwargsInit, - SCORER_STRUCT_VERSION, RF_Preprocessor, - RF_ScorerFlags, - RF_SCORER_FLAG_RESULT_F64, RF_SCORER_FLAG_SYMMETRIC -) -from common cimport RF_StringWrapper, conv_sequence - -from libcpp cimport bool -from libc.stdint cimport int64_t -from libc.stdlib cimport malloc, free -from cpython.pycapsule cimport PyCapsule_New, PyCapsule_IsValid, PyCapsule_GetPointer -from cython.operator cimport dereference - -cdef extern from "scorer.hpp": - double jaro_similarity_func( const RF_String&, const RF_String&, double) nogil except + - bool JaroSimilarityInit( RF_ScorerFunc*, const RF_Kwargs*, int64_t, const RF_String*) nogil except False - double jaro_winkler_similarity_func(const RF_String &, const RF_String &, double, double) nogil except + - bool JaroWinklerSimilarityInit(RF_ScorerFunc *, const RF_Kwargs *, int64_t, const RF_String *) nogil except False - -cdef inline void preprocess_strings(s1, s2, processor, RF_StringWrapper* s1_proc, RF_StringWrapper* s2_proc) except *: - cdef RF_Preprocessor* preprocess_context = NULL - - if processor is None: - s1_proc[0] = RF_StringWrapper(conv_sequence(s1)) - s2_proc[0] = RF_StringWrapper(conv_sequence(s2)) - else: - processor_capsule = getattr(processor, '_RF_Preprocess', processor) - if PyCapsule_IsValid(processor_capsule, NULL): - preprocess_context = PyCapsule_GetPointer(processor_capsule, NULL) - - if preprocess_context != NULL and preprocess_context.version == 1: - preprocess_context.preprocess(s1, &(s1_proc[0].string)) - preprocess_context.preprocess(s2, &(s2_proc[0].string)) - else: - s1 = processor(s1) - s1_proc[0] = RF_StringWrapper(conv_sequence(s1), s1) - s2 = processor(s2) - s2_proc[0] = RF_StringWrapper(conv_sequence(s2), s2) - -def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None): - """ - Calculates the jaro similarity - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : float, optional - Optional argument for a score threshold as a float between 0 and 1.0. - For ratio < score_cutoff 0 is returned instead. Default is 0, - which deactivates this behaviour. - - Returns - ------- - similarity : float - similarity between s1 and s2 as a float between 0 and 1.0 - - """ - cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff - cdef RF_StringWrapper s1_proc, s2_proc - - if s1 is None or s2 is None: - return 0 - - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc) - return jaro_similarity_func(s1_proc.string, s2_proc.string, c_score_cutoff) - -cdef bool NoKwargsInit(RF_Kwargs* self, dict kwargs) except False: - if len(kwargs): - raise TypeError("Got unexpected keyword arguments: ", ", ".join(kwargs.keys())) - - dereference(self).context = NULL - dereference(self).dtor = NULL - return True - -cdef bool GetScorerFlagsJaroSimilarity(const RF_Kwargs* self, RF_ScorerFlags* scorer_flags) nogil except False: - dereference(scorer_flags).flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC - dereference(scorer_flags).optimal_score.f64 = 1.0 - dereference(scorer_flags).worst_score.f64 = 0 - return True - -cdef RF_Scorer JaroSimilarityContext -JaroSimilarityContext.version = SCORER_STRUCT_VERSION -JaroSimilarityContext.kwargs_init = NoKwargsInit -JaroSimilarityContext.get_scorer_flags = GetScorerFlagsJaroSimilarity -JaroSimilarityContext.scorer_func_init = JaroSimilarityInit -jaro_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NULL, NULL) - -def jarowinkler_similarity(s1, s2, *, double prefix_weight=0.1, processor=None, score_cutoff=None): - """ - Calculates the jaro winkler similarity - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - prefix_weight : float, optional - Weight used for the common prefix of the two strings. - Has to be between 0 and 0.25. Default is 0.1. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : float, optional - Optional argument for a score threshold as a float between 0 and 1.0. - For ratio < score_cutoff 0 is returned instead. Default is 0, - which deactivates this behaviour. - - Returns - ------- - similarity : float - similarity between s1 and s2 as a float between 0 and 1.0 - - Raises - ------ - ValueError - If prefix_weight is invalid - """ - cdef double c_score_cutoff = 0.0 if score_cutoff is None else score_cutoff - cdef RF_StringWrapper s1_proc, s2_proc - - if s1 is None or s2 is None: - return 0 - - preprocess_strings(s1, s2, processor, &s1_proc, &s2_proc) - return jaro_winkler_similarity_func(s1_proc.string, s2_proc.string, prefix_weight, c_score_cutoff) - -cdef void KwargsDeinit(RF_Kwargs * self): - free( dereference(self).context) - -cdef bool JaroWinklerKwargsInit(RF_Kwargs * self, dict kwargs) except False: - cdef double * prefix_weight = malloc(sizeof(double)) - - if not prefix_weight: - raise MemoryError - - prefix_weight[0] = kwargs.get("prefix_weight", 0.1) - dereference(self).context = prefix_weight - dereference(self).dtor = KwargsDeinit - return True - -cdef bool GetScorerFlagsJaroWinklerSimilarity(const RF_Kwargs * self, RF_ScorerFlags * scorer_flags) nogil except False: - dereference(scorer_flags).flags = RF_SCORER_FLAG_RESULT_F64 | RF_SCORER_FLAG_SYMMETRIC - dereference(scorer_flags).optimal_score.f64 = 1.0 - dereference(scorer_flags).worst_score.f64 = 0 - return True - -cdef RF_Scorer JaroWinklerSimilarityContext -JaroWinklerSimilarityContext.version = SCORER_STRUCT_VERSION -JaroWinklerSimilarityContext.kwargs_init = JaroWinklerKwargsInit -JaroWinklerSimilarityContext.get_scorer_flags = GetScorerFlagsJaroWinklerSimilarity -JaroWinklerSimilarityContext.scorer_func_init = JaroWinklerSimilarityInit -jarowinkler_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerSimilarityContext, NULL, NULL) diff --git a/.subtrees/jarowinkler/src/jarowinkler/_initialize_py.py b/.subtrees/jarowinkler/src/jarowinkler/_initialize_py.py deleted file mode 100644 index 235d591..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/_initialize_py.py +++ /dev/null @@ -1,196 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2022 Max Bachmann - -def _jaro_calculate_similarity(P_len: int, T_len: int, CommonChars: int, Transpositions: int) -> float: - Transpositions //= 2 - Sim = 0.0 - Sim += CommonChars / P_len - Sim += CommonChars / T_len - Sim += (CommonChars - Transpositions) / CommonChars - return Sim / 3.0 - -def _jaro_length_filter(P_len: int, T_len: int, score_cutoff: float) -> bool: - """ - filter matches below score_cutoff based on string lengths - """ - if not P_len or not T_len: return False - - sim = _jaro_calculate_similarity(P_len, T_len, min(P_len, T_len), 0) - return sim >= score_cutoff - -def _jaro_common_char_filter(P_len: int, T_len: int, CommonChars: int, score_cutoff: float) -> bool: - """ - filter matches below score_cutoff based on string lengths and common characters - """ - if not CommonChars: return False - - sim = _jaro_calculate_similarity(P_len, T_len, CommonChars, 0) - return sim >= score_cutoff - - -def _jaro_bounds(s1, s2): - """ - find bounds and skip out of bound parts of the sequences - """ - P_len = len(s1) - T_len = len(s2) - - # since jaro uses a sliding window some parts of T/P might never be in - # range an can be removed ahead of time - Bound = 0 - if T_len > P_len: - Bound = T_len // 2 - 1 - if T_len > P_len + Bound: - s2 = s2[:P_len + Bound] - else: - Bound = P_len // 2 - 1 - if P_len > T_len + Bound: - s1 = s1[:T_len + Bound] - return s1, s2, Bound - -def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None) -> float: - """ - Calculates the jaro similarity - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : float, optional - Optional argument for a score threshold as a float between 0 and 1.0. - For ratio < score_cutoff 0 is returned instead. Default is 0, - which deactivates this behaviour. - - Returns - ------- - similarity : float - similarity between s1 and s2 as a float between 0 and 1.0 - - """ - if s1 is None or s2 is None: - return 0 - - if processor is not None: - s1 = processor(s1) - s2 = processor(s2) - - if score_cutoff is None: - score_cutoff = 0 - - P_len = len(s1) - T_len = len(s2) - - # short circuit if score_cutoff can not be reached - if not _jaro_length_filter(P_len, T_len, score_cutoff): - return 0 - - if P_len == 1 and T_len == 1: - return float(s1[0] == s2[0]) - - s1, s2, Bound = _jaro_bounds(s1, s2) - - s1_flags = [False] * P_len - s2_flags = [False] * T_len - - # todo use bitparallel implementation - # looking only within search range, count & flag matched pairs - CommonChars = 0 - for i, s1_ch in enumerate(s1): - low = max(0, i - Bound) - hi = min(i + Bound, T_len - 1) - for j in range(low, hi + 1): - if not s2_flags[j] and s2[j] == s1_ch: - s1_flags[i] = s2_flags[j] = True - CommonChars += 1 - break - - # short circuit if score_cutoff can not be reached - if not _jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff): - return 0 - - # todo use bitparallel implementation - # count transpositions - k = trans_count = 0 - for i, s1_f in enumerate(s1_flags): - if s1_f: - for j in range(k, T_len): - if s2_flags[j]: - k = j + 1 - break - if s1[i] != s2[j]: - trans_count += 1 - - return _jaro_calculate_similarity(P_len, T_len, CommonChars, trans_count) - - -def jarowinkler_similarity(s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None) -> float: - """ - Calculates the jaro winkler similarity - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - prefix_weight : float, optional - Weight used for the common prefix of the two strings. - Has to be between 0 and 0.25. Default is 0.1. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : float, optional - Optional argument for a score threshold as a float between 0 and 1.0. - For ratio < score_cutoff 0 is returned instead. Default is 0, - which deactivates this behaviour. - - Returns - ------- - similarity : float - similarity between s1 and s2 as a float between 0 and 1.0 - - Raises - ------ - ValueError - If prefix_weight is invalid - """ - if s1 is None or s2 is None: - return 0 - - if processor is not None: - s1 = processor(s1) - s2 = processor(s2) - - if score_cutoff is None: - score_cutoff = 0 - - P_len = len(s1) - T_len = len(s2) - min_len = min(P_len, T_len) - prefix = 0 - max_prefix = min(min_len, 4) - - for _ in range(max_prefix): - if s1[prefix] != s2[prefix]: - break - prefix += 1 - - jaro_score_cutoff = score_cutoff - if (jaro_score_cutoff > 0.7): - prefix_sim = prefix * prefix_weight - - if (prefix_sim >= 1.0): - jaro_score_cutoff = 0.7 - else: - jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0)) - - Sim = jaro_similarity(s1, s2, score_cutoff=jaro_score_cutoff) - if (Sim > 0.7): - Sim += prefix * prefix_weight * (1.0 - Sim) - - return Sim if Sim >= score_cutoff else 0 diff --git a/.subtrees/jarowinkler/src/jarowinkler/common.hpp b/.subtrees/jarowinkler/src/jarowinkler/common.hpp deleted file mode 100644 index b8d75eb..0000000 --- a/.subtrees/jarowinkler/src/jarowinkler/common.hpp +++ /dev/null @@ -1,280 +0,0 @@ -#pragma once -#include "Python.h" -#include -#include - -#include "rapidfuzz_capi.h" - -#define PYTHON_VERSION(major, minor, micro) ((major << 24) | (minor << 16) | (micro << 8)) - -class PythonTypeError: public std::bad_typeid { -public: - - PythonTypeError(char const* error) - : m_error(error) {} - - virtual char const* what() const noexcept { - return m_error; - } -private: - char const* m_error; -}; - -/* copy from cython */ -static inline void CppExn2PyErr() { - try { - if (PyErr_Occurred()) - ; // let the latest Python exn pass through and ignore the current one - else - throw; - } catch (const std::bad_alloc& exn) { - PyErr_SetString(PyExc_MemoryError, exn.what()); - } catch (const std::bad_cast& exn) { - PyErr_SetString(PyExc_TypeError, exn.what()); - } catch (const std::bad_typeid& exn) { - PyErr_SetString(PyExc_TypeError, exn.what()); - } catch (const std::domain_error& exn) { - PyErr_SetString(PyExc_ValueError, exn.what()); - } catch (const std::invalid_argument& exn) { - PyErr_SetString(PyExc_ValueError, exn.what()); - } catch (const std::ios_base::failure& exn) { - PyErr_SetString(PyExc_IOError, exn.what()); - } catch (const std::out_of_range& exn) { - PyErr_SetString(PyExc_IndexError, exn.what()); - } catch (const std::overflow_error& exn) { - PyErr_SetString(PyExc_OverflowError, exn.what()); - } catch (const std::range_error& exn) { - PyErr_SetString(PyExc_ArithmeticError, exn.what()); - } catch (const std::underflow_error& exn) { - PyErr_SetString(PyExc_ArithmeticError, exn.what()); - } catch (const std::exception& exn) { - PyErr_SetString(PyExc_RuntimeError, exn.what()); - } - catch (...) - { - PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); - } -} - -#define LIST_OF_CASES() \ - X_ENUM(RF_UINT8, uint8_t ) \ - X_ENUM(RF_UINT16, uint16_t) \ - X_ENUM(RF_UINT32, uint32_t) \ - X_ENUM(RF_UINT64, uint64_t) - -/* RAII Wrapper for RF_String */ -struct RF_StringWrapper { - RF_String string; - PyObject* obj; - - RF_StringWrapper() - : string({nullptr, (RF_StringType)0, nullptr, 0, nullptr}), obj(nullptr) {} - - RF_StringWrapper(RF_String string_) - : string(string_), obj(nullptr) {} - - RF_StringWrapper(RF_String string_, PyObject* o) - : string(string_), obj(o) - { - Py_XINCREF(obj); - } - - RF_StringWrapper(const RF_StringWrapper&) = delete; - RF_StringWrapper& operator=(const RF_StringWrapper&) = delete; - - RF_StringWrapper(RF_StringWrapper&& other) - : RF_StringWrapper() - { - swap(*this, other); - } - - RF_StringWrapper& operator=(RF_StringWrapper&& other) { - if (&other != this) { - if (string.dtor) { - string.dtor(&string); - } - Py_XDECREF(obj); - string = other.string; - obj = other.obj; - other.string = {nullptr, (RF_StringType)0, nullptr, 0, nullptr}; - other.obj = nullptr; - } - return *this; - }; - - ~RF_StringWrapper() { - if (string.dtor) { - string.dtor(&string); - } - Py_XDECREF(obj); - } - - friend void swap(RF_StringWrapper& first, RF_StringWrapper& second) noexcept - { - using std::swap; - swap(first.string, second.string); - swap(first.obj, second.obj); - } -}; - -void default_string_deinit(RF_String* string) -{ - free(string->data); -} - -template -auto visit(const RF_String& str, Func&& f, Args&&... args) -{ - switch(str.kind) { -# define X_ENUM(kind, type) case kind: return f((type*)str.data, (type*)str.data + str.length, std::forward(args)...); - LIST_OF_CASES() -# undef X_ENUM - default: - throw std::logic_error("Invalid string type"); - } -} - -template -auto visitor(const RF_String& str1, const RF_String& str2, Func&& f, Args&&... args) -{ - return visit(str2, - [&](auto first, auto last) { - return visit(str1, std::forward(f), first, last, std::forward(args)...); - } - ); -} - -static inline bool is_valid_string(PyObject* py_str) -{ - bool is_string = false; - - if (PyBytes_Check(py_str)) { - is_string = true; - } - else if (PyUnicode_Check(py_str)) { - // PEP 623 deprecates legacy strings and therefor - // deprecates e.g. PyUnicode_READY in Python 3.10 -#if PY_VERSION_HEX < PYTHON_VERSION(3, 10, 0) - if (PyUnicode_READY(py_str)) { - // cython will use the exception set by PyUnicode_READY - throw std::runtime_error(""); - } -#endif - is_string = true; - } - - return is_string; -} - -static inline void validate_string(PyObject* py_str, const char* err) -{ - if (PyBytes_Check(py_str)) { - return; - } - else if (PyUnicode_Check(py_str)) { - // PEP 623 deprecates legacy strings and therefor - // deprecates e.g. PyUnicode_READY in Python 3.10 -#if PY_VERSION_HEX < PYTHON_VERSION(3, 10, 0) - if (PyUnicode_READY(py_str)) { - // cython will use the exception set by PyUnicode_READY - throw std::runtime_error(""); - } -#endif - return; - } - - throw PythonTypeError(err); -} - -static inline RF_String convert_string(PyObject* py_str) -{ - if (PyBytes_Check(py_str)) { - return { - nullptr, - RF_UINT8, - PyBytes_AS_STRING(py_str), - static_cast(PyBytes_GET_SIZE(py_str)), - nullptr - }; - } else { - RF_StringType kind; - switch(PyUnicode_KIND(py_str)) { - case PyUnicode_1BYTE_KIND: - kind = RF_UINT8; - break; - case PyUnicode_2BYTE_KIND: - kind = RF_UINT16; - break; - default: - kind = RF_UINT32; - break; - } - - return { - nullptr, - kind, - PyUnicode_DATA(py_str), - static_cast(PyUnicode_GET_LENGTH(py_str)), - nullptr - }; - } -} - -template -static void scorer_deinit(RF_ScorerFunc* self) -{ - delete (CachedScorer*)self->context; -} - -template -static inline bool scorer_func_wrapper_f64(const RF_ScorerFunc* self, const RF_String* str, int64_t str_count, double score_cutoff, double* result) -{ - CachedScorer& scorer = *(CachedScorer*)self->context; - try { - if (str_count != 1) - { - throw std::logic_error("Only str_count == 1 supported"); - } - *result = visit(*str, [&](auto first, auto last){ - return scorer.similarity(first, last, score_cutoff); - }); - } catch(...) { - PyGILState_STATE gilstate_save = PyGILState_Ensure(); - CppExn2PyErr(); - PyGILState_Release(gilstate_save); - return false; - } - return true; -} - -template