From afa1ff298e15b9362b4ccdee59dd98ae21ec5fbb Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Mon, 29 Aug 2022 23:02:29 -0800 Subject: [PATCH 1/3] feat: Auto cleanup scores with weakref.finalize It wasn't stated anywhere in the docs that you have to cleanup the possibly-a-tempfile, so people might not have been doing it. And, I thought this was better than the alternatives from https://stackoverflow.com/questions/865115/how-do-i-correctly-clean-up-a-python-object --- dedupe/api.py | 14 -------------- dedupe/core.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 71edb4c5b..05ac2a71a 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -9,7 +9,6 @@ import itertools import logging import multiprocessing -import os import pickle import sqlite3 import tempfile @@ -175,7 +174,6 @@ def partition( clusters = self.cluster(pair_scores, threshold) clusters = self._add_singletons(data, clusters) clusters = list(clusters) - _cleanup_scores(pair_scores) return clusters def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters: @@ -514,7 +512,6 @@ def join( links = pair_scores[pair_scores["score"] > threshold] links = list(links) - _cleanup_scores(pair_scores) return links def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links: @@ -1468,14 +1465,3 @@ def flatten_training( y.extend([encoded_y] * len(pairs)) return examples, numpy.array(y) - - -def _cleanup_scores(arr: Scores) -> None: - try: - mmap_file = arr.filename # type: ignore - except AttributeError: - pass - else: - del arr - if mmap_file: - os.remove(mmap_file) diff --git a/dedupe/core.py b/dedupe/core.py index f89a33f59..2923be93b 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -10,6 +10,7 @@ import os import queue import tempfile +import weakref from typing import TYPE_CHECKING, overload import numpy @@ -176,9 +177,29 @@ def scoreDuplicates( else: scored_pairs = numpy.array([], dtype=dtype) + # Monkeypatch in these extra methods and attributes. + # See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods + scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs) # type: ignore[union-attr] + scored_pairs.removed = property(_is_removed) # type: ignore[union-attr] + return scored_pairs +def _cleanup_scores(arr: Scores) -> None: + try: + mmap_file = arr.filename # type: ignore + except AttributeError: + pass + else: + del arr + if mmap_file: + os.remove(mmap_file) + + +def _is_removed(self): + return not self.remove.alive + + def fillQueue( queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000 ) -> None: From f19314673feb6acc9f219f8f8da19d7a7cf07c6c Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 30 Aug 2022 00:02:08 -0800 Subject: [PATCH 2/3] Fix and improve score() docstrings --- dedupe/api.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 05ac2a71a..b33b0a1ba 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -97,12 +97,27 @@ class IntegralMatching(Matching): def score(self, pairs: RecordPairs) -> Scores: """ - Scores pairs of records. Returns pairs of tuples of records id and - associated probabilities that the pair of records are match + Scores pairs of records. Returns a numpy structured array of scores. Args: - pairs: Iterator of pairs of records - + pairs: Iterator of pairs of records, such as from the output of :func:`pairs` + + Returns: + A numpy + `structured array `_ + with a with a dtype of `[('pairs', id_type, 2), ('score', 'f4')]` + where dtype is either a str or int, + and score is a 32-bit float in the range (0, 1]. + The 'pairs' column contains pairs of ids of + the records compared and the 'score' column contains + the similarity score for that pair of records. + + This array will be a numpy.array when self.num_cores is 1, + and a numpy.memmap when self.num_cores is greater than 1. + This memmap will automatically clean itself up, you don't + have to worry about it. + + For each pair, the smaller id will be first. """ try: matches = core.scoreDuplicates( @@ -802,6 +817,8 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]: Args: blocks: Iterator of blocks of records + Yields: + Structured numpy arrays. See :meth:`dedupe.Dedupe.score` for more info. """ matches = core.scoreGazette( @@ -943,7 +960,7 @@ def __init__( Args: settings_file: A file object containing settings info produced from the - :func:`~dedupe.api.ActiveMatching.write_settings` method. + :meth:`dedupe.Dedupe.write_settings` method. num_cores: The number of cpus to use for parallel processing, defaults to the number of cpus From 133d74408f4efbd22d353a1d87e677b2791e42cd Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 30 Aug 2022 09:32:34 -0800 Subject: [PATCH 3/3] dev: display specific mypy error codes Now you can #type: ignore[specific-code] --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6d4674602..964cf56c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,10 +4,12 @@ requires = ["setuptools", "cython"] [tool.mypy] +python_version = "3.10" plugins = "numpy.typing.mypy_plugin" ignore_missing_imports = true files = "dedupe" check_untyped_defs = true +show_error_codes = true [tool.pytest.ini_options] minversion = "7.1"