Auto cleanup memmap scores #1086

Open
wants to merge 3 commits into base: main
41 changes: 22 additions & 19 deletions dedupe/api.py
@@ -9,7 +9,6 @@
import itertools
import logging
import multiprocessing
import os
import pickle
import sqlite3
import tempfile
@@ -98,12 +97,27 @@ class IntegralMatching(Matching):

def score(self, pairs: RecordPairs) -> Scores:
"""
Scores pairs of records. Returns pairs of tuples of records id and
associated probabilities that the pair of records are match
Scores pairs of records. Returns a numpy structured array of scores.

Args:
pairs: Iterator of pairs of records

pairs: Iterator of pairs of records, such as from the output of :func:`pairs`

Returns:
A numpy
`structured array <https://docs.scipy.org/doc/numpy/user/basics.rec.html>`_
with a dtype of `[('pairs', id_type, 2), ('score', 'f4')]`
where id_type is either a str or int,
and score is a 32-bit float in the range (0, 1].
The 'pairs' column contains pairs of ids of
the records compared and the 'score' column contains
the similarity score for that pair of records.

This array will be a numpy.array when self.num_cores is 1,
and a numpy.memmap when self.num_cores is greater than 1.
This memmap will automatically clean itself up; you don't
need to remove it yourself.

For each pair, the smaller id will be first.
"""
try:
matches = core.scoreDuplicates(
@@ -175,7 +189,6 @@ def partition(
clusters = self.cluster(pair_scores, threshold)
clusters = self._add_singletons(data, clusters)
clusters = list(clusters)
Contributor Author

Now that we have auto-cleanup, perhaps we don't want to cast this to a list? I think then this entire method would be out-of-core, right?

Same for casting to list in join()

Contributor

that's right. i think we should put in a warning for a few releases though because some folks are likely depending on it being a list instead of a generator

Contributor Author

do we want to do what Gazetteer.search() does with the generator: bool parameter?

def score(.... generator: bool = False):
    ...
    if not generator:
        warn("this deprecated")
        return list(scores)
    else:
        yield from scores
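
A side note on the sketch above: any Python function whose body contains yield is a generator, so the return list(scores) branch would never hand a list back to the caller. A minimal, self-contained sketch of the same deprecation idea, with hypothetical names that are not part of this diff, keeps the list branch in a plain wrapper:

import warnings
from typing import Iterator, Union


def _iter_scores(pair_scores) -> Iterator:
    # Stand-in for the real out-of-core iteration over scored pairs.
    yield from pair_scores


def score(pair_scores, generator: bool = False) -> Union[list, Iterator]:
    # The wrapper itself contains no yield, so returning a list here works.
    if generator:
        return _iter_scores(pair_scores)
    warnings.warn(
        "returning a list from score() is deprecated; pass generator=True",
        DeprecationWarning,
        stacklevel=2,
    )
    return list(_iter_scores(pair_scores))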

_cleanup_scores(pair_scores)
return clusters

def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters:
@@ -514,7 +527,6 @@ def join(
links = pair_scores[pair_scores["score"] > threshold]

links = list(links)
_cleanup_scores(pair_scores)
return links

def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
@@ -805,6 +817,8 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]:
Args:
blocks: Iterator of blocks of records

Yields:
Structured numpy arrays. See :meth:`dedupe.Dedupe.score` for more info.
"""

matches = core.scoreGazette(
@@ -946,7 +960,7 @@ def __init__(
Args:
settings_file: A file object containing settings
info produced from the
:func:`~dedupe.api.ActiveMatching.write_settings` method.
:meth:`dedupe.Dedupe.write_settings` method.

num_cores: The number of cpus to use for parallel
processing, defaults to the number of cpus
@@ -1468,14 +1482,3 @@ def flatten_training(
y.extend([encoded_y] * len(pairs))

return examples, numpy.array(y)


def _cleanup_scores(arr: Scores) -> None:
try:
mmap_file = arr.filename # type: ignore
except AttributeError:
pass
else:
del arr
if mmap_file:
os.remove(mmap_file)
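
As a reading aid for the updated score() docstring above, here is a hedged sketch of how a caller might consume the returned structured array; the helper name and the example threshold are made up and are not part of the library:

import numpy


def matches_above(scores: numpy.ndarray, threshold: float = 0.5):
    # 'scores' has dtype [('pairs', id_type, 2), ('score', 'f4')], and for
    # each pair the smaller record id comes first.
    keep = scores["score"] > threshold
    for (id_1, id_2), score in zip(scores["pairs"][keep], scores["score"][keep]):
        yield id_1, id_2, float(score)

# When num_cores > 1 the array is a numpy.memmap; per this change its backing
# file is removed automatically once the array is no longer referenced.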
21 changes: 21 additions & 0 deletions dedupe/core.py
@@ -10,6 +10,7 @@
import os
import queue
import tempfile
import weakref
from typing import TYPE_CHECKING, overload

import numpy
@@ -176,9 +177,29 @@ def scoreDuplicates(
else:
scored_pairs = numpy.array([], dtype=dtype)

# Monkeypatch in these extra methods and attributes.
Contributor

this is really cool!

Contributor Author

Yeah seriously! The first time I've used it too.

# See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods
scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs) # type: ignore[union-attr]
scored_pairs.removed = property(_is_removed) # type: ignore[union-attr]

return scored_pairs


def _cleanup_scores(arr: Scores) -> None:
try:
mmap_file = arr.filename # type: ignore
except AttributeError:
pass
else:
del arr
if mmap_file:
os.remove(mmap_file)


def _is_removed(self):
return not self.remove.alive


def fillQueue(
queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000
) -> None:
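
For readers new to weakref.finalize, here is a minimal standalone sketch of the cleanup pattern used in core.py above; the temporary file, dtype, shape, and variable names are illustrative only:

import os
import tempfile
import weakref

import numpy


def _remove_file(path: str) -> None:
    # Finalizer callback: runs at most once, when the memmap is garbage
    # collected or when the finalizer is invoked explicitly.
    if os.path.exists(path):
        os.remove(path)


fd, path = tempfile.mkstemp(suffix=".scores")
os.close(fd)

scores = numpy.memmap(path, dtype="f4", mode="w+", shape=(10,))
# Tie the backing file's lifetime to the memmap object; the callback only
# captures the path, not the array itself.
finalizer = weakref.finalize(scores, _remove_file, path)

del scores               # in CPython the refcount hits zero here...
print(finalizer.alive)   # ...so the finalizer has already run: prints False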
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -4,10 +4,12 @@ requires = ["setuptools",
"cython"]

[tool.mypy]
python_version = "3.10"
plugins = "numpy.typing.mypy_plugin"
ignore_missing_imports = true
files = "dedupe"
check_untyped_defs = true
show_error_codes = true

[tool.pytest.ini_options]
minversion = "7.1"