From afa1ff298e15b9362b4ccdee59dd98ae21ec5fbb Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Mon, 29 Aug 2022 23:02:29 -0800
Subject: [PATCH 1/3] feat: Auto cleanup scores with weakref.finalize

It wasn't stated anywhere in the docs that you have to clean up the
possibly-a-tempfile, so people might not have been doing it.
I also thought weakref.finalize was a better fit than
the alternatives from
https://stackoverflow.com/questions/865115/how-do-i-correctly-clean-up-a-python-object
---
 dedupe/api.py  | 14 --------------
 dedupe/core.py | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index 71edb4c5b..05ac2a71a 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -9,7 +9,6 @@
 import itertools
 import logging
 import multiprocessing
-import os
 import pickle
 import sqlite3
 import tempfile
@@ -175,7 +174,6 @@ def partition(
         clusters = self.cluster(pair_scores, threshold)
         clusters = self._add_singletons(data, clusters)
         clusters = list(clusters)
-        _cleanup_scores(pair_scores)
         return clusters
 
     def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters:
@@ -514,7 +512,6 @@ def join(
             links = pair_scores[pair_scores["score"] > threshold]
 
         links = list(links)
-        _cleanup_scores(pair_scores)
         return links
 
     def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
@@ -1468,14 +1465,3 @@ def flatten_training(
         y.extend([encoded_y] * len(pairs))
 
     return examples, numpy.array(y)
-
-
-def _cleanup_scores(arr: Scores) -> None:
-    try:
-        mmap_file = arr.filename  # type: ignore
-    except AttributeError:
-        pass
-    else:
-        del arr
-        if mmap_file:
-            os.remove(mmap_file)
diff --git a/dedupe/core.py b/dedupe/core.py
index f89a33f59..2923be93b 100644
--- a/dedupe/core.py
+++ b/dedupe/core.py
@@ -10,6 +10,7 @@
 import os
 import queue
 import tempfile
+import weakref
 from typing import TYPE_CHECKING, overload
 
 import numpy
@@ -176,9 +177,29 @@ def scoreDuplicates(
     else:
         scored_pairs = numpy.array([], dtype=dtype)
 
+    # Monkeypatch in these extra methods and attributes.
+    # See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods
+    scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs)  # type: ignore[union-attr]
+    scored_pairs.removed = property(_is_removed)  # type: ignore[union-attr]
+
     return scored_pairs
 
 
+def _cleanup_scores(arr: Scores) -> None:  # delete the file behind arr, if any
+    try:
+        mmap_file = arr.filename  # type: ignore
+    except AttributeError:  # arr exposes no ``filename``: nothing to delete
+        pass
+    else:
+        del arr  # unbinds only this function's local name
+        if mmap_file:  # a falsy filename (None or "") means no file to delete
+            os.remove(mmap_file)
+
+
+def _is_removed(self):  # True once self.remove reports it is no longer alive
+    return not self.remove.alive
+
+
 def fillQueue(
     queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000
 ) -> None:

From f19314673feb6acc9f219f8f8da19d7a7cf07c6c Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Tue, 30 Aug 2022 00:02:08 -0800
Subject: [PATCH 2/3] Fix and improve score() docstrings

---
 dedupe/api.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/dedupe/api.py b/dedupe/api.py
index 05ac2a71a..b33b0a1ba 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -97,12 +97,27 @@ class IntegralMatching(Matching):
 
     def score(self, pairs: RecordPairs) -> Scores:
         """
-        Scores pairs of records. Returns pairs of tuples of records id and
-        associated probabilities that the pair of records are match
+        Scores pairs of records. Returns a numpy structured array of scores.
 
         Args:
-            pairs: Iterator of pairs of records
-
+            pairs: Iterator of pairs of records, such as from the output of :func:`pairs`
+
+        Returns:
+            A numpy
+            `structured array <https://docs.scipy.org/doc/numpy/user/basics.rec.html>`_
+            with a dtype of `[('pairs', id_type, 2), ('score', 'f4')]`
+            where id_type is either a str or int,
+            and score is a 32-bit float in the range (0, 1].
+            The 'pairs' column contains pairs of ids of
+            the records compared and the 'score' column contains
+            the similarity score for that pair of records.
+
+            This array will be a numpy.array when self.num_cores is 1,
+            and a numpy.memmap when self.num_cores is greater than 1.
+            This memmap will automatically clean itself up; you don't
+            have to worry about it.
+
+            For each pair, the smaller id will be first.
         """
         try:
             matches = core.scoreDuplicates(
@@ -802,6 +817,8 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]:
         Args:
             blocks: Iterator of blocks of records
 
+        Yields:
+            Structured numpy arrays. See :meth:`dedupe.Dedupe.score` for more info.
         """
 
         matches = core.scoreGazette(
@@ -943,7 +960,7 @@ def __init__(
         Args:
             settings_file: A file object containing settings
                            info produced from the
-                           :func:`~dedupe.api.ActiveMatching.write_settings` method.
+                           :meth:`dedupe.Dedupe.write_settings` method.
 
             num_cores: The number of cpus to use for parallel
                        processing, defaults to the number of cpus

From 133d74408f4efbd22d353a1d87e677b2791e42cd Mon Sep 17 00:00:00 2001
From: Nick Crews <nicholas.b.crews@gmail.com>
Date: Tue, 30 Aug 2022 09:32:34 -0800
Subject: [PATCH 3/3] dev: display specific mypy error codes

Now you can write `# type: ignore[specific-code]` instead of a blanket ignore.
---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 6d4674602..964cf56c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,10 +4,12 @@ requires = ["setuptools",
             "cython"]
 
 [tool.mypy]
+python_version = "3.10"
 plugins = "numpy.typing.mypy_plugin"
 ignore_missing_imports = true
 files = "dedupe"
 check_untyped_defs = true
+show_error_codes = true
 
 [tool.pytest.ini_options]
 minversion = "7.1"