Skip to content

Commit

Permalink
feat: Auto cleanup scores with weakref.finalize
Browse files Browse the repository at this point in the history
It wasn't stated anywhere in the docs that you have to clean up the
possibly-a-tempfile backing the scores, so people might not have been doing it.
Also, I thought using weakref.finalize was better than
the alternatives from
https://stackoverflow.com/questions/865115/how-do-i-correctly-clean-up-a-python-object
  • Loading branch information
NickCrews committed Aug 30, 2022
1 parent bf028e9 commit afa1ff2
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 14 deletions.
14 changes: 0 additions & 14 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import itertools
import logging
import multiprocessing
import os
import pickle
import sqlite3
import tempfile
Expand Down Expand Up @@ -175,7 +174,6 @@ def partition(
clusters = self.cluster(pair_scores, threshold)
clusters = self._add_singletons(data, clusters)
clusters = list(clusters)
_cleanup_scores(pair_scores)
return clusters

def _add_singletons(self, data: Data, clusters: Clusters) -> Clusters:
Expand Down Expand Up @@ -514,7 +512,6 @@ def join(
links = pair_scores[pair_scores["score"] > threshold]

links = list(links)
_cleanup_scores(pair_scores)
return links

def one_to_one(self, scores: Scores, threshold: float = 0.0) -> Links:
Expand Down Expand Up @@ -1468,14 +1465,3 @@ def flatten_training(
y.extend([encoded_y] * len(pairs))

return examples, numpy.array(y)


def _cleanup_scores(arr: Scores) -> None:
    """Delete the on-disk file backing *arr*, if there is one.

    The file path is read from ``arr.filename``. Objects without that
    attribute, or whose ``filename`` is falsy (``None`` / empty), are
    left untouched.  (Presumably *arr* is a ``numpy.memmap`` when the
    attribute is present -- confirm against the code that builds the
    scores.)

    The call is idempotent: if the file has already been removed, the
    function returns quietly instead of raising ``FileNotFoundError``.
    Other ``OSError`` failures (e.g. permission problems) still propagate.
    """
    mmap_file = getattr(arr, "filename", None)
    if not mmap_file:
        return

    # Drop this function's own reference before unlinking.  The caller
    # still holds its reference, so this is best-effort only.
    del arr

    try:
        os.remove(mmap_file)
    except FileNotFoundError:
        # Already cleaned up (by an earlier call or externally).
        pass
21 changes: 21 additions & 0 deletions dedupe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import queue
import tempfile
import weakref
from typing import TYPE_CHECKING, overload

import numpy
Expand Down Expand Up @@ -176,9 +177,29 @@ def scoreDuplicates(
else:
scored_pairs = numpy.array([], dtype=dtype)

# Monkeypatch in these extra methods and attributes.
# See https://docs.python.org/3/library/weakref.html#comparing-finalizers-with-del-methods
scored_pairs.remove = weakref.finalize(scored_pairs, _cleanup_scores, scored_pairs) # type: ignore[union-attr]
scored_pairs.removed = property(_is_removed) # type: ignore[union-attr]

return scored_pairs


def _cleanup_scores(arr: Scores) -> None:
    """Remove the file that backs *arr* from disk, when it has one.

    ``arr.filename`` supplies the path.  If the attribute is missing, or
    its value is falsy (``None`` / empty string), nothing happens.
    (Presumably a ``filename`` is only present when the scores are a
    ``numpy.memmap`` -- confirm against the code that creates them.)

    Safe to call more than once: a file that no longer exists is treated
    as already cleaned up rather than raising ``FileNotFoundError``.
    Any other ``OSError`` (e.g. a permission error) is still raised.
    """
    mmap_file = getattr(arr, "filename", None)
    if not mmap_file:
        return

    # Release our local reference before deleting the file.  This is
    # best-effort: whoever called us still holds a reference of their own.
    del arr

    try:
        os.remove(mmap_file)
    except FileNotFoundError:
        # The file is already gone; there is nothing left to clean up.
        pass


def _is_removed(self):
    """Report whether the finalizer stored at ``self.remove`` is spent.

    Returns ``True`` once ``self.remove.alive`` is falsy, ``False``
    while it is still truthy.
    """
    finalizer = self.remove
    if finalizer.alive:
        return False
    return True


def fillQueue(
queue: _Queue, iterable: Iterable[Any], stop_signals: int, chunk_size: int = 20000
) -> None:
Expand Down

0 comments on commit afa1ff2

Please sign in to comment.