From f19314673feb6acc9f219f8f8da19d7a7cf07c6c Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Tue, 30 Aug 2022 00:02:08 -0800 Subject: [PATCH] Fix and improve score() docstrings --- dedupe/api.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/dedupe/api.py b/dedupe/api.py index 05ac2a71..b33b0a1b 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -97,12 +97,27 @@ class IntegralMatching(Matching): def score(self, pairs: RecordPairs) -> Scores: """ - Scores pairs of records. Returns pairs of tuples of records id and - associated probabilities that the pair of records are match + Scores pairs of records. Returns a numpy structured array of scores. Args: - pairs: Iterator of pairs of records - + pairs: Iterator of pairs of records, such as from the output of :func:`pairs` + + Returns: + A numpy + `structured array <https://numpy.org/doc/stable/user/basics.rec.html>`_ + with a dtype of `[('pairs', id_type, 2), ('score', 'f4')]` + where id_type is either a str or int, + and score is a 32-bit float in the range (0, 1]. + The 'pairs' column contains pairs of ids of + the records compared and the 'score' column contains + the similarity score for that pair of records. + + This array will be a numpy.ndarray when self.num_cores is 1, + and a numpy.memmap when self.num_cores is greater than 1. + This memmap will automatically clean itself up; you don't + have to worry about it. + + For each pair, the smaller id will be first. """ try: matches = core.scoreDuplicates( @@ -802,6 +817,8 @@ def score(self, blocks: Blocks) -> Generator[Scores, None, None]: Args: blocks: Iterator of blocks of records + Yields: + Structured numpy arrays. See :meth:`dedupe.Dedupe.score` for more info. """ matches = core.scoreGazette( @@ -943,7 +960,7 @@ def __init__( Args: settings_file: A file object containing settings info produced from the - :func:`~dedupe.api.ActiveMatching.write_settings` method. + :meth:`dedupe.Dedupe.write_settings` method. 
num_cores: The number of cpus to use for parallel processing, defaults to the number of cpus