From 22c54bb639d7bd98c546020964f1da29e82cb6bf Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 11:18:47 -0800 Subject: [PATCH 01/11] ref: remove __len__ from DataModel It isn't used publicly anywhere, and it adds more API surface area to worry about. This is still backwards compatible, as self._len is still saved/restored byt pickle the same way, now we just access it directly instead of going via __len__(). --- dedupe/datamodel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index bb4335658..a1d50d0dd 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -50,9 +50,6 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): self._len = len(all_variables) - def __len__(self) -> int: - return self._len - # Changing this from a property to just a normal attribute causes # pickling problems, because we are removing static methods from # their class context. This could be fixed by defining comparators @@ -82,7 +79,7 @@ def distances( ) -> numpy.typing.NDArray[numpy.float_]: num_records = len(record_pairs) - distances = numpy.empty((num_records, len(self)), "f4") + distances = numpy.empty((num_records, self._len), "f4") for i, (record_1, record_2) in enumerate(record_pairs): From 9a34c008d2ef9c38fd994717c27792e1dddf47c2 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 11:47:02 -0800 Subject: [PATCH 02/11] ref: make typifying variables more clear --- dedupe/datamodel.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index a1d50d0dd..b107fb273 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -38,8 +38,8 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - all_variables: list[Variable] - self.primary_variables, all_variables = typify_variables(variable_definitions) + self.primary_variables = typify_variables(variable_definitions) + all_variables = _expand_higher_variables(self.primary_variables) self._derived_start = len(all_variables) all_variables += interactions(variable_definitions, self.primary_variables) @@ -141,9 +141,8 @@ def __setstate__(self, d): def typify_variables( variable_definitions: Iterable[VariableDefinition], -) -> tuple[list[FieldVariable], list[Variable]]: - primary_variables: list[FieldVariable] = [] - all_variables: list[Variable] = [] +) -> list[FieldVariable]: + variables: list[FieldVariable] = [] only_custom = True for definition in variable_definitions: @@ -188,13 +187,7 @@ def typify_variables( variable_object = variable_class(definition) assert isinstance(variable_object, FieldVariable) - primary_variables.append(variable_object) - - if hasattr(variable_object, "higher_vars"): - all_variables.extend(variable_object.higher_vars) - else: - variable_object = cast(Variable, variable_object) - all_variables.append(variable_object) + variables.append(variable_object) if only_custom: raise ValueError( @@ -203,7 +196,17 @@ def typify_variables( "blocking rules" ) - return primary_variables, all_variables + return variables + + +def _expand_higher_variables(variables: Iterable[Variable]) -> list[Variable]: + result: list[Variable] = [] + for variable in variables: + if hasattr(variable, "higher_vars"): + result.extend(variable.higher_vars) + else: + result.append(variable) + return result def missing(variables: list[Variable]) -> list[MissingDataType]: From a6504d58d843e09ee21610e5882c6804bf21814d Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:47:33 -0800 Subject: [PATCH 03/11] ref: tweak error raising in typify_variables() --- dedupe/datamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index b107fb273..2249dbbd8 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -179,9 +179,9 @@ def typify_variables( try: variable_class = VARIABLE_CLASSES[variable_type] except KeyError: + valid = ", ".join(VARIABLE_CLASSES) raise KeyError( - "Field type %s not valid. Valid types include %s" - % (definition["type"], ", ".join(VARIABLE_CLASSES)) + f"Variable type {variable_type} not valid. Valid types include {valid}" ) variable_object = variable_class(definition) From 4871eae9d14d2946941f5a256db3ffe4ebf1d683 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:50:59 -0800 Subject: [PATCH 04/11] ref: Clarify only_custom logic in typify_variables --- dedupe/datamodel.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 2249dbbd8..83fbeae90 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -8,6 +8,7 @@ import numpy import dedupe.variables +from dedupe.variables.base import CustomType from dedupe.variables.base import FieldType as FieldVariable from dedupe.variables.base import MissingDataType, Variable from dedupe.variables.interaction import InteractionType @@ -143,8 +144,6 @@ def typify_variables( variable_definitions: Iterable[VariableDefinition], ) -> list[FieldVariable]: variables: list[FieldVariable] = [] - only_custom = True - for definition in variable_definitions: try: variable_type = definition["type"] @@ -163,9 +162,6 @@ def typify_variables( "{'field' : 'Phone', type: 'String'}" ) - if variable_type != "Custom": - only_custom = False - if variable_type == "Interaction": continue @@ -189,6 +185,7 @@ def typify_variables( variables.append(variable_object) + only_custom = all(isinstance(v, CustomType) for v in variables) if only_custom: raise ValueError( "At least one of the variable types needs to be a type" From 8dc4b800c95634e06993932388d60f7ed1f957ab Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 12:55:16 -0800 Subject: [PATCH 05/11] typ: Fix typing of typify_variables() We iterate through the input more than once, so an Iterable is insufficient. --- dedupe/datamodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 83fbeae90..269ca576d 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -141,7 +141,7 @@ def __setstate__(self, d): def typify_variables( - variable_definitions: Iterable[VariableDefinition], + variable_definitions: list[VariableDefinition], ) -> list[FieldVariable]: variables: list[FieldVariable] = [] for definition in variable_definitions: From 442a65af75d3390d14af0d9ec391981d140fabc0 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 13:12:54 -0800 Subject: [PATCH 06/11] ref: Further rename field to variable in datamodel --- dedupe/datamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 269ca576d..bbca107a8 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -217,13 +217,13 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: def interactions( definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] ) -> list[InteractionType]: - field_d = {field.name: field for field in primary_variables} + var_d = {var.name: var for var in primary_variables} interactions = [] for definition in definitions: if definition["type"] == "Interaction": var = InteractionType(definition) - var.expandInteractions(field_d) + var.expandInteractions(var_d) interactions.extend(var.higher_vars) return interactions From 4eb1aa15d1b3e1ebb2976dad7ff335f2d3d3a7ce Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 13:46:38 -0800 Subject: [PATCH 07/11] test: Add more tests for interaction variables --- tests/test_api.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_api.py b/tests/test_api.py index 84ac9169a..32458a17d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -58,11 +58,13 @@ def test_initialize_fields(self): [], ) + # only customs with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}], ) + # Only customs with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [ @@ -71,6 +73,24 @@ def test_initialize_fields(self): ], ) + # Only custom and interactions + with self.assertRaises(ValueError): + dedupe.api.ActiveMatching( + [ + {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, + {"field": "age", "type": "Custom", "comparator": lambda x, y: 1}, + {"type": "Interaction", "interaction variables": ["name", "age"]}, + ], + ) + + # Only interactions + with self.assertRaises(ValueError): + dedupe.api.ActiveMatching( + [ + {"type": "Interaction", "interaction variables": []}, + ], + ) + dedupe.api.ActiveMatching( [ {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, @@ -78,6 +98,19 @@ def test_initialize_fields(self): ], ) + dedupe.api.ActiveMatching( + [ + {"field": "name", "variable name": "name", "type": "String"}, + { + "field": "age", + "variable name": "age", + "type": "Custom", + "comparator": lambda x, y: 1, + }, + {"type": "Interaction", "interaction variables": ["name", "age"]}, + ], + ) + def test_check_record(self): matcher = dedupe.api.ActiveMatching(self.field_definition) From 35b53cb2d09edb3d8b3539438c918ba5dd7dcc03 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 14:43:50 -0800 Subject: [PATCH 08/11] Ensure unique variable names Before, we never checked for this, so when we called list.index() we just got back the first instance. I swapped us over to a lookup table because that makes more sense, but I wanted to also add this check because if someone had duplicate names, this would quitely change the behavior underneath someone: before it gave the first index, now it gives the last. --- dedupe/datamodel.py | 16 ++++++++++++++-- tests/test_api.py | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index bbca107a8..8f5b4108d 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -233,15 +233,27 @@ def missing_field_indices(variables: list[Variable]) -> list[int]: def interaction_indices(variables: list[Variable]) -> list[list[int]]: - var_names = [var.name for var in variables] + _ensure_unique_names(variables) + name_to_index = {var.name: i for i, var in enumerate(variables)} indices = [] for var in variables: if hasattr(var, "interaction_fields"): - interaction_indices = [var_names.index(f) for f in var.interaction_fields] # type: ignore + interaction_indices = [name_to_index[f] for f in var.interaction_fields] # type: ignore indices.append(interaction_indices) return indices +def _ensure_unique_names(variables: Iterable[Variable]) -> None: + seen = set() + for var in variables: + if var.name in seen: + raise ValueError( + "Variable name used more than once! " + "Choose a unique name for each variable: '{var.name}'" + ) + seen.add(var.name) + + def reduce_method(m): # type: ignore[no-untyped-def] return (getattr, (m.__self__, m.__func__.__name__)) diff --git a/tests/test_api.py b/tests/test_api.py index 32458a17d..6f9f40dd5 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -91,6 +91,26 @@ def test_initialize_fields(self): ], ) + # Duplicate variable names (explicitly) + with self.assertRaises(ValueError) as e: + dedupe.api.ActiveMatching( + [ + {"field": "age", "type": "String", "variable name": "my_age"}, + {"field": "age", "type": "ShortString", "variable name": "my_age"}, + ], + ) + assert "Variable name used more than once!" in str(e.exception) + + # Duplicate variable names (implicitly) + with self.assertRaises(ValueError) as e: + dedupe.api.ActiveMatching( + [ + {"field": "age", "type": "String"}, + {"field": "age", "type": "String"}, + ], + ) + assert "Variable name used more than once!" in str(e.exception) + dedupe.api.ActiveMatching( [ {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, From 2fdbe6e0a19db183351561b8a71426358f948e0b Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 14:52:35 -0800 Subject: [PATCH 09/11] ref: Remove "type" from non-exposed variables These variables 1. you can't currently create them because they inherit from Variable, not FieldVariable, so they don't appear in the datamodel.VARIABLE_CLASSES lookup table 2. You SHOULDN'T be able to instantiate these variables directly from a variable definition --- dedupe/variables/base.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index f80b28faa..c958db5ea 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -58,21 +58,16 @@ def all_subclasses( class DerivedType(Variable): - type = "Derived" - def __init__(self, definition: VariableDefinition): self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"])) super(DerivedType, self).__init__(definition) class MissingDataType(Variable): - type = "MissingData" + has_missing = False def __init__(self, name: str): - - self.name = "(%s: Not Missing)" % name - - self.has_missing = False + self.name = f"({name}: Not Missing)" class FieldType(Variable): From 2d9a6a306080c17ab49cb95558055b0d2c911950 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 15:29:01 -0800 Subject: [PATCH 10/11] ref: Create InteractionVariables the same as the rest Before the flow was: 1. create all the primary variables EXCEPT for the interactions 2. Expand those. 3. go back through and NOW create the InteractionVariables I found this confusing. We parse the var definitions twice in separate places, and InteractionVariables are a special case in the first pass. The other point of this is to make it so that there is one list of Variable instances which is the single source of truth. Once we do this then we can turn all the other private instance variables of DataModel into @functools.cached_property's, which will make it much easier to ensure pickle compatibility in the future, because only one variable needs to get saved and restored --- dedupe/datamodel.py | 54 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 8f5b4108d..c21e11616 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -29,7 +29,7 @@ ) from dedupe.predicates import Predicate -VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} +VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k} class DataModel(object): @@ -39,11 +39,16 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - self.primary_variables = typify_variables(variable_definitions) - all_variables = _expand_higher_variables(self.primary_variables) - self._derived_start = len(all_variables) - - all_variables += interactions(variable_definitions, self.primary_variables) + variables = typify_variables(variable_definitions) + non_interactions: list[FieldVariable] = [ + v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] + ] + self.primary_variables = non_interactions + expanded_primary = _expand_higher_variables(self.primary_variables) + self._derived_start = len(expanded_primary) + + all_variables = expanded_primary.copy() + all_variables += _expanded_interactions(variables) all_variables += missing(all_variables) self._missing_field_indices = missing_field_indices(all_variables) @@ -140,10 +145,8 @@ def __setstate__(self, d): self.__dict__ = d -def typify_variables( - variable_definitions: list[VariableDefinition], -) -> list[FieldVariable]: - variables: list[FieldVariable] = [] +def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: + variables: list[Variable] = [] for definition in variable_definitions: try: variable_type = definition["type"] @@ -162,9 +165,6 @@ def typify_variables( "{'field' : 'Phone', type: 'String'}" ) - if variable_type == "Interaction": - continue - if variable_type == "FuzzyCategorical" and "other fields" not in definition: definition["other fields"] = [ # type: ignore d["field"] @@ -181,16 +181,18 @@ def typify_variables( ) variable_object = variable_class(definition) - assert isinstance(variable_object, FieldVariable) + assert isinstance(variable_object, Variable) variables.append(variable_object) - only_custom = all(isinstance(v, CustomType) for v in variables) - if only_custom: + no_blocking_variables = all( + isinstance(v, (CustomType, InteractionType)) for v in variables + ) + if no_blocking_variables: raise ValueError( - "At least one of the variable types needs to be a type" - "other than 'Custom'. 'Custom' types have no associated" - "blocking rules" + "At least one of the variable types needs to be a type " + "other than 'Custom' or 'Interaction', " + "since these types have no associated blocking rules." ) return variables @@ -214,16 +216,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: return missing_variables -def interactions( - definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] -) -> list[InteractionType]: - var_d = {var.name: var for var in primary_variables} - +def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]: + field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)} interactions = [] - for definition in definitions: - if definition["type"] == "Interaction": - var = InteractionType(definition) - var.expandInteractions(var_d) + for var in variables: + if isinstance(var, InteractionType): + var.expandInteractions(field_vars) interactions.extend(var.higher_vars) return interactions From 5482c21b2f31b123da8e43a0d6416ffb551658fb Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 16 Sep 2022 15:41:26 -0800 Subject: [PATCH 11/11] ref: Move check for empty var def into typify_variables --- dedupe/datamodel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index c21e11616..786988d97 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -36,9 +36,6 @@ class DataModel(object): version = 1 def __init__(self, variable_definitions: Iterable[VariableDefinition]): - variable_definitions = list(variable_definitions) - if not variable_definitions: - raise ValueError("The variable definitions cannot be empty") variables = typify_variables(variable_definitions) non_interactions: list[FieldVariable] = [ v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] @@ -145,7 +142,13 @@ def __setstate__(self, d): self.__dict__ = d -def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: +def typify_variables( + variable_definitions: Iterable[VariableDefinition], +) -> list[Variable]: + variable_definitions = list(variable_definitions) + if not variable_definitions: + raise ValueError("The variable definitions cannot be empty") + variables: list[Variable] = [] for definition in variable_definitions: try: @@ -179,10 +182,8 @@ def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Var raise KeyError( f"Variable type {variable_type} not valid. Valid types include {valid}" ) - variable_object = variable_class(definition) assert isinstance(variable_object, Variable) - variables.append(variable_object) no_blocking_variables = all(