-
Notifications
You must be signed in to change notification settings - Fork 550
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Prep DataModel for removal #1102
base: main
Are you sure you want to change the base?
Changes from 1 commit
22c54bb
9a34c00
a6504d5
4871eae
8dc4b80
442a65a
4eb1aa1
35b53cb
2fdbe6e
2d9a6a3
5482c21
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,7 +29,7 @@ | |
) | ||
from dedupe.predicates import Predicate | ||
|
||
VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} | ||
VARIABLE_CLASSES = {k: v for k, v in Variable.all_subclasses() if k} | ||
|
||
|
||
class DataModel(object): | ||
|
@@ -39,11 +39,16 @@ def __init__(self, variable_definitions: Iterable[VariableDefinition]): | |
variable_definitions = list(variable_definitions) | ||
if not variable_definitions: | ||
raise ValueError("The variable definitions cannot be empty") | ||
self.primary_variables = typify_variables(variable_definitions) | ||
all_variables = _expand_higher_variables(self.primary_variables) | ||
self._derived_start = len(all_variables) | ||
|
||
all_variables += interactions(variable_definitions, self.primary_variables) | ||
variables = typify_variables(variable_definitions) | ||
non_interactions: list[FieldVariable] = [ | ||
v for v in variables if not isinstance(v, InteractionType) # type: ignore[misc] | ||
] | ||
self.primary_variables = non_interactions | ||
expanded_primary = _expand_higher_variables(self.primary_variables) | ||
self._derived_start = len(expanded_primary) | ||
|
||
all_variables = expanded_primary.copy() | ||
all_variables += _expanded_interactions(variables) | ||
all_variables += missing(all_variables) | ||
|
||
self._missing_field_indices = missing_field_indices(all_variables) | ||
|
@@ -140,10 +145,8 @@ def __setstate__(self, d): | |
self.__dict__ = d | ||
|
||
|
||
def typify_variables( | ||
variable_definitions: list[VariableDefinition], | ||
) -> list[FieldVariable]: | ||
variables: list[FieldVariable] = [] | ||
def typify_variables(variable_definitions: list[VariableDefinition]) -> list[Variable]: | ||
variables: list[Variable] = [] | ||
for definition in variable_definitions: | ||
try: | ||
variable_type = definition["type"] | ||
|
@@ -162,9 +165,6 @@ def typify_variables( | |
"{'field' : 'Phone', type: 'String'}" | ||
) | ||
|
||
if variable_type == "Interaction": | ||
continue | ||
|
||
if variable_type == "FuzzyCategorical" and "other fields" not in definition: | ||
definition["other fields"] = [ # type: ignore | ||
d["field"] | ||
|
@@ -181,16 +181,18 @@ def typify_variables( | |
) | ||
|
||
variable_object = variable_class(definition) | ||
assert isinstance(variable_object, FieldVariable) | ||
assert isinstance(variable_object, Variable) | ||
|
||
variables.append(variable_object) | ||
|
||
only_custom = all(isinstance(v, CustomType) for v in variables) | ||
if only_custom: | ||
no_blocking_variables = all( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Continuing from #1098 (comment) I think it would be better if this was more duck-typed though, such as That would be a slight change though, since I think really there needs to be a more official API where variables declare the predicates they export. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the code that add the ExistsPredicate should only really be on FieldPredicates as it depends on there being a field. https://github.com/dedupeio/dedupe/blob/main/dedupe/variables/base.py#L37 if we move that code, then we could do pretty much what you are saying. no_blocking_variables = not any(len(v.predicates) for v in variables) we might also want to have a check like only_exists = all(isinstance(pred, ExistsPredicate) for v in variables for pred in v.predicates)
if only_exists:
warning.warn('You are only using the Exists Predicate which is usually pretty bad') |
||
isinstance(v, (CustomType, InteractionType)) for v in variables | ||
) | ||
if no_blocking_variables: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps this check should happen in DataModel.init, and not in this utility function? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it seems of a piece with the "Incorrect variable specification: variable" error? |
||
raise ValueError( | ||
"At least one of the variable types needs to be a type" | ||
"other than 'Custom'. 'Custom' types have no associated" | ||
"blocking rules" | ||
"At least one of the variable types needs to be a type " | ||
"other than 'Custom' or 'Interaction', " | ||
"since these types have no associated blocking rules." | ||
) | ||
|
||
return variables | ||
|
@@ -214,16 +216,12 @@ def missing(variables: list[Variable]) -> list[MissingDataType]: | |
return missing_variables | ||
|
||
|
||
def interactions( | ||
definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] | ||
) -> list[InteractionType]: | ||
var_d = {var.name: var for var in primary_variables} | ||
|
||
def _expanded_interactions(variables: list[Variable]) -> list[InteractionType]: | ||
field_vars = {var.name: var for var in variables if isinstance(var, FieldVariable)} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this should feels like it needs a more official API, or should be responsibility of the InteractionType There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i can think of a few designs here, but this code is actually doing two things for us, both of which are kind of important.
seems like we kind of need to do both. |
||
interactions = [] | ||
for definition in definitions: | ||
if definition["type"] == "Interaction": | ||
var = InteractionType(definition) | ||
var.expandInteractions(var_d) | ||
for var in variables: | ||
if isinstance(var, InteractionType): | ||
var.expandInteractions(field_vars) | ||
interactions.extend(var.higher_vars) | ||
return interactions | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is another case where I think a Variable should have a more official API for if they export if they are an Interaction or not. Maybe should think of a better name for this too. Atomic vs derived? leaf vs nonleaf? base vs derived?