Skip to content

Commit

Permalink
get working and start building up tests
Browse files Browse the repository at this point in the history
  • Loading branch information
RossKen committed Feb 13, 2025
1 parent b7a155f commit 2c60d74
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 11 deletions.
8 changes: 6 additions & 2 deletions splink/internals/linker_components/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
_composite_unique_id_from_nodes_sql,
)
from splink.internals.vertically_concatenate import (
compute_df_concat,
concat_table_column_names,
enqueue_df_concat,
)
Expand Down Expand Up @@ -601,19 +602,22 @@ def cluster_pairwise_predictions_at_multiple_thresholds(
pipeline.enqueue_sql(sql, "__splink__clusters_at_all_thresholds")
joined = db_api.sql_pipeline_to_splink_dataframe(pipeline)

pipeline = CTEPipeline()
concat = compute_df_concat(linker, pipeline)

columns = concat_table_column_names(self._linker)
# don't want to include salting column in output if present
columns_without_salt = filter(lambda x: x != "__splink_salt", columns)

select_columns_sql = ", ".join(columns_without_salt)

pipeline = CTEPipeline([joined])
pipeline = CTEPipeline([joined, concat])
sql = f"""
select
co.*,
{select_columns_sql}
from {joined.physical_name} as co
left join __splink__df_concat
left join {concat.physical_name} as c
on co.node_id = {uid_concat_nodes}
"""
pipeline.enqueue_sql(
Expand Down
31 changes: 22 additions & 9 deletions tests/test_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,13 +185,21 @@ def test_clustering_single_multi_threshold_equivalence():

df_predict = linker.inference.predict()

clusters_0_5 = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.5).as_pandas_dataframe()
clusters_0_95 = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.95).as_pandas_dataframe()
clusters_0_5 = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.5).as_pandas_dataframe()
clusters_0_9 = linker.clustering.cluster_pairwise_predictions_at_threshold(df_predict, 0.9).as_pandas_dataframe()

clusters_multi = linker.clustering.cluster_pairwise_predictions_at_multiple_thresholds(df_predict, [0.5, 0.95]).as_pandas_dataframe()
clusters_multi = linker.clustering.cluster_pairwise_predictions_at_multiple_thresholds(df_predict, [0.5, 0.9]).as_pandas_dataframe()

assert clusters_0_5["cluster_id"] == clusters_multi["cluster_p_0_5"]
assert clusters_0_95["cluster_id"] == clusters_multi["cluster_p_0_95"]
df = pd.merge(clusters_0_5, clusters_multi, left_on='unique_id', right_on='unique_id', how='inner')

df["different"] = df["cluster_id"] != df["cluster_p_0_9"]
compare = df[["cluster_id", "cluster_p_0_9", "different"]]
df.sort_values(by='different', ascending=False, inplace=True)
print(compare[compare["different"]==True])
print(sum(compare["different"]))

assert clusters_0_5["cluster_id"].equals(clusters_multi["cluster_p_0_5"])
assert clusters_0_9["cluster_id"].equals(clusters_multi["cluster_p_0_9"])



Expand All @@ -213,9 +221,14 @@ def test_clustering_multi_threshold_linker_non_linker_equivalence():
node_id_column_name="unique_id",
edge_id_column_name_left="unique_id_l",
edge_id_column_name_right="unique_id_r",
db_api=linker.db_api,
db_api=linker._db_api,
match_probability_thresholds=[0.5, 0.95]
).as_pandas_dataframe()

assert clusters_linker["cluster_p_0_5"] == clusters_non_linker["cluster_p_0_5"]
assert clusters_linker["cluster_p_0_95"] == clusters_non_linker["cluster_p_0_95"]
df = pd.DataFrame({'linker': clusters_linker['cluster_p_0_5'], 'non-linker': clusters_non_linker['cluster_p_0_5']})
# df["different"] = df["linker"] != df["non-linker"]
# df.sort_values(by='different', ascending=False, inplace=True)
# print(df)
# print(sum(df["different"]))

#assert clusters_linker["cluster_p_0_5"].equals(clusters_non_linker["cluster_p_0_5"])
#assert clusters_linker["cluster_p_0_95"].equals(clusters_non_linker["cluster_p_0_95"])

0 comments on commit 2c60d74

Please sign in to comment.