From 63c10c8ad54db91c6596d05af7bc3e343db5b271 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Wed, 18 Dec 2024 12:08:40 +1100
Subject: [PATCH 01/15] initial implementation of one-to-one clustering

---
 .../internals/linker_components/clustering.py | 127 ++++++++++-
 splink/internals/one_to_one_clustering.py     | 211 ++++++++++++++++++
 2 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100755 splink/internals/one_to_one_clustering.py

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
index 2e0f8aed22..9eced023cf 100644
--- a/splink/internals/linker_components/clustering.py
+++ b/splink/internals/linker_components/clustering.py
@@ -1,10 +1,13 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional
 
 from splink.internals.connected_components import (
     solve_connected_components,
 )
+from splink.internals.one_to_one_clustering import (
+    one_to_one_clustering,
+)
 from splink.internals.edge_metrics import compute_edge_metrics
 from splink.internals.graph_metrics import (
     GraphMetricsResults,
@@ -177,6 +180,128 @@ def cluster_pairwise_predictions_at_threshold(
 
         return df_clustered_with_input_data
 
+
+    def cluster_single_best_links_at_threshold(
+        self,
+        df_predict: SplinkDataFrame,
+        source_datasets: List[str],
+        threshold_match_probability: Optional[float] = None,
+        threshold_match_weight: Optional[float] = None,
+    ) -> SplinkDataFrame:
+        """
+        Clusters the pairwise match predictions that result from
+        `linker.inference.predict()` into groups of connected records using a single
+        best links method that restricts the clusters to have at most one record from
+        each source dataset in the `source_datasets` list. 
+        """
+        linker = self._linker
+        db_api = linker._db_api
+
+        pipeline = CTEPipeline()
+
+        enqueue_df_concat(linker, pipeline)
+
+        uid_cols = linker._settings_obj.column_info_settings.unique_id_input_columns
+        uid_concat_edges_l = _composite_unique_id_from_edges_sql(uid_cols, "l")
+        uid_concat_edges_r = _composite_unique_id_from_edges_sql(uid_cols, "r")
+        uid_concat_nodes = _composite_unique_id_from_nodes_sql(uid_cols, None)
+
+        source_dataset_column_name = linker._settings_obj.column_info_settings.source_dataset_column_name
+
+        sql = f"""
+        select
+            {uid_concat_nodes} as node_id,
+            {source_dataset_column_name} as source_dataset
+            from __splink__df_concat
+        """
+        pipeline.enqueue_sql(sql, "__splink__df_nodes_with_composite_ids")
+
+        nodes_with_composite_ids = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+        has_match_prob_col = "match_probability" in [
+            c.unquote().name for c in df_predict.columns
+        ]
+
+        threshold_match_probability = threshold_args_to_match_prob(
+            threshold_match_probability, threshold_match_weight
+        )
+
+        if not has_match_prob_col and threshold_match_probability is not None:
+            raise ValueError(
+                "df_predict must have a column called 'match_probability' if "
+                "threshold_match_probability is provided"
+            )
+
+        match_p_expr = ""
+        match_p_select_expr = ""
+        if threshold_match_probability is not None:
+            match_p_expr = f"where match_probability >= {threshold_match_probability}"
+            match_p_select_expr = ", match_probability"
+
+        pipeline = CTEPipeline([df_predict])
+
+        # Templated name must be used here because it could be the output
+        # of a deterministic link i.e. the templated name is not known for sure
+        sql = f"""
+        select
+            {uid_concat_edges_l} as node_id_l,
+            {uid_concat_edges_r} as node_id_r
+            {match_p_select_expr}
+            from {df_predict.templated_name}
+            {match_p_expr}
+        """
+        pipeline.enqueue_sql(sql, "__splink__df_edges_from_predict")
+
+        edges_table_with_composite_ids = db_api.sql_pipeline_to_splink_dataframe(
+            pipeline
+        )
+
+        oo = one_to_one_clustering(
+            nodes_table=nodes_with_composite_ids,
+            edges_table=edges_table_with_composite_ids,
+            node_id_column_name="node_id",
+            source_dataset_column_name="source_dataset",
+            edge_id_column_name_left="node_id_l",
+            edge_id_column_name_right="node_id_r",
+            source_datasets=source_datasets,
+            db_api=db_api,
+            threshold_match_probability=threshold_match_probability,
+        )
+
+        edges_table_with_composite_ids.drop_table_from_database_and_remove_from_cache()
+        nodes_with_composite_ids.drop_table_from_database_and_remove_from_cache()
+        pipeline = CTEPipeline([oo])
+
+        enqueue_df_concat(linker, pipeline)
+
+        columns = concat_table_column_names(self._linker)
+        # don't want to include salting column in output if present
+        columns_without_salt = filter(lambda x: x != "__splink_salt", columns)
+
+        select_columns_sql = ", ".join(columns_without_salt)
+
+        sql = f"""
+        select
+            cc.cluster_id,
+            {select_columns_sql}
+        from __splink__clustering_output_final as cc
+        left join __splink__df_concat
+        on cc.node_id = {uid_concat_nodes}
+        """
+        pipeline.enqueue_sql(sql, "__splink__df_clustered_with_input_data")
+
+        df_clustered_with_input_data = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+        oo.drop_table_from_database_and_remove_from_cache()
+
+        if threshold_match_probability is not None:
+            df_clustered_with_input_data.metadata["threshold_match_probability"] = (
+                threshold_match_probability
+            )
+
+        return df_clustered_with_input_data
+
+
     def _compute_metrics_nodes(
         self,
         df_predict: SplinkDataFrame,
diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
new file mode 100755
index 0000000000..5397c7913d
--- /dev/null
+++ b/splink/internals/one_to_one_clustering.py
@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+import logging
+import time
+from typing import List, Optional
+
+from splink.internals.database_api import DatabaseAPISubClass
+from splink.internals.pipeline import CTEPipeline
+from splink.internals.splink_dataframe import SplinkDataFrame
+
+logger = logging.getLogger(__name__)
+
+def one_to_one_clustering(
+    nodes_table: SplinkDataFrame,
+    edges_table: SplinkDataFrame,
+    node_id_column_name: str,
+    source_dataset_column_name: str,
+    edge_id_column_name_left: str,
+    edge_id_column_name_right: str,
+    source_datasets: List[str],
+    db_api: DatabaseAPISubClass,
+    threshold_match_probability: Optional[float],
+) -> SplinkDataFrame:
+    """One to one clustering algorithm.
+
+    This function clusters together records so that at most one record from each dataset is in each cluster.
+
+    Args:
+        
+    Returns:
+        SplinkDataFrame: A dataframe containing the connected components list
+        for your link or dedupe job.
+
+    """
+
+    pipeline = CTEPipeline([edges_table])
+
+    match_prob_expr = f"where match_probability >= {threshold_match_probability}"
+    if threshold_match_probability is None:
+        match_prob_expr = ""
+
+    # Add 'reverse-edges' so that the algorithm can rank all incoming and outgoing edges
+    sql = f"""
+    select
+        {edge_id_column_name_left} as node_id,
+        {edge_id_column_name_right} as neighbour,
+        match_probability
+    from {edges_table.templated_name}
+    {match_prob_expr}
+
+    UNION ALL
+
+    select
+    {edge_id_column_name_right} as node_id,
+    {edge_id_column_name_left} as neighbour,
+    match_probability
+    from {edges_table.templated_name}
+    {match_prob_expr}
+    """
+    pipeline.enqueue_sql(sql, "__splink__df_neighbours")
+
+    neighbours = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    pipeline = CTEPipeline([nodes_table])
+
+    sql = f"""
+    select
+        {node_id_column_name} as node_id,
+        {node_id_column_name} as representative,
+        {source_dataset_column_name} as source_dataset
+    from {nodes_table.templated_name}
+    """
+
+    pipeline.enqueue_sql(sql, "__splink__df_representatives")
+
+    prev_representatives = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    iteration, needs_updating_count = 0, 1
+    while needs_updating_count > 0:
+        start_time = time.time()
+        iteration += 1
+
+        pipeline = CTEPipeline([neighbours, prev_representatives])
+
+        # might need to quote the value here? 
+        contains_expr = ", ".join([f"max(source_dataset == '{sd}') as contains_{sd}" for sd in source_datasets])
+        
+        sql = f"""
+        select
+            representative,
+            {contains_expr}
+        from {prev_representatives.physical_name}
+        group by representative
+        """
+
+        pipeline.enqueue_sql(sql, f"__splink__representative_contains_flags_{iteration}")
+
+        sql = f"""
+        select
+            r.node_id,
+            r.source_dataset,
+            cf.*
+        from {prev_representatives.physical_name} as r 
+        inner join __splink__representative_contains_flags_{iteration} as cf
+        on r.representative = cf.representative
+        """
+
+        pipeline.enqueue_sql(sql, f"__splink__df_representatives_with_flags_{iteration}")
+
+        duplicate_criteria = " or ".join([f"(l.contains_{sd} and r.contains_{sd})" for sd in source_datasets])
+    
+        # must be calculated every iteration since the where condition changes as the clustering progresses
+        sql = f"""
+        select 
+            neighbours.node_id,
+            neighbours.neighbour,
+            {duplicate_criteria} as duplicate_criteria,
+            row_number() over (partition by l.representative order by match_probability desc) as rank_l,
+            row_number() over (partition by r.representative order by match_probability desc) as rank_r,
+        from {neighbours.physical_name} as neighbours
+        inner join __splink__df_representatives_with_flags_{iteration} as l
+        on neighbours.node_id = l.node_id
+        inner join __splink__df_representatives_with_flags_{iteration} as r 
+        on neighbours.neighbour = r.node_id
+        where l.representative <> r.representative
+        """
+        
+        # note for the future: a strategy to handle ties would go right here. 
+        
+        pipeline.enqueue_sql(sql, f"__splink__df_ranked_{iteration}")
+    
+        sql = f"""
+        select
+            node_id,
+            neighbour
+        from __splink__df_ranked_{iteration} 
+        where rank_l = 1 and rank_r = 1 and not duplicate_criteria
+        """
+
+        pipeline.enqueue_sql(sql, f"__splink__df_neighbours_{iteration}")
+    
+        sql = f"""
+        select
+        source.node_id, 
+        min(source.representative) as representative
+        from
+        (
+            select
+                neighbours.node_id,
+                repr_neighbour.representative as representative,
+            from __splink__df_neighbours_{iteration} as neighbours
+            left join {prev_representatives.physical_name} as repr_neighbour
+            on neighbours.neighbour = repr_neighbour.node_id
+            
+            union all
+            
+            select
+                node_id,
+                representative
+            from {prev_representatives.physical_name}
+        ) AS source
+        group by source.node_id
+        """
+
+        pipeline.enqueue_sql(sql, f"r")
+    
+        sql = f"""
+        select
+            r.node_id,
+            r.representative,
+            repr.source_dataset,
+            r.representative <> repr.representative as needs_updating
+        from r
+        inner join {prev_representatives.physical_name} as repr
+        on r.node_id = repr.node_id
+        """
+
+        pipeline.enqueue_sql(sql, f"__splink__df_representatives_{iteration}")
+
+        representatives = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+        prev_representatives.drop_table_from_database_and_remove_from_cache()
+        prev_representatives = representatives
+
+        pipeline = CTEPipeline()
+
+        # assess if the exit condition has been met
+        sql = f"""
+        select 
+            count(*) as count_of_nodes_needing_updating
+        from {representatives.physical_name}
+        where needs_updating
+        """
+
+        pipeline.enqueue_sql(sql, "__splink__df_root_rows")
+
+        root_rows_df = db_api.sql_pipeline_to_splink_dataframe(
+            pipeline, use_cache=False
+        )
+
+        root_rows = root_rows_df.as_record_dict()
+        root_rows_df.drop_table_from_database_and_remove_from_cache()
+        needs_updating_count = root_rows[0]["count_of_nodes_needing_updating"]
+        logger.info(
+            f"Completed iteration {iteration}, "
+            f"num representatives needing updating: {needs_updating_count}"
+        )
+        end_time = time.time()
+        logger.log(15, f"    Iteration time: {end_time - start_time} seconds")
+
+    return representatives

From 4291b376a11efc6e4994c1bfbfaa195045702ca0 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Wed, 18 Dec 2024 15:01:08 +1100
Subject: [PATCH 02/15] added a test for single_best_links and fixed some
 issues

---
 .../internals/linker_components/clustering.py |  6 +-
 splink/internals/one_to_one_clustering.py     | 16 ++++-
 ..._cluster_single_best_links_at_threshold.py | 58 +++++++++++++++++++
 3 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_cluster_single_best_links_at_threshold.py

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
index 9eced023cf..2938e6652b 100644
--- a/splink/internals/linker_components/clustering.py
+++ b/splink/internals/linker_components/clustering.py
@@ -282,11 +282,11 @@ def cluster_single_best_links_at_threshold(
 
         sql = f"""
         select
-            cc.cluster_id,
+            oo.cluster_id,
             {select_columns_sql}
-        from __splink__clustering_output_final as cc
+        from {oo.templated_name} as oo
         left join __splink__df_concat
-        on cc.node_id = {uid_concat_nodes}
+        on oo.node_id = {uid_concat_nodes}
         """
         pipeline.enqueue_sql(sql, "__splink__df_clustered_with_input_data")
 
diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index 5397c7913d..0a2f0a23ab 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -208,4 +208,18 @@ def one_to_one_clustering(
         end_time = time.time()
         logger.log(15, f"    Iteration time: {end_time - start_time} seconds")
 
-    return representatives
+    pipeline = CTEPipeline()
+
+    sql = f"""
+    select node_id as {node_id_column_name}, representative as cluster_id 
+    from {representatives.physical_name}
+    """ 
+
+    pipeline.enqueue_sql(sql, "__splink__clustering_output_final")
+
+    final_result = db_api.sql_pipeline_to_splink_dataframe(pipeline)
+
+    representatives.drop_table_from_database_and_remove_from_cache()
+    neighbours.drop_table_from_database_and_remove_from_cache()
+
+    return final_result
\ No newline at end of file
diff --git a/tests/test_cluster_single_best_links_at_threshold.py b/tests/test_cluster_single_best_links_at_threshold.py
new file mode 100644
index 0000000000..08e4faa72e
--- /dev/null
+++ b/tests/test_cluster_single_best_links_at_threshold.py
@@ -0,0 +1,58 @@
+import pandas as pd
+import pytest
+from pytest import mark
+
+import splink.comparison_library as cl
+from splink import DuckDBAPI, Linker, SettingsCreator, block_on
+
+from .basic_settings import get_settings_dict
+from .decorator import mark_with_dialects_excluding
+
+from splink import DuckDBAPI, Linker
+
+@mark_with_dialects_excluding()
+def test_single_best_links_correctness(test_helpers, dialect):
+    helper = test_helpers[dialect]
+
+    df = pd.DataFrame({
+        "unique_id":      [  0,  1,   2,   3,   4,   5,   6,   7,   8],
+        "source_dataset": ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
+    })
+
+    predictions = pd.DataFrame({
+        "unique_id_l":       [  0,   1,   3,   4,   6,   6],
+        "unique_id_r":       [  1,   2,   5,   5,   5,   7],
+        "source_dataset_l":  ['a', 'b', 'a', 'b', 'a', 'a'],
+        "source_dataset_r":  ['b', 'c', 'c', 'c', 'c', 'b'],
+        "match_probability": [.90, .70, .85, .90, .80, .70],
+    })
+
+    settings = SettingsCreator(
+        link_type="link_only",
+        comparisons=[],
+        blocking_rules_to_generate_predictions=[],
+    )
+
+    linker = Linker(df, settings, **helper.extra_linker_args())
+
+    df_predict = linker.table_management.register_table_predict(predictions, overwrite=True)
+
+    df_clusters = linker.clustering.cluster_single_best_links_at_threshold(
+        df_predict, 
+        source_datasets=["a", "b", "c"], 
+        threshold_match_probability=0.5
+    )
+
+    result = df_clusters.as_pandas_dataframe().sort_values("unique_id")
+    result = result.reset_index(drop=True)
+
+    correct_result = pd.DataFrame({
+        "cluster_id": ["a-__-0", "a-__-0", "a-__-0", "a-__-3", "a-__-3", "a-__-3", "a-__-6", "b-__-7", "c-__-8"],
+        "unique_id":      [  0,  1,   2,   3,   4,   5,   6,   7,   8],
+        "source_dataset": ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
+    })
+    correct_result = correct_result.sort_values("unique_id")
+    correct_result = correct_result.reset_index(drop=True)
+
+    pd.testing.assert_frame_equal(result, correct_result)
+

From 85fa9e5b2adab47f55a77cbe60f0b11238a709d3 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 11:39:43 +1100
Subject: [PATCH 03/15] added new test for single best links clustering

---
 ..._cluster_single_best_links_at_threshold.py | 160 +++++++++++++++---
 1 file changed, 134 insertions(+), 26 deletions(-)

diff --git a/tests/test_cluster_single_best_links_at_threshold.py b/tests/test_cluster_single_best_links_at_threshold.py
index 08e4faa72e..da6bc52810 100644
--- a/tests/test_cluster_single_best_links_at_threshold.py
+++ b/tests/test_cluster_single_best_links_at_threshold.py
@@ -1,31 +1,31 @@
 import pandas as pd
-import pytest
-from pytest import mark
 
 import splink.comparison_library as cl
-from splink import DuckDBAPI, Linker, SettingsCreator, block_on
+from splink import Linker, SettingsCreator, block_on
 
-from .basic_settings import get_settings_dict
 from .decorator import mark_with_dialects_excluding
 
-from splink import DuckDBAPI, Linker
 
 @mark_with_dialects_excluding()
 def test_single_best_links_correctness(test_helpers, dialect):
     helper = test_helpers[dialect]
 
-    df = pd.DataFrame({
-        "unique_id":      [  0,  1,   2,   3,   4,   5,   6,   7,   8],
-        "source_dataset": ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
-    })
+    df = pd.DataFrame(
+        {
+            "unique_id": [0, 1, 2, 3, 4, 5, 6, 7, 8],
+            "source_dataset": ["a", "b", "c", "a", "b", "c", "a", "b", "c"],
+        }
+    )
 
-    predictions = pd.DataFrame({
-        "unique_id_l":       [  0,   1,   3,   4,   6,   6],
-        "unique_id_r":       [  1,   2,   5,   5,   5,   7],
-        "source_dataset_l":  ['a', 'b', 'a', 'b', 'a', 'a'],
-        "source_dataset_r":  ['b', 'c', 'c', 'c', 'c', 'b'],
-        "match_probability": [.90, .70, .85, .90, .80, .70],
-    })
+    predictions = pd.DataFrame(
+        {
+            "unique_id_l": [0, 1, 3, 4, 6, 6],
+            "unique_id_r": [1, 2, 5, 5, 5, 7],
+            "source_dataset_l": ["a", "b", "a", "b", "a", "a"],
+            "source_dataset_r": ["b", "c", "c", "c", "c", "b"],
+            "match_probability": [0.90, 0.70, 0.85, 0.90, 0.80, 0.70],
+        }
+    )
 
     settings = SettingsCreator(
         link_type="link_only",
@@ -35,24 +35,132 @@ def test_single_best_links_correctness(test_helpers, dialect):
 
     linker = Linker(df, settings, **helper.extra_linker_args())
 
-    df_predict = linker.table_management.register_table_predict(predictions, overwrite=True)
+    df_predict = linker.table_management.register_table_predict(
+        predictions, overwrite=True
+    )
 
-    df_clusters = linker.clustering.cluster_single_best_links_at_threshold(
-        df_predict, 
-        source_datasets=["a", "b", "c"], 
-        threshold_match_probability=0.5
+    df_clusters = linker.clustering.cluster_using_single_best_links(
+        df_predict, source_datasets=["a", "b", "c"], threshold_match_probability=0.5
     )
 
     result = df_clusters.as_pandas_dataframe().sort_values("unique_id")
     result = result.reset_index(drop=True)
 
-    correct_result = pd.DataFrame({
-        "cluster_id": ["a-__-0", "a-__-0", "a-__-0", "a-__-3", "a-__-3", "a-__-3", "a-__-6", "b-__-7", "c-__-8"],
-        "unique_id":      [  0,  1,   2,   3,   4,   5,   6,   7,   8],
-        "source_dataset": ['a', 'b', 'c', 'a', 'b', 'c', 'a', 'b', 'c'],
-    })
+    correct_result = pd.DataFrame(
+        {
+            "cluster_id": [
+                "a-__-0",
+                "a-__-0",
+                "a-__-0",
+                "a-__-3",
+                "a-__-3",
+                "a-__-3",
+                "a-__-6",
+                "b-__-7",
+                "c-__-8",
+            ],
+            "unique_id": [0, 1, 2, 3, 4, 5, 6, 7, 8],
+            "source_dataset": ["a", "b", "c", "a", "b", "c", "a", "b", "c"],
+        }
+    )
     correct_result = correct_result.sort_values("unique_id")
     correct_result = correct_result.reset_index(drop=True)
 
     pd.testing.assert_frame_equal(result, correct_result)
 
+
+@mark_with_dialects_excluding()
+def test_single_best_links_ties(test_helpers, dialect):
+    helper = test_helpers[dialect]
+
+    df = pd.DataFrame(
+        {
+            "unique_id": [0, 1, 2],
+            "source_dataset": ["a", "a", "b"],
+        }
+    )
+
+    predictions = pd.DataFrame(
+        {
+            "unique_id_l": [0, 1],
+            "unique_id_r": [2, 2],
+            "source_dataset_l": ["a", "a"],
+            "source_dataset_r": ["b", "b"],
+            "match_probability": [0.90, 0.90],
+        }
+    )
+
+    settings = SettingsCreator(
+        link_type="link_only",
+        comparisons=[],
+        blocking_rules_to_generate_predictions=[],
+    )
+
+    linker = Linker(df, settings, **helper.extra_linker_args())
+
+    df_predict = linker.table_management.register_table_predict(
+        predictions, overwrite=True
+    )
+
+    df_clusters = linker.clustering.cluster_using_single_best_links(
+        df_predict, source_datasets=["a", "b"], threshold_match_probability=0.5
+    )
+
+    result = df_clusters.as_pandas_dataframe()
+    n_clusters = result["cluster_id"].nunique()
+
+    assert n_clusters > 1
+
+
+@mark_with_dialects_excluding()
+def test_single_best_links_one_to_one(test_helpers, dialect):
+    df = pd.read_csv("./tests/datasets/fake_1000_from_splink_demos.csv")
+    df_l = df.copy()
+    df_r = df.copy()
+    df_l["source_dataset"] = "a"
+    df_r["source_dataset"] = "b"
+
+    helper = test_helpers[dialect]
+
+    settings = SettingsCreator(
+        link_type="link_only",
+        comparisons=[
+            cl.ExactMatch("first_name"),
+            cl.ExactMatch("surname"),
+            cl.ExactMatch("dob"),
+            cl.ExactMatch("city"),
+        ],
+        blocking_rules_to_generate_predictions=[
+            block_on("surname"),
+            block_on("dob"),
+        ],
+    )
+
+    linker = Linker([df_l, df_r], settings, **helper.extra_linker_args())
+
+    linker.training.estimate_u_using_random_sampling(1e6)
+
+    df_predict = linker.inference.predict(0.5)
+
+    df_clusters = linker.clustering.cluster_using_single_best_links(
+        df_predict, source_datasets=["a", "b"], threshold_match_probability=0.5
+    )
+
+    result = linker.misc.query_sql(
+        f"""
+        with t as (
+            select
+                cluster_id,
+                sum(cast(source_dataset = 'a' as int)) as count_a,
+                sum(cast(source_dataset = 'b' as int)) as count_b
+            from {df_clusters.physical_name}
+            group by cluster_id
+        )
+        select count(*) as count
+        from t
+        where count_a > 1 or count_b > 1
+        """
+    )
+
+    count = result["count"][0]
+    assert count == 0

From be4b6802931a1c0dc3af09b794111a71f24596a9 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 12:00:47 +1100
Subject: [PATCH 04/15] formatting and updating docstrings

---
 .../internals/linker_components/clustering.py | 45 ++++++++--
 splink/internals/one_to_one_clustering.py     | 85 +++++++++++--------
 2 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
index 2938e6652b..2c0a1663ca 100644
--- a/splink/internals/linker_components/clustering.py
+++ b/splink/internals/linker_components/clustering.py
@@ -5,9 +5,6 @@
 from splink.internals.connected_components import (
     solve_connected_components,
 )
-from splink.internals.one_to_one_clustering import (
-    one_to_one_clustering,
-)
 from splink.internals.edge_metrics import compute_edge_metrics
 from splink.internals.graph_metrics import (
     GraphMetricsResults,
@@ -17,6 +14,9 @@
 from splink.internals.misc import (
     threshold_args_to_match_prob,
 )
+from splink.internals.one_to_one_clustering import (
+    one_to_one_clustering,
+)
 from splink.internals.pipeline import CTEPipeline
 from splink.internals.splink_dataframe import SplinkDataFrame
 from splink.internals.unique_id_concat import (
@@ -181,7 +181,7 @@ def cluster_pairwise_predictions_at_threshold(
         return df_clustered_with_input_data
 
 
-    def cluster_single_best_links_at_threshold(
+    def cluster_using_single_best_links(
         self,
         df_predict: SplinkDataFrame,
         source_datasets: List[str],
@@ -192,7 +192,37 @@ def cluster_single_best_links_at_threshold(
         Clusters the pairwise match predictions that result from
         `linker.inference.predict()` into groups of connected records using a single
         best links method that restricts the clusters to have at most one record from
-        each source dataset in the `source_datasets` list. 
+        each source dataset in the `source_datasets` list.
+
+        This method will include a record into a cluster if it is mutually the best
+        match for the record and for the cluster, and if adding the record will not
+        violate the criteria of having at most one record from each of the
+        `source_datasets`.
+
+        Args:
+            df_predict (SplinkDataFrame): The results of `linker.inference.predict()`
+            source_datasets (List[str]): The source datasets which should be treated
+                as having no duplicates. Clusters will not form with more than
+                one record from each of these datasets. This can be a subset of all of
+                the source datasets in the input data.
+            threshold_match_probability (float, optional): Pairwise comparisons with a
+                `match_probability` at or above this threshold are matched
+            threshold_match_weight (float, optional): Pairwise comparisons with a
+                `match_weight` at or above this threshold are matched. Only one of
+                threshold_match_probability or threshold_match_weight should be provided
+
+        Returns:
+            SplinkDataFrame: A SplinkDataFrame containing a list of all IDs, clustered
+                into groups based on the desired match threshold and the source datasets
+                for which duplicates are not allowed.
+
+        Examples:
+            ```python
+            df_predict = linker.inference.predict(threshold_match_probability=0.5)
+            df_clustered = linker.clustering.cluster_using_single_best_links(
+                df_predict, source_datasets=["A", "B"], threshold_match_probability=0.95
+            )
+            ```
         """
         linker = self._linker
         db_api = linker._db_api
@@ -206,7 +236,9 @@ def cluster_single_best_links_at_threshold(
         uid_concat_edges_r = _composite_unique_id_from_edges_sql(uid_cols, "r")
         uid_concat_nodes = _composite_unique_id_from_nodes_sql(uid_cols, None)
 
-        source_dataset_column_name = linker._settings_obj.column_info_settings.source_dataset_column_name
+        source_dataset_column_name = (
+            linker._settings_obj.column_info_settings.source_dataset_column_name
+        )
 
         sql = f"""
         select
@@ -301,7 +333,6 @@ def cluster_single_best_links_at_threshold(
 
         return df_clustered_with_input_data
 
-
     def _compute_metrics_nodes(
         self,
         df_predict: SplinkDataFrame,
diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index 0a2f0a23ab..58ae4f7eb6 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -10,6 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def one_to_one_clustering(
     nodes_table: SplinkDataFrame,
     edges_table: SplinkDataFrame,
@@ -23,13 +24,8 @@ def one_to_one_clustering(
 ) -> SplinkDataFrame:
     """One to one clustering algorithm.
 
-    This function clusters together records so that at most one record from each dataset is in each cluster.
-
-    Args:
-        
-    Returns:
-        SplinkDataFrame: A dataframe containing the connected components list
-        for your link or dedupe job.
+    This function clusters together records so that at most one record from each
+    dataset is in each cluster.
 
     """
 
@@ -39,7 +35,8 @@ def one_to_one_clustering(
     if threshold_match_probability is None:
         match_prob_expr = ""
 
-    # Add 'reverse-edges' so that the algorithm can rank all incoming and outgoing edges
+    # Add 'reverse-edges' so that the algorithm can rank all incoming and outgoing
+    # edges
     sql = f"""
     select
         {edge_id_column_name_left} as node_id,
@@ -82,9 +79,14 @@ def one_to_one_clustering(
 
         pipeline = CTEPipeline([neighbours, prev_representatives])
 
-        # might need to quote the value here? 
-        contains_expr = ", ".join([f"max(source_dataset == '{sd}') as contains_{sd}" for sd in source_datasets])
-        
+        # might need to quote the value here?
+        contains_expr = ", ".join(
+            [
+                f"max(source_dataset == '{sd}') as contains_{sd}"
+                for sd in source_datasets
+            ]
+        )
+
         sql = f"""
         select
             representative,
@@ -93,55 +95,66 @@ def one_to_one_clustering(
         group by representative
         """
 
-        pipeline.enqueue_sql(sql, f"__splink__representative_contains_flags_{iteration}")
+        pipeline.enqueue_sql(
+            sql, f"__splink__representative_contains_flags_{iteration}"
+        )
 
         sql = f"""
         select
             r.node_id,
             r.source_dataset,
             cf.*
-        from {prev_representatives.physical_name} as r 
+        from {prev_representatives.physical_name} as r
         inner join __splink__representative_contains_flags_{iteration} as cf
         on r.representative = cf.representative
         """
 
-        pipeline.enqueue_sql(sql, f"__splink__df_representatives_with_flags_{iteration}")
+        pipeline.enqueue_sql(
+            sql, f"__splink__df_representatives_with_flags_{iteration}"
+        )
+
+        duplicate_criteria = " or ".join(
+            [f"(l.contains_{sd} and r.contains_{sd})" for sd in source_datasets]
+        )
 
-        duplicate_criteria = " or ".join([f"(l.contains_{sd} and r.contains_{sd})" for sd in source_datasets])
-    
-        # must be calculated every iteration since the where condition changes as the clustering progresses
+        # must be calculated every iteration since the where condition changes as
+        # the clustering progresses
         sql = f"""
-        select 
+        select
             neighbours.node_id,
             neighbours.neighbour,
             {duplicate_criteria} as duplicate_criteria,
-            row_number() over (partition by l.representative order by match_probability desc) as rank_l,
-            row_number() over (partition by r.representative order by match_probability desc) as rank_r,
+            row_number() over (
+                partition by l.representative order by match_probability desc
+            ) as rank_l,
+            row_number() over (
+                partition by r.representative order by match_probability desc
+            ) as rank_r,
         from {neighbours.physical_name} as neighbours
         inner join __splink__df_representatives_with_flags_{iteration} as l
         on neighbours.node_id = l.node_id
-        inner join __splink__df_representatives_with_flags_{iteration} as r 
+        inner join __splink__df_representatives_with_flags_{iteration} as r
         on neighbours.neighbour = r.node_id
         where l.representative <> r.representative
         """
-        
-        # note for the future: a strategy to handle ties would go right here. 
-        
+
+        # note for the future: a strategy to handle ties would go right here.
+
         pipeline.enqueue_sql(sql, f"__splink__df_ranked_{iteration}")
-    
+
         sql = f"""
         select
             node_id,
             neighbour
-        from __splink__df_ranked_{iteration} 
+        from __splink__df_ranked_{iteration}
         where rank_l = 1 and rank_r = 1 and not duplicate_criteria
         """
 
         pipeline.enqueue_sql(sql, f"__splink__df_neighbours_{iteration}")
-    
+
         sql = f"""
         select
-        source.node_id, 
+        source.node_id,
         min(source.representative) as representative
         from
         (
@@ -151,9 +164,9 @@ def one_to_one_clustering(
             from __splink__df_neighbours_{iteration} as neighbours
             left join {prev_representatives.physical_name} as repr_neighbour
             on neighbours.neighbour = repr_neighbour.node_id
-            
+
             union all
-            
+
             select
                 node_id,
                 representative
@@ -162,8 +175,8 @@ def one_to_one_clustering(
         group by source.node_id
         """
 
-        pipeline.enqueue_sql(sql, f"r")
-    
+        pipeline.enqueue_sql(sql, "r")
+
         sql = f"""
         select
             r.node_id,
@@ -186,7 +199,7 @@ def one_to_one_clustering(
 
         # assess if the exit condition has been met
         sql = f"""
-        select 
+        select
             count(*) as count_of_nodes_needing_updating
         from {representatives.physical_name}
         where needs_updating
@@ -211,9 +224,9 @@ def one_to_one_clustering(
     pipeline = CTEPipeline()
 
     sql = f"""
-    select node_id as {node_id_column_name}, representative as cluster_id 
+    select node_id as {node_id_column_name}, representative as cluster_id
     from {representatives.physical_name}
-    """ 
+    """
 
     pipeline.enqueue_sql(sql, "__splink__clustering_output_final")
 
@@ -222,4 +235,4 @@ def one_to_one_clustering(
     representatives.drop_table_from_database_and_remove_from_cache()
     neighbours.drop_table_from_database_and_remove_from_cache()
 
-    return final_result
\ No newline at end of file
+    return final_result

From 01888fc928a464f8fa4dc0801fc8bbf6276fc67a Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 13:17:40 +1100
Subject: [PATCH 05/15] renamed test file for consistency

---
 ...ks_at_threshold.py => test_cluster_using_single_best_links.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/{test_cluster_single_best_links_at_threshold.py => test_cluster_using_single_best_links.py} (100%)

diff --git a/tests/test_cluster_single_best_links_at_threshold.py b/tests/test_cluster_using_single_best_links.py
similarity index 100%
rename from tests/test_cluster_single_best_links_at_threshold.py
rename to tests/test_cluster_using_single_best_links.py

From a67d2eb7a8574ea01ff11d0e7dffb6c091da1486 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 14:39:57 +1100
Subject: [PATCH 06/15] more formatting

---
 splink/internals/linker_components/clustering.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
index 2c0a1663ca..240e436d80 100644
--- a/splink/internals/linker_components/clustering.py
+++ b/splink/internals/linker_components/clustering.py
@@ -180,7 +180,6 @@ def cluster_pairwise_predictions_at_threshold(
 
         return df_clustered_with_input_data
 
-
     def cluster_using_single_best_links(
         self,
         df_predict: SplinkDataFrame,

From b9461e0dacaf857308738ebe60f651c0f841fca4 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 16:01:48 +1100
Subject: [PATCH 07/15] removed stray commas, should fix sqlite error

---
 splink/internals/one_to_one_clustering.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index 58ae4f7eb6..fb479bf3d4 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -129,7 +129,7 @@ def one_to_one_clustering(
             ) as rank_l,
             row_number() over (
                 partition by r.representative order by match_probability desc
-            ) as rank_r,
+            ) as rank_r
         from {neighbours.physical_name} as neighbours
         inner join __splink__df_representatives_with_flags_{iteration} as l
         on neighbours.node_id = l.node_id
@@ -160,7 +160,7 @@ def one_to_one_clustering(
         (
             select
                 neighbours.node_id,
-                repr_neighbour.representative as representative,
+                repr_neighbour.representative as representative
             from __splink__df_neighbours_{iteration} as neighbours
             left join {prev_representatives.physical_name} as repr_neighbour
             on neighbours.neighbour = repr_neighbour.node_id

From 1ad74ec01ecdd504e72673d59cdba604009943bf Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 16:12:55 +1100
Subject: [PATCH 08/15] swap == for = for postgres compatibility

---
 splink/internals/one_to_one_clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index fb479bf3d4..09951ac4de 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -82,7 +82,7 @@ def one_to_one_clustering(
         # might need to quote the value here?
         contains_expr = ", ".join(
             [
-                f"max(source_dataset == '{sd}') as contains_{sd}"
+                f"max(source_dataset = '{sd}') as contains_{sd}"
                 for sd in source_datasets
             ]
         )

From dfc2a2595444c7ab7a4928f233a59c9c5189f7bd Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Mon, 6 Jan 2025 16:33:51 +1100
Subject: [PATCH 09/15] cast contains flags to int rather than boolean

---
 splink/internals/one_to_one_clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index 09951ac4de..f5361f6dc7 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -82,7 +82,7 @@ def one_to_one_clustering(
         # might need to quote the value here?
         contains_expr = ", ".join(
             [
-                f"max(source_dataset = '{sd}') as contains_{sd}"
+                f"max(cast(source_dataset = '{sd}' as int)) as contains_{sd}"
                 for sd in source_datasets
             ]
         )

From 5ccb062a7d5d5013d8fb2790d8192699f910f49f Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Tue, 7 Jan 2025 09:03:09 +1100
Subject: [PATCH 10/15] fix for postgres backend

---
 splink/internals/one_to_one_clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index f5361f6dc7..c950e1f8a3 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -82,7 +82,7 @@ def one_to_one_clustering(
         # might need to quote the value here?
         contains_expr = ", ".join(
             [
-                f"max(cast(source_dataset = '{sd}' as int)) as contains_{sd}"
+                f"max(cast(source_dataset = '{sd}' as int)) > 0 as contains_{sd}"
                 for sd in source_datasets
             ]
         )

From a659b96bd47282bf47408b3f70ff7ee95a50b405 Mon Sep 17 00:00:00 2001
From: aymonwuolanne <a.wuolanne@gmail.com>
Date: Fri, 24 Jan 2025 09:24:37 +1100
Subject: [PATCH 11/15] renamed source_datasets variable, and switched the
 order of filtering duplicates and ranking

---
 splink/internals/linker_components/clustering.py | 16 +++++++++-------
 splink/internals/one_to_one_clustering.py        | 11 +++++------
 tests/test_cluster_using_single_best_links.py    | 12 ++++++++----
 3 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
index 240e436d80..715695988c 100644
--- a/splink/internals/linker_components/clustering.py
+++ b/splink/internals/linker_components/clustering.py
@@ -183,7 +183,7 @@ def cluster_pairwise_predictions_at_threshold(
     def cluster_using_single_best_links(
         self,
         df_predict: SplinkDataFrame,
-        source_datasets: List[str],
+        duplicate_free_datasets: List[str],
         threshold_match_probability: Optional[float] = None,
         threshold_match_weight: Optional[float] = None,
     ) -> SplinkDataFrame:
@@ -191,17 +191,17 @@ def cluster_using_single_best_links(
         Clusters the pairwise match predictions that result from
         `linker.inference.predict()` into groups of connected records using a single
         best links method that restricts the clusters to have at most one record from
-        each source dataset in the `source_datasets` list.
+        each source dataset in the `duplicate_free_datasets` list.
 
         This method will include a record into a cluster if it is mutually the best
         match for the record and for the cluster, and if adding the record will not
         violate the criteria of having at most one record from each of the
-        `source_datasets`.
+        `duplicate_free_datasets`.
 
         Args:
             df_predict (SplinkDataFrame): The results of `linker.predict()`
-            source_datasets (List[str]): The source datasets which should be treated
-                as having no duplicates. Clusters will not form with more than
+            duplicate_free_datasets (List[str]): The source datasets which should be
+                treated as having no duplicates. Clusters will not form with more than
                 one record from each of these datasets. This can be a subset of all of
                 the source datasets in the input data.
             threshold_match_probability (float, optional): Pairwise comparisons with a
@@ -219,7 +219,9 @@ def cluster_using_single_best_links(
             ```python
             df_predict = linker.inference.predict(threshold_match_probability=0.5)
             df_clustered = linker.clustering.cluster_pairwise_predictions_at_threshold(
-                df_predict, source_datasets=["A", "B"], threshold_match_probability=0.95
+                df_predict,
+                duplicate_free_datasets=["A", "B"],
+                threshold_match_probability=0.95
             )
             ```
         """
@@ -294,7 +296,7 @@ def cluster_using_single_best_links(
             source_dataset_column_name="source_dataset",
             edge_id_column_name_left="node_id_l",
             edge_id_column_name_right="node_id_r",
-            source_datasets=source_datasets,
+            duplicate_free_datasets=duplicate_free_datasets,
             db_api=db_api,
             threshold_match_probability=threshold_match_probability,
         )
diff --git a/splink/internals/one_to_one_clustering.py b/splink/internals/one_to_one_clustering.py
index c950e1f8a3..2645a03ea2 100755
--- a/splink/internals/one_to_one_clustering.py
+++ b/splink/internals/one_to_one_clustering.py
@@ -18,7 +18,7 @@ def one_to_one_clustering(
     source_dataset_column_name: str,
     edge_id_column_name_left: str,
     edge_id_column_name_right: str,
-    source_datasets: List[str],
+    duplicate_free_datasets: List[str],
     db_api: DatabaseAPISubClass,
     threshold_match_probability: Optional[float],
 ) -> SplinkDataFrame:
@@ -83,7 +83,7 @@ def one_to_one_clustering(
         contains_expr = ", ".join(
             [
                 f"max(cast(source_dataset = '{sd}' as int)) > 0 as contains_{sd}"
-                for sd in source_datasets
+                for sd in duplicate_free_datasets
             ]
         )
 
@@ -114,7 +114,7 @@ def one_to_one_clustering(
         )
 
         duplicate_criteria = " or ".join(
-            [f"(l.contains_{sd} and r.contains_{sd})" for sd in source_datasets]
+            [f"(l.contains_{sd} and r.contains_{sd})" for sd in duplicate_free_datasets]
         )
 
         # must be calculated every iteration since the where condition changes as
@@ -123,7 +123,6 @@ def one_to_one_clustering(
         select
             neighbours.node_id,
             neighbours.neighbour,
-            {duplicate_criteria} as duplicate_criteria,
             row_number() over (
                 partition by l.representative order by match_probability desc
             ) as rank_l,
@@ -135,7 +134,7 @@ def one_to_one_clustering(
         on neighbours.node_id = l.node_id
         inner join __splink__df_representatives_with_flags_{iteration} as r
         on neighbours.neighbour = r.node_id
-        where l.representative <> r.representative
+        where l.representative <> r.representative and not ({duplicate_criteria})
         """
 
         # note for the future: a strategy to handle ties would go right here.
@@ -147,7 +146,7 @@ def one_to_one_clustering(
             node_id,
             neighbour
         from __splink__df_ranked_{iteration}
-        where rank_l = 1 and rank_r = 1 and not duplicate_criteria
+        where rank_l = 1 and rank_r = 1
         """
 
         pipeline.enqueue_sql(sql, f"__splink__df_neighbours_{iteration}")
diff --git a/tests/test_cluster_using_single_best_links.py b/tests/test_cluster_using_single_best_links.py
index da6bc52810..d41b5b64a4 100644
--- a/tests/test_cluster_using_single_best_links.py
+++ b/tests/test_cluster_using_single_best_links.py
@@ -40,7 +40,9 @@ def test_single_best_links_correctness(test_helpers, dialect):
     )
 
     df_clusters = linker.clustering.cluster_using_single_best_links(
-        df_predict, source_datasets=["a", "b", "c"], threshold_match_probability=0.5
+        df_predict,
+        duplicate_free_datasets=["a", "b", "c"],
+        threshold_match_probability=0.5,
     )
 
     result = df_clusters.as_pandas_dataframe().sort_values("unique_id")
@@ -56,7 +58,7 @@ def test_single_best_links_correctness(test_helpers, dialect):
                 "a-__-3",
                 "a-__-3",
                 "a-__-6",
-                "b-__-7",
+                "a-__-6",
                 "c-__-8",
             ],
             "unique_id": [0, 1, 2, 3, 4, 5, 6, 7, 8],
@@ -103,7 +105,7 @@ def test_single_best_links_ties(test_helpers, dialect):
     )
 
     df_clusters = linker.clustering.cluster_using_single_best_links(
-        df_predict, source_datasets=["a", "b"], threshold_match_probability=0.5
+        df_predict, duplicate_free_datasets=["a", "b"], threshold_match_probability=0.5
     )
 
     result = df_clusters.as_pandas_dataframe()
@@ -143,7 +145,9 @@ def test_single_best_links_one_to_one(test_helpers, dialect):
     df_predict = linker.inference.predict(0.5)
 
     df_clusters = linker.clustering.cluster_using_single_best_links(
-        df_predict, source_datasets=["a", "b"], threshold_match_probability=0.5
+        df_predict,
+        duplicate_free_datasets=["a", "b"],
+        threshold_match_probability=0.5,
     )
 
     result = linker.misc.query_sql(

From b4b49d12576fd9bf5db01434d7621297fc4f6142 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robinlinacre@hotmail.com>
Date: Fri, 31 Jan 2025 10:18:38 +0000
Subject: [PATCH 12/15] add additional one to one tests

---
 tests/test_cluster_using_single_best_links.py | 134 +++++++++++++++++-
 1 file changed, 133 insertions(+), 1 deletion(-)

diff --git a/tests/test_cluster_using_single_best_links.py b/tests/test_cluster_using_single_best_links.py
index d41b5b64a4..7131cf45a2 100644
--- a/tests/test_cluster_using_single_best_links.py
+++ b/tests/test_cluster_using_single_best_links.py
@@ -6,8 +6,10 @@
 from .decorator import mark_with_dialects_excluding
 
 
+# See https://www.robinlinacre.com/graphPlayground/ with this data:
+# https://gist.github.com/RobinL/a022c16ada1892035b1f3f7838f80db0#file-example_1-json
 @mark_with_dialects_excluding()
-def test_single_best_links_correctness(test_helpers, dialect):
+def test_single_best_links_correctness_example_1(test_helpers, dialect):
     helper = test_helpers[dialect]
 
     df = pd.DataFrame(
@@ -71,6 +73,136 @@ def test_single_best_links_correctness(test_helpers, dialect):
     pd.testing.assert_frame_equal(result, correct_result)
 
 
+# See https://www.robinlinacre.com/graphPlayground/ with this data:
+# https://gist.github.com/RobinL/a022c16ada1892035b1f3f7838f80db0#file-example_2-json
+@mark_with_dialects_excluding()
+def test_single_best_links_example_2(test_helpers, dialect):
+    helper = test_helpers[dialect]
+
+    df = pd.DataFrame(
+        {
+            "unique_id": ["1", "2", "3", "4", "5", "6", "7"],
+            "source_dataset": ["a", "b", "a", "b", "a", "b", "d"],
+        }
+    )
+
+    predictions = pd.DataFrame(
+        {
+            "unique_id_l": ["1", "2", "3", "4", "5", "6", "4"],
+            "unique_id_r": ["2", "3", "4", "5", "6", "1", "7"],
+            "source_dataset_l": ["a", "b", "a", "b", "a", "b", "b"],
+            "source_dataset_r": ["b", "a", "b", "a", "b", "a", "d"],
+            "match_probability": [0.92, 0.91, 0.99, 0.88, 0.90, 0.96, 0.91],
+        }
+    )
+
+    settings = SettingsCreator(
+        link_type="link_only",
+        comparisons=[],
+        blocking_rules_to_generate_predictions=[],
+    )
+
+    linker = Linker(df, settings, **helper.extra_linker_args())
+
+    df_predict = linker.table_management.register_table_predict(
+        predictions, overwrite=True
+    )
+
+    df_clusters = linker.clustering.cluster_using_single_best_links(
+        df_predict,
+        duplicate_free_datasets=["a", "b", "d"],
+        threshold_match_probability=0.5,
+    )
+
+    result = df_clusters.as_pandas_dataframe().sort_values("unique_id")
+    result = result.reset_index(drop=True)
+
+    correct_result = pd.DataFrame(
+        {
+            "cluster_id": [
+                "a-__-1",
+                "b-__-2",
+                "a-__-3",
+                "a-__-3",
+                "a-__-5",
+                "a-__-1",
+                "a-__-3",
+            ],
+            "unique_id": ["1", "2", "3", "4", "5", "6", "7"],
+            "source_dataset": ["a", "b", "a", "b", "a", "b", "d"],
+        }
+    )
+    correct_result = correct_result.sort_values("unique_id")
+    correct_result = correct_result.reset_index(drop=True)
+
+    pd.testing.assert_frame_equal(result, correct_result)
+
+
+# See https://www.robinlinacre.com/graphPlayground/ with this data:
+# https://gist.github.com/RobinL/a022c16ada1892035b1f3f7838f80db0#file-example_3-json
+@mark_with_dialects_excluding()
+def test_single_best_links_example_3(test_helpers, dialect):
+    helper = test_helpers[dialect]
+
+    df = pd.DataFrame(
+        {
+            "unique_id": ["1", "2", "3", "4", "5", "6", "7"],
+            "source_dataset": ["a", "c", "b", "a", "b", "c", "a"],
+        }
+    )
+
+    predictions = pd.DataFrame(
+        {
+            "unique_id_l": ["1", "2", "3", "4", "5", "6"],
+            "unique_id_r": ["2", "3", "4", "5", "6", "7"],
+            "source_dataset_l": ["a", "c", "b", "a", "b", "c"],
+            "source_dataset_r": ["c", "b", "a", "b", "c", "a"],
+            "match_probability": [0.98, 0.90, 0.80, 0.81, 0.91, 0.99],
+        }
+    )
+
+    settings = SettingsCreator(
+        link_type="link_only",
+        comparisons=[],
+        blocking_rules_to_generate_predictions=[],
+    )
+
+    linker = Linker(df, settings, **helper.extra_linker_args())
+
+    df_predict = linker.table_management.register_table_predict(
+        predictions, overwrite=True
+    )
+
+    df_clusters = linker.clustering.cluster_using_single_best_links(
+        df_predict,
+        duplicate_free_datasets=["a", "b", "c"],
+        threshold_match_probability=0.5,
+    )
+
+    result = df_clusters.as_pandas_dataframe().sort_values("unique_id")
+    result = result.reset_index(drop=True)
+
+    correct_result = pd.DataFrame(
+        {
+            "cluster_id": [
+                "a-__-1",
+                "a-__-1",
+                "a-__-1",
+                "a-__-4",
+                "a-__-7",
+                "a-__-7",
+                "a-__-7",
+            ],
+            "unique_id": ["1", "2", "3", "4", "5", "6", "7"],
+            "source_dataset": ["a", "c", "b", "a", "b", "c", "a"],
+        }
+    )
+    correct_result = correct_result.sort_values("unique_id")
+    correct_result = correct_result.reset_index(drop=True)
+
+    pd.testing.assert_frame_equal(result, correct_result)
+
+
 @mark_with_dialects_excluding()
 def test_single_best_links_ties(test_helpers, dialect):
     helper = test_helpers[dialect]

From cd6ea163708b53c28c45bcabab350833627d6ded Mon Sep 17 00:00:00 2001
From: Robin Linacre <robinlinacre@hotmail.com>
Date: Fri, 31 Jan 2025 10:30:24 +0000
Subject: [PATCH 13/15] fix artifact v3 deprecation

---
 .github/workflows/pytest_duckdb.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pytest_duckdb.yml b/.github/workflows/pytest_duckdb.yml
index 1c8e37594f..f80d4ff6cc 100644
--- a/.github/workflows/pytest_duckdb.yml
+++ b/.github/workflows/pytest_duckdb.yml
@@ -39,11 +39,11 @@ jobs:
           poetry run pytest -v --durations=0 -m "duckdb_only or core" --cov=splink --cov-report=xml --cov-report=term tests/
 
       - name: Upload coverage report
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-report
           path: coverage.xml
       - name: Upload to Codecov
-        uses: codecov/codecov-action@v3
+        uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }}

From 7989b9cf361b5c79dbe22e7edba91b3cf6ee7288 Mon Sep 17 00:00:00 2001
From: Robin Linacre <robinlinacre@hotmail.com>
Date: Fri, 31 Jan 2025 10:37:10 +0000
Subject: [PATCH 14/15] update ci to fix overwrite error

---
 .github/workflows/pytest_duckdb.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pytest_duckdb.yml b/.github/workflows/pytest_duckdb.yml
index f80d4ff6cc..6ff7d5477e 100644
--- a/.github/workflows/pytest_duckdb.yml
+++ b/.github/workflows/pytest_duckdb.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Upload coverage report
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-report
+          name: coverage-report-${{ matrix.python-version }}
           path: coverage.xml
       - name: Upload to Codecov
         uses: codecov/codecov-action@v4

From f0e27cc4b5021170095aecf04358b4d9bcc5a54f Mon Sep 17 00:00:00 2001
From: Robin Linacre <robinlinacre@hotmail.com>
Date: Fri, 31 Jan 2025 10:54:00 +0000
Subject: [PATCH 15/15] update changelog

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7c85c807e9..7b864ae1d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Added
+
+- Support for 'one to one' linking and clustering (allowing the user to force clusters to contain at most one record from each of the given `source_dataset`s) in [#2578](https://github.com/moj-analytical-services/splink/pull/2578/)
+
 ### Deprecated
 
 - Deprecated support for python `3.8.x` following end of support for that minor version ([#2520](https://github.com/moj-analytical-services/splink/pull/2520))