From 7c44782030bbd84c7e4e01382e3b31603ba18976 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Tue, 9 Jan 2024 17:13:45 +0000
Subject: [PATCH 01/12] update sampling logic for density

---
 splink/cluster_studio.py | 49 ++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index a000d34a4a..940d542dae 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -228,41 +228,48 @@ def _get_cluster_id_of_each_size(
     return df_cluster_sample_with_size.as_record_dict()
 
 
-def _get_lowest_density_cluster_ids(
+def _get_lowest_density_clusters(
     linker: "Linker",
     df_cluster_metrics: SplinkDataFrame,
-    sample_size: int,
+    rows_per_partition: int,
     min_nodes: int,
 ):
-    """Retrieves cluster IDs based on density metric, ordered from least to
-    most dense.
+    """Samples lowest density clusters stratified by size.
 
     Args:
         linker: An instance of the Splink Linker class.
         df_cluster_metrics (SplinkDataFrame): dataframe containing
-        cluster metrics, including density.
-        sample_size (int): size of sample returned.
+        cluster metrics including density.
+        rows_per_partition (int):
         min_nodes (int): The minimum number of nodes a cluster must contain
         to be included in the sample.
 
     Returns:
-        list: A list of cluster IDs based on density metric, ordered from least to
-        most dense.
+        list: A list of cluster IDs of the least dense clusters across different sizes
     """
-    # Ordering: least dense clusters first
+
     sql = f"""
-    SELECT cluster_id
-    FROM {df_cluster_metrics.physical_name}
-    WHERE n_nodes >= {min_nodes}
-    ORDER BY density
-    LIMIT {sample_size}
+    select
+        cluster_id,
+        n_nodes as cluster_size,
+        row_number() over (partition by n_nodes order by density) as row_num
+    from {df_cluster_metrics.physical_name}
+    where n_nodes >= {min_nodes}
     """
 
-    df_density_sample = linker._sql_to_splink_dataframe_checking_cache(
-        sql, "__splink__density_sample"
-    )
+    linker._enqueue_sql(sql, "__splink__partition_clusters_by_size")
+
+    sql = f"""
+    select
+        cluster_id
+    from __splink__partition_clusters_by_size
+    where row_num <= {rows_per_partition}
+    """
+
+    linker._enqueue_sql(sql, "__splink__lowest_density_clusters")
+    df_cluster_sample_lowest_density = linker._execute_sql_pipeline()
 
-    return [r["cluster_id"] for r in df_density_sample.as_record_dict()]
+    return [r["cluster_id"] for r in df_cluster_sample_lowest_density.as_record_dict()]
 
 
 def render_splink_cluster_studio_html(
@@ -310,9 +317,11 @@ def render_splink_cluster_studio_html(
             else:
                 # Using sensible default for min_nodes. Might become option
                 # for users in future
-                cluster_ids = _get_lowest_density_cluster_ids(
-                    linker, _df_cluster_metrics, sample_size, min_nodes=3
+                cluster_ids = _get_lowest_density_clusters(
+                    linker, _df_cluster_metrics, 1, 3
                 )
+                if len(cluster_ids) > sample_size:
+                    cluster_ids = random.sample(cluster_ids, k=sample_size)
 
     cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids)
     df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids)

From 8a541c7fc24d3ccb7e4e4f4921933e393fa9a009 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Tue, 9 Jan 2024 18:14:53 +0000
Subject: [PATCH 02/12] update doc string and variable names

---
 splink/cluster_studio.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 940d542dae..6706f50b7f 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -234,18 +234,19 @@ def _get_lowest_density_clusters(
     rows_per_partition: int,
     min_nodes: int,
 ):
-    """Samples lowest density clusters stratified by size.
+    """Returns cluster ids of lowest density clusters across different sized clusters
+    by performing stratified sampling.
 
     Args:
         linker: An instance of the Splink Linker class.
         df_cluster_metrics (SplinkDataFrame): dataframe containing
         cluster metrics including density.
-        rows_per_partition (int):
-        min_nodes (int): The minimum number of nodes a cluster must contain
+        rows_per_partition (int): number of rows in each strata (partition)
+        min_nodes (int): minimum number of nodes a cluster must contain
         to be included in the sample.
 
     Returns:
-        list: A list of cluster IDs of the least dense clusters across different sizes
+        list: A list of cluster ids of lowest density clusters of different sizes
     """
 
     sql = f"""
@@ -307,7 +308,7 @@ def render_splink_cluster_studio_html(
             ]
             cluster_ids = [c["cluster_id"] for c in cluster_ids]
             named_clusters_dict = dict(zip(cluster_ids, cluster_names))
-        if sampling_method == "lowest_density_clusters":
+        if sampling_method == "lowest_density_by_size":
             if _df_cluster_metrics is None:
                 raise SplinkException(
                     """To sample by density, you must provide a cluster metrics table

From 58104710d66299926fb704b2acaec2041798bed5 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Wed, 10 Jan 2024 11:09:46 +0000
Subject: [PATCH 03/12] update docstring

---
 splink/cluster_studio.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 6706f50b7f..43d5303b41 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -234,8 +234,8 @@ def _get_lowest_density_clusters(
     rows_per_partition: int,
     min_nodes: int,
 ):
-    """Returns cluster ids of lowest density clusters across different sized clusters
-    by performing stratified sampling.
+    """Returns ids of lowest density clusters of different sizes by
+    performing stratified sampling.
 
     Args:
         linker: An instance of the Splink Linker class.
@@ -246,7 +246,7 @@ def _get_lowest_density_clusters(
         to be included in the sample.
 
     Returns:
-        list: A list of cluster ids of lowest density clusters of different sizes
+        list: A list of cluster ids of lowest density clusters of different sizes.
     """
 
     sql = f"""
@@ -308,7 +308,7 @@ def render_splink_cluster_studio_html(
             ]
             cluster_ids = [c["cluster_id"] for c in cluster_ids]
             named_clusters_dict = dict(zip(cluster_ids, cluster_names))
-        if sampling_method == "lowest_density_by_size":
+        if sampling_method == "lowest_density_clusters_by_size":
             if _df_cluster_metrics is None:
                 raise SplinkException(
                     """To sample by density, you must provide a cluster metrics table

From 61121362417cba5a8fae046784f6f4c42aa5ac1c Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Wed, 10 Jan 2024 13:13:37 +0000
Subject: [PATCH 04/12] dashboard rendering and building correctly

---
 splink/cluster_studio.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 43d5303b41..298fcb1afc 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -268,9 +268,9 @@ def _get_lowest_density_clusters(
     """
 
     linker._enqueue_sql(sql, "__splink__lowest_density_clusters")
-    df_cluster_sample_lowest_density = linker._execute_sql_pipeline()
+    df_lowest_density_clusters = linker._execute_sql_pipeline()
 
-    return [r["cluster_id"] for r in df_cluster_sample_lowest_density.as_record_dict()]
+    return [r["cluster_id"] for r in df_lowest_density_clusters.as_record_dict()]
 
 
 def render_splink_cluster_studio_html(
@@ -319,7 +319,7 @@ def render_splink_cluster_studio_html(
                 # Using sensible default for min_nodes. Might become option
                 # for users in future
                 cluster_ids = _get_lowest_density_clusters(
-                    linker, _df_cluster_metrics, 1, 3
+                    linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3
                 )
                 if len(cluster_ids) > sample_size:
                     cluster_ids = random.sample(cluster_ids, k=sample_size)

From 9c63e014f9646ba023f50c73f4f9bb56fdbc819d Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Wed, 10 Jan 2024 13:55:19 +0000
Subject: [PATCH 05/12] display density value in dropdown menu

---
 splink/cluster_studio.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 298fcb1afc..ae2a46f308 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -252,7 +252,8 @@ def _get_lowest_density_clusters(
     sql = f"""
     select
         cluster_id,
-        n_nodes as cluster_size,
+        n_nodes,
+        density,
         row_number() over (partition by n_nodes order by density) as row_num
     from {df_cluster_metrics.physical_name}
     where n_nodes >= {min_nodes}
@@ -262,7 +263,8 @@ def _get_lowest_density_clusters(
 
     sql = f"""
     select
-        cluster_id
+        cluster_id,
+        round(density, 4) as density_4dp
     from __splink__partition_clusters_by_size
     where row_num <= {rows_per_partition}
     """
@@ -270,7 +272,7 @@ def _get_lowest_density_clusters(
     linker._enqueue_sql(sql, "__splink__lowest_density_clusters")
     df_lowest_density_clusters = linker._execute_sql_pipeline()
 
-    return [r["cluster_id"] for r in df_lowest_density_clusters.as_record_dict()]
+    return df_lowest_density_clusters.as_record_dict()
 
 
 def render_splink_cluster_studio_html(
@@ -323,6 +325,12 @@ def render_splink_cluster_studio_html(
                 )
                 if len(cluster_ids) > sample_size:
                     cluster_ids = random.sample(cluster_ids, k=sample_size)
+                cluster_names = [
+                    f"Cluster ID: {c['cluster_id']}, density (4dp)  {c['density_4dp']}"
+                    for c in cluster_ids
+                ]
+                cluster_ids = [c["cluster_id"] for c in cluster_ids]
+                named_clusters_dict = dict(zip(cluster_ids, cluster_names))
 
     cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids)
     df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids)

From 33a4ee126953a62f58afede57b3fe9b0dcce2bb5 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Wed, 10 Jan 2024 15:51:40 +0000
Subject: [PATCH 06/12] update test

---
 tests/test_cluster_studio.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py
index 6ce12698fa..ac5670ca7c 100644
--- a/tests/test_cluster_studio.py
+++ b/tests/test_cluster_studio.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from splink.cluster_studio import _get_lowest_density_cluster_ids
+from splink.cluster_studio import _get_lowest_density_clusters
 from splink.duckdb.linker import DuckDBLinker
 
 
@@ -17,13 +17,13 @@ def test_density_sample():
 
     # Dummy cluster metrics table
     cluster = ["A", "B", "C", "D", "E"]
-    n_nodes = [3, 2, 10, 3, 19]
-    n_edges = [2, 1, 5, 2, 25]
+    n_nodes = [3, 3, 3, 10, 10]
+    n_edges = [1, 2, 3, 9, 20]
     density = [
         (n_edges * 2) / (n_nodes * (n_nodes - 1))
         for n_nodes, n_edges in zip(n_nodes, n_edges)
     ]
-    df_metrics = pd.DataFrame(
+    pd_metrics = pd.DataFrame(
         {
             "cluster_id": cluster,
             "n_nodes": n_nodes,
@@ -34,10 +34,15 @@ def test_density_sample():
 
     # Convert to Splink dataframe
     df_cluster_metrics = linker.register_table(
-        df_metrics, "df_cluster_metrics", overwrite=True
+        pd_metrics, "df_cluster_metrics", overwrite=True
     )
-    df_result = _get_lowest_density_cluster_ids(
-        linker, df_cluster_metrics, sample_size=3, min_nodes=3
+    result = _get_lowest_density_clusters(
+        linker, df_cluster_metrics, rows_per_partition=1, min_nodes=3
     )
-    df_expect = ["C", "E", "A"]
-    assert df_result == df_expect
+
+    expect = [
+        {"cluster_id": "D", "density_4dp": 0.2},
+        {"cluster_id": "A", "density_4dp": 0.3333},
+    ]
+
+    assert result == expect

From 9734f531ed0f4191dda472db2045bc67bbd1a72e Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Thu, 11 Jan 2024 12:45:44 +0000
Subject: [PATCH 07/12] order by cluster id

---
 splink/cluster_studio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index ae2a46f308..06bd973059 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -254,7 +254,7 @@ def _get_lowest_density_clusters(
         cluster_id,
         n_nodes,
         density,
-        row_number() over (partition by n_nodes order by density) as row_num
+        row_number() over (partition by n_nodes order by density, cluster_id) as row_num
     from {df_cluster_metrics.physical_name}
     where n_nodes >= {min_nodes}
     """

From 7086ff3b385f5884e1d786ab1fd1f2ef8fe7ccfb Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Thu, 11 Jan 2024 12:45:52 +0000
Subject: [PATCH 08/12] update test

---
 tests/test_cluster_studio.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py
index ac5670ca7c..457fc597a7 100644
--- a/tests/test_cluster_studio.py
+++ b/tests/test_cluster_studio.py
@@ -41,8 +41,8 @@ def test_density_sample():
     )
 
     expect = [
-        {"cluster_id": "D", "density_4dp": 0.2},
         {"cluster_id": "A", "density_4dp": 0.3333},
+        {"cluster_id": "D", "density_4dp": 0.2},
     ]
 
     assert result == expect

From 98f44cbd359e168f15ad69a9c4dfe3a0be251a81 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Thu, 11 Jan 2024 16:36:18 +0000
Subject: [PATCH 09/12] fix test

---
 tests/test_cluster_studio.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py
index 457fc597a7..ef62fc65ed 100644
--- a/tests/test_cluster_studio.py
+++ b/tests/test_cluster_studio.py
@@ -16,9 +16,9 @@ def test_density_sample():
     linker = DuckDBLinker(df, settings)
 
     # Dummy cluster metrics table
-    cluster = ["A", "B", "C", "D", "E"]
-    n_nodes = [3, 3, 3, 10, 10]
-    n_edges = [1, 2, 3, 9, 20]
+    cluster = ["A", "B", "C", "D", "E", "F"]
+    n_nodes = [2, 3, 3, 3, 10, 10]
+    n_edges = [1, 2, 2, 3, 9, 20]
     density = [
         (n_edges * 2) / (n_nodes * (n_nodes - 1))
         for n_nodes, n_edges in zip(n_nodes, n_edges)
@@ -40,9 +40,11 @@ def test_density_sample():
         linker, df_cluster_metrics, rows_per_partition=1, min_nodes=3
     )
 
+    result = sorted(result, key=lambda x: x["cluster_id"])
+
     expect = [
-        {"cluster_id": "A", "density_4dp": 0.3333},
-        {"cluster_id": "D", "density_4dp": 0.2},
+        {"cluster_id": "B", "density_4dp": 0.6667},
+        {"cluster_id": "E", "density_4dp": 0.2},
     ]
 
     assert result == expect

From 8771516db9e5ee5f6f3040b987b3559a6b06e14f Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Mon, 29 Jan 2024 14:00:54 +0000
Subject: [PATCH 10/12] update following review

---
 splink/cluster_studio.py | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 06bd973059..9b9ba40181 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -234,7 +234,7 @@ def _get_lowest_density_clusters(
     rows_per_partition: int,
     min_nodes: int,
 ):
-    """Returns ids of lowest density clusters of different sizes by
+    """Returns lowest density clusters of different sizes by
     performing stratified sampling.
 
     Args:
@@ -246,7 +246,8 @@ def _get_lowest_density_clusters(
         to be included in the sample.
 
     Returns:
-        list: A list of cluster ids of lowest density clusters of different sizes.
+        list: A list of record dictionaries containing cluster ids, densities
+        and sizes of lowest density clusters.
     """
 
     sql = f"""
@@ -264,7 +265,8 @@ def _get_lowest_density_clusters(
     sql = f"""
     select
         cluster_id,
-        round(density, 4) as density_4dp
+        round(density, 4) as density_4dp,
+        n_nodes as cluster_size
     from __splink__partition_clusters_by_size
     where row_num <= {rows_per_partition}
     """
@@ -305,7 +307,7 @@ def render_splink_cluster_studio_html(
             if len(cluster_ids) > sample_size:
                 cluster_ids = random.sample(cluster_ids, k=sample_size)
             cluster_names = [
-                f"Cluster ID: {c['cluster_id']}, size  {c['cluster_size']}"
+                f"Cluster ID: {c['cluster_id']}, size:  {c['cluster_size']}"
                 for c in cluster_ids
             ]
             cluster_ids = [c["cluster_id"] for c in cluster_ids]
@@ -315,22 +317,22 @@ def render_splink_cluster_studio_html(
                 raise SplinkException(
                     """To sample by density, you must provide a cluster metrics table
                       containing density. This can be generated by calling the
-                      _compute_cluster_metrics method on the linker."""
+                      _compute_graph_metrics method on the linker."""
                 )
-            else:
-                # Using sensible default for min_nodes. Might become option
-                # for users in future
-                cluster_ids = _get_lowest_density_clusters(
-                    linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3
-                )
-                if len(cluster_ids) > sample_size:
-                    cluster_ids = random.sample(cluster_ids, k=sample_size)
-                cluster_names = [
-                    f"Cluster ID: {c['cluster_id']}, density (4dp)  {c['density_4dp']}"
-                    for c in cluster_ids
-                ]
-                cluster_ids = [c["cluster_id"] for c in cluster_ids]
-                named_clusters_dict = dict(zip(cluster_ids, cluster_names))
+            # Using sensible default for min_nodes. Might become option
+            # for users in future
+            cluster_ids = _get_lowest_density_clusters(
+                linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3
+            )
+            if len(cluster_ids) > sample_size:
+                cluster_ids = random.sample(cluster_ids, k=sample_size)
+            cluster_names = [
+                f"""Cluster ID: {c['cluster_id']}, density (4dp): {c['density_4dp']},
+                size: {c['cluster_size']}"""
+                for c in cluster_ids
+            ]
+            cluster_ids = [c["cluster_id"] for c in cluster_ids]
+            named_clusters_dict = dict(zip(cluster_ids, cluster_names))
 
     cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids)
     df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids)

From f9ac77db79b67da314bbdb74f58af8791f8aa8f0 Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Mon, 29 Jan 2024 14:02:39 +0000
Subject: [PATCH 11/12] update following review

---
 splink/cluster_studio.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py
index 9b9ba40181..695b14e53a 100644
--- a/splink/cluster_studio.py
+++ b/splink/cluster_studio.py
@@ -188,7 +188,7 @@ def _get_random_cluster_ids(
 
 
 def _get_cluster_id_of_each_size(
-    linker: "Linker", connected_components: SplinkDataFrame, rows_per_cluster: int
+    linker: "Linker", connected_components: SplinkDataFrame, rows_per_partition: int
 ):
     sql = f"""
     select
@@ -218,8 +218,7 @@ def _get_cluster_id_of_each_size(
         cluster_id,
         cluster_size
     from __splink__cluster_count_row_numbered
-    where row_num <= {rows_per_cluster}
-    and cluster_size > 1
+    where row_num <= {rows_per_partition}
     """
 
     linker._enqueue_sql(sql, "__splink__cluster_count_row_numbered")

From 497a1f4b2e63bcbe3b4a2fb2d9080210070958ab Mon Sep 17 00:00:00 2001
From: zslade <zoe.slade@gmail.com>
Date: Mon, 29 Jan 2024 14:13:23 +0000
Subject: [PATCH 12/12] update test

---
 tests/test_cluster_studio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py
index ef62fc65ed..c5c194c9e4 100644
--- a/tests/test_cluster_studio.py
+++ b/tests/test_cluster_studio.py
@@ -43,8 +43,8 @@ def test_density_sample():
     result = sorted(result, key=lambda x: x["cluster_id"])
 
     expect = [
-        {"cluster_id": "B", "density_4dp": 0.6667},
-        {"cluster_id": "E", "density_4dp": 0.2},
+        {"cluster_id": "B", "density_4dp": 0.6667, "cluster_size": 3},
+        {"cluster_id": "E", "density_4dp": 0.2, "cluster_size": 10},
     ]
 
     assert result == expect