From 7c44782030bbd84c7e4e01382e3b31603ba18976 Mon Sep 17 00:00:00 2001 From: zslade Date: Tue, 9 Jan 2024 17:13:45 +0000 Subject: [PATCH 01/12] update sampling logic for density --- splink/cluster_studio.py | 49 ++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index a000d34a4a..940d542dae 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -228,41 +228,48 @@ def _get_cluster_id_of_each_size( return df_cluster_sample_with_size.as_record_dict() -def _get_lowest_density_cluster_ids( +def _get_lowest_density_clusters( linker: "Linker", df_cluster_metrics: SplinkDataFrame, - sample_size: int, + rows_per_partition: int, min_nodes: int, ): - """Retrieves cluster IDs based on density metric, ordered from least to - most dense. + """Samples lowest density clusters stratified by size. Args: linker: An instance of the Splink Linker class. df_cluster_metrics (SplinkDataFrame): dataframe containing - cluster metrics, including density. - sample_size (int): size of sample returned. + cluster metrics including density. + rows_per_partition (int): min_nodes (int): The minimum number of nodes a cluster must contain to be included in the sample. Returns: - list: A list of cluster IDs based on density metric, ordered from least to - most dense. + list: A list of cluster IDs of the least dense clusters across different sizes """ - # Ordering: least dense clusters first + sql = f""" - SELECT cluster_id - FROM {df_cluster_metrics.physical_name} - WHERE n_nodes >= {min_nodes} - ORDER BY density - LIMIT {sample_size} + select + cluster_id, + n_nodes as cluster_size, + row_number() over (partition by n_nodes order by density) as row_num + from {df_cluster_metrics.physical_name} + where n_nodes >= {min_nodes} """ - df_density_sample = linker._sql_to_splink_dataframe_checking_cache( - sql, "__splink__density_sample" - ) + linker._enqueue_sql(sql, "__splink__partition_clusters_by_size") + + sql = f""" + select + cluster_id + from __splink__partition_clusters_by_size + where row_num <= {rows_per_partition} + """ + + linker._enqueue_sql(sql, "__splink__lowest_density_clusters") + df_cluster_sample_lowest_density = linker._execute_sql_pipeline() - return [r["cluster_id"] for r in df_density_sample.as_record_dict()] + return [r["cluster_id"] for r in df_cluster_sample_lowest_density.as_record_dict()] def render_splink_cluster_studio_html( @@ -310,9 +317,11 @@ def render_splink_cluster_studio_html( else: # Using sensible default for min_nodes. Might become option # for users in future - cluster_ids = _get_lowest_density_cluster_ids( - linker, _df_cluster_metrics, sample_size, min_nodes=3 + cluster_ids = _get_lowest_density_clusters( + linker, _df_cluster_metrics, 1, 3 ) + if len(cluster_ids) > sample_size: + cluster_ids = random.sample(cluster_ids, k=sample_size) cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids) df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids) From 8a541c7fc24d3ccb7e4e4f4921933e393fa9a009 Mon Sep 17 00:00:00 2001 From: zslade Date: Tue, 9 Jan 2024 18:14:53 +0000 Subject: [PATCH 02/12] update doc string and variable names --- splink/cluster_studio.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 940d542dae..6706f50b7f 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -234,18 +234,19 @@ def _get_lowest_density_clusters( rows_per_partition: int, min_nodes: int, ): - """Samples lowest density clusters stratified by size. + """Returns cluster ids of lowest density clusters across different sized clusters + by performing stratified sampling. Args: linker: An instance of the Splink Linker class. df_cluster_metrics (SplinkDataFrame): dataframe containing cluster metrics including density. - rows_per_partition (int): - min_nodes (int): The minimum number of nodes a cluster must contain + rows_per_partition (int): number of rows in each strata (partition) + min_nodes (int): minimum number of nodes a cluster must contain to be included in the sample. Returns: - list: A list of cluster IDs of the least dense clusters across different sizes + list: A list of cluster ids of lowest density clusters of different sizes """ sql = f""" @@ -307,7 +308,7 @@ def render_splink_cluster_studio_html( ] cluster_ids = [c["cluster_id"] for c in cluster_ids] named_clusters_dict = dict(zip(cluster_ids, cluster_names)) - if sampling_method == "lowest_density_clusters": + if sampling_method == "lowest_density_by_size": if _df_cluster_metrics is None: raise SplinkException( """To sample by density, you must provide a cluster metrics table From 58104710d66299926fb704b2acaec2041798bed5 Mon Sep 17 00:00:00 2001 From: zslade Date: Wed, 10 Jan 2024 11:09:46 +0000 Subject: [PATCH 03/12] update docstring --- splink/cluster_studio.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 6706f50b7f..43d5303b41 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -234,8 +234,8 @@ def _get_lowest_density_clusters( rows_per_partition: int, min_nodes: int, ): - """Returns cluster ids of lowest density clusters across different sized clusters - by performing stratified sampling. + """Returns ids of lowest density clusters of different sizes by + performing stratified sampling. Args: linker: An instance of the Splink Linker class. @@ -246,7 +246,7 @@ def _get_lowest_density_clusters( to be included in the sample. Returns: - list: A list of cluster ids of lowest density clusters of different sizes + list: A list of cluster ids of lowest density clusters of different sizes. """ sql = f""" @@ -308,7 +308,7 @@ def render_splink_cluster_studio_html( ] cluster_ids = [c["cluster_id"] for c in cluster_ids] named_clusters_dict = dict(zip(cluster_ids, cluster_names)) - if sampling_method == "lowest_density_by_size": + if sampling_method == "lowest_density_clusters_by_size": if _df_cluster_metrics is None: raise SplinkException( """To sample by density, you must provide a cluster metrics table From 61121362417cba5a8fae046784f6f4c42aa5ac1c Mon Sep 17 00:00:00 2001 From: zslade Date: Wed, 10 Jan 2024 13:13:37 +0000 Subject: [PATCH 04/12] dashboard rendering and building correctly --- splink/cluster_studio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 43d5303b41..298fcb1afc 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -268,9 +268,9 @@ def _get_lowest_density_clusters( """ linker._enqueue_sql(sql, "__splink__lowest_density_clusters") - df_cluster_sample_lowest_density = linker._execute_sql_pipeline() + df_lowest_density_clusters = linker._execute_sql_pipeline() - return [r["cluster_id"] for r in df_cluster_sample_lowest_density.as_record_dict()] + return [r["cluster_id"] for r in df_lowest_density_clusters.as_record_dict()] def render_splink_cluster_studio_html( @@ -319,7 +319,7 @@ def render_splink_cluster_studio_html( # Using sensible default for min_nodes. Might become option # for users in future cluster_ids = _get_lowest_density_clusters( - linker, _df_cluster_metrics, 1, 3 + linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3 ) if len(cluster_ids) > sample_size: cluster_ids = random.sample(cluster_ids, k=sample_size) From 9c63e014f9646ba023f50c73f4f9bb56fdbc819d Mon Sep 17 00:00:00 2001 From: zslade Date: Wed, 10 Jan 2024 13:55:19 +0000 Subject: [PATCH 05/12] display density value in dropdown menu --- splink/cluster_studio.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 298fcb1afc..ae2a46f308 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -252,7 +252,8 @@ def _get_lowest_density_clusters( sql = f""" select cluster_id, - n_nodes as cluster_size, + n_nodes, + density, row_number() over (partition by n_nodes order by density) as row_num from {df_cluster_metrics.physical_name} where n_nodes >= {min_nodes} @@ -262,7 +263,8 @@ def _get_lowest_density_clusters( sql = f""" select - cluster_id + cluster_id, + round(density, 4) as density_4dp from __splink__partition_clusters_by_size where row_num <= {rows_per_partition} """ @@ -270,7 +272,7 @@ def _get_lowest_density_clusters( linker._enqueue_sql(sql, "__splink__lowest_density_clusters") df_lowest_density_clusters = linker._execute_sql_pipeline() - return [r["cluster_id"] for r in df_lowest_density_clusters.as_record_dict()] + return df_lowest_density_clusters.as_record_dict() def render_splink_cluster_studio_html( @@ -323,6 +325,12 @@ def render_splink_cluster_studio_html( ) if len(cluster_ids) > sample_size: cluster_ids = random.sample(cluster_ids, k=sample_size) + cluster_names = [ + f"Cluster ID: {c['cluster_id']}, density (4dp) {c['density_4dp']}" + for c in cluster_ids + ] + cluster_ids = [c["cluster_id"] for c in cluster_ids] + named_clusters_dict = dict(zip(cluster_ids, cluster_names)) cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids) df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids) From 33a4ee126953a62f58afede57b3fe9b0dcce2bb5 Mon Sep 17 00:00:00 2001 From: zslade Date: Wed, 10 Jan 2024 15:51:40 +0000 Subject: [PATCH 06/12] update test --- tests/test_cluster_studio.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py index 6ce12698fa..ac5670ca7c 100644 --- a/tests/test_cluster_studio.py +++ b/tests/test_cluster_studio.py @@ -1,6 +1,6 @@ import pandas as pd -from splink.cluster_studio import _get_lowest_density_cluster_ids +from splink.cluster_studio import _get_lowest_density_clusters from splink.duckdb.linker import DuckDBLinker @@ -17,13 +17,13 @@ def test_density_sample(): # Dummy cluster metrics table cluster = ["A", "B", "C", "D", "E"] - n_nodes = [3, 2, 10, 3, 19] - n_edges = [2, 1, 5, 2, 25] + n_nodes = [3, 3, 3, 10, 10] + n_edges = [1, 2, 3, 9, 20] density = [ (n_edges * 2) / (n_nodes * (n_nodes - 1)) for n_nodes, n_edges in zip(n_nodes, n_edges) ] - df_metrics = pd.DataFrame( + pd_metrics = pd.DataFrame( { "cluster_id": cluster, "n_nodes": n_nodes, @@ -34,10 +34,15 @@ def test_density_sample(): # Convert to Splink dataframe df_cluster_metrics = linker.register_table( - df_metrics, "df_cluster_metrics", overwrite=True + pd_metrics, "df_cluster_metrics", overwrite=True ) - df_result = _get_lowest_density_cluster_ids( - linker, df_cluster_metrics, sample_size=3, min_nodes=3 + result = _get_lowest_density_clusters( + linker, df_cluster_metrics, rows_per_partition=1, min_nodes=3 ) - df_expect = ["C", "E", "A"] - assert df_result == df_expect + + expect = [ + {"cluster_id": "D", "density_4dp": 0.2}, + {"cluster_id": "A", "density_4dp": 0.3333}, + ] + + assert result == expect From 9734f531ed0f4191dda472db2045bc67bbd1a72e Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 11 Jan 2024 12:45:44 +0000 Subject: [PATCH 07/12] order by cluster id --- splink/cluster_studio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index ae2a46f308..06bd973059 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -254,7 +254,7 @@ def _get_lowest_density_clusters( cluster_id, n_nodes, density, - row_number() over (partition by n_nodes order by density) as row_num + row_number() over (partition by n_nodes order by density, cluster_id) as row_num from {df_cluster_metrics.physical_name} where n_nodes >= {min_nodes} """ From 7086ff3b385f5884e1d786ab1fd1f2ef8fe7ccfb Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 11 Jan 2024 12:45:52 +0000 Subject: [PATCH 08/12] update test --- tests/test_cluster_studio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py index ac5670ca7c..457fc597a7 100644 --- a/tests/test_cluster_studio.py +++ b/tests/test_cluster_studio.py @@ -41,8 +41,8 @@ def test_density_sample(): ) expect = [ - {"cluster_id": "D", "density_4dp": 0.2}, {"cluster_id": "A", "density_4dp": 0.3333}, + {"cluster_id": "D", "density_4dp": 0.2}, ] assert result == expect From 98f44cbd359e168f15ad69a9c4dfe3a0be251a81 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 11 Jan 2024 16:36:18 +0000 Subject: [PATCH 09/12] fix test --- tests/test_cluster_studio.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py index 457fc597a7..ef62fc65ed 100644 --- a/tests/test_cluster_studio.py +++ b/tests/test_cluster_studio.py @@ -16,9 +16,9 @@ def test_density_sample(): linker = DuckDBLinker(df, settings) # Dummy cluster metrics table - cluster = ["A", "B", "C", "D", "E"] - n_nodes = [3, 3, 3, 10, 10] - n_edges = [1, 2, 3, 9, 20] + cluster = ["A", "B", "C", "D", "E", "F"] + n_nodes = [2, 3, 3, 3, 10, 10] + n_edges = [1, 2, 2, 3, 9, 20] density = [ (n_edges * 2) / (n_nodes * (n_nodes - 1)) for n_nodes, n_edges in zip(n_nodes, n_edges) @@ -40,9 +40,11 @@ def test_density_sample(): linker, df_cluster_metrics, rows_per_partition=1, min_nodes=3 ) + result = sorted(result, key=lambda x: x["cluster_id"]) + expect = [ - {"cluster_id": "A", "density_4dp": 0.3333}, - {"cluster_id": "D", "density_4dp": 0.2}, + {"cluster_id": "B", "density_4dp": 0.6667}, + {"cluster_id": "E", "density_4dp": 0.2}, ] assert result == expect From 8771516db9e5ee5f6f3040b987b3559a6b06e14f Mon Sep 17 00:00:00 2001 From: zslade Date: Mon, 29 Jan 2024 14:00:54 +0000 Subject: [PATCH 10/12] update following review --- splink/cluster_studio.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 06bd973059..9b9ba40181 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -234,7 +234,7 @@ def _get_lowest_density_clusters( rows_per_partition: int, min_nodes: int, ): - """Returns ids of lowest density clusters of different sizes by + """Returns lowest density clusters of different sizes by performing stratified sampling. Args: @@ -246,7 +246,8 @@ def _get_lowest_density_clusters( to be included in the sample. Returns: - list: A list of cluster ids of lowest density clusters of different sizes. + list: A list of record dictionaries containing cluster ids, densities + and sizes of lowest density clusters. """ sql = f""" @@ -264,7 +265,8 @@ def _get_lowest_density_clusters( sql = f""" select cluster_id, - round(density, 4) as density_4dp + round(density, 4) as density_4dp, + n_nodes as cluster_size from __splink__partition_clusters_by_size where row_num <= {rows_per_partition} """ @@ -305,7 +307,7 @@ def render_splink_cluster_studio_html( if len(cluster_ids) > sample_size: cluster_ids = random.sample(cluster_ids, k=sample_size) cluster_names = [ - f"Cluster ID: {c['cluster_id']}, size {c['cluster_size']}" + f"Cluster ID: {c['cluster_id']}, size: {c['cluster_size']}" for c in cluster_ids ] cluster_ids = [c["cluster_id"] for c in cluster_ids] @@ -315,22 +317,22 @@ def render_splink_cluster_studio_html( raise SplinkException( """To sample by density, you must provide a cluster metrics table containing density. This can be generated by calling the - _compute_cluster_metrics method on the linker.""" + _compute_graph_metrics method on the linker.""" ) - else: - # Using sensible default for min_nodes. Might become option - # for users in future - cluster_ids = _get_lowest_density_clusters( - linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3 - ) - if len(cluster_ids) > sample_size: - cluster_ids = random.sample(cluster_ids, k=sample_size) - cluster_names = [ - f"Cluster ID: {c['cluster_id']}, density (4dp) {c['density_4dp']}" - for c in cluster_ids - ] - cluster_ids = [c["cluster_id"] for c in cluster_ids] - named_clusters_dict = dict(zip(cluster_ids, cluster_names)) + # Using sensible default for min_nodes. Might become option + # for users in future + cluster_ids = _get_lowest_density_clusters( + linker, _df_cluster_metrics, rows_per_partition=1, min_nodes=3 + ) + if len(cluster_ids) > sample_size: + cluster_ids = random.sample(cluster_ids, k=sample_size) + cluster_names = [ + f"""Cluster ID: {c['cluster_id']}, density (4dp): {c['density_4dp']}, + size: {c['cluster_size']}""" + for c in cluster_ids + ] + cluster_ids = [c["cluster_id"] for c in cluster_ids] + named_clusters_dict = dict(zip(cluster_ids, cluster_names)) cluster_recs = df_clusters_as_records(linker, df_clustered_nodes, cluster_ids) df_nodes = create_df_nodes(linker, df_clustered_nodes, cluster_ids) From f9ac77db79b67da314bbdb74f58af8791f8aa8f0 Mon Sep 17 00:00:00 2001 From: zslade Date: Mon, 29 Jan 2024 14:02:39 +0000 Subject: [PATCH 11/12] update following review --- splink/cluster_studio.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/splink/cluster_studio.py b/splink/cluster_studio.py index 9b9ba40181..695b14e53a 100644 --- a/splink/cluster_studio.py +++ b/splink/cluster_studio.py @@ -188,7 +188,7 @@ def _get_random_cluster_ids( def _get_cluster_id_of_each_size( - linker: "Linker", connected_components: SplinkDataFrame, rows_per_cluster: int + linker: "Linker", connected_components: SplinkDataFrame, rows_per_partition: int ): sql = f""" select @@ -218,8 +218,7 @@ def _get_cluster_id_of_each_size( cluster_id, cluster_size from __splink__cluster_count_row_numbered - where row_num <= {rows_per_cluster} - and cluster_size > 1 + where row_num <= {rows_per_partition} """ linker._enqueue_sql(sql, "__splink__cluster_count_row_numbered") From 497a1f4b2e63bcbe3b4a2fb2d9080210070958ab Mon Sep 17 00:00:00 2001 From: zslade Date: Mon, 29 Jan 2024 14:13:23 +0000 Subject: [PATCH 12/12] update test --- tests/test_cluster_studio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cluster_studio.py b/tests/test_cluster_studio.py index ef62fc65ed..c5c194c9e4 100644 --- a/tests/test_cluster_studio.py +++ b/tests/test_cluster_studio.py @@ -43,8 +43,8 @@ def test_density_sample(): result = sorted(result, key=lambda x: x["cluster_id"]) expect = [ - {"cluster_id": "B", "density_4dp": 0.6667}, - {"cluster_id": "E", "density_4dp": 0.2}, + {"cluster_id": "B", "density_4dp": 0.6667, "cluster_size": 3}, + {"cluster_id": "E", "density_4dp": 0.2, "cluster_size": 10}, ] assert result == expect