add node centrality functionality and docs

moj-analytical-services · Feb 12, 2025 · f87db4b · f87db4b
1 parent 44d6f5d
commit f87db4b
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 15 deletions.
diff --git a/docs/img/clusters/basic_graph_centralisataion.drawio.png b/docs/img/clusters/basic_graph_centralisataion.drawio.png
diff --git a/docs/topic_guides/evaluation/clusters/graph_metrics.md b/docs/topic_guides/evaluation/clusters/graph_metrics.md
@@ -41,6 +41,28 @@ However, erroneous links (false positives) could also be the reason for _high_ n
 
 It is important to consider [cluster size](#cluster-size) when looking at node degree. By definition, larger clusters contain more nodes to form links between, allowing nodes within them to attain higher degrees compared to those in smaller clusters. Consequently, low node degree within larger clusters can carry greater significance.
 
+Bear in mind that the degree of a single node in a cluster isn't necessarily representative of the overall connectedness of a cluster. This is where [cluster centralisation](#cluster-centralisation) can help.
+
+### Node Centrality
+
+##### Definition
+
+Node centrality is the **proportion of all possible edges connected to a node**. It can also be interpreted as a normalised node degree, or the proportion of other nodes in the cluster that the node is linked to. Centrality ranges from 0 to 1. A centrality of 1 means a node is connected to all other nodes in a cluster.
+
+##### Example
+
+In the cluster below, node B is connected to all other nodes (giving a centrality of 1), whereas node A is connected to 1 out of 4 other nodes (giving a centrality of 0.25).
+
+![](../../../img/clusters/basic_graph_centralisataion.drawio.png){:width="80%"}
+
+##### Application in Data Linkage
+
+High node centrality is generally considered good as it means the node is directly connected to many of the other nodes in a cluster. Low node centrality (particularly in relation to the rest of the nodes in the cluster) can be indicative of a false link (false positive).
+
+Unlike node degree, centrality takes the cluster size into account and, being normalised, is more appropriate for comparing nodes across clusters.
+
+Node centrality can be useful as the node with the highest centrality in a cluster could be chosen to represent a cluster (sometimes known as a "golden record"). This is not appropriate in all cases, but the most connected node within a cluster will likely have much in common with other nodes.
+
 Bear in mind that the centrality of a single node in a cluster isn't necessarily representative of the overall connectedness of a cluster. This is where [cluster centralisation](#cluster-centralisation) can help.
 
 <hr>

diff --git a/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb b/docs/topic_guides/evaluation/clusters/how_to_compute_metrics.ipynb
@@ -65,6 +65,7 @@
     "The metrics computed by `compute_graph_metrics()` include all those mentioned in the [Graph metrics](./graph_metrics.md) chapter, namely:\n",
     "\n",
     "* Node degree\n",
+    "* Node centrality\n",
     "* 'Is bridge'\n",
     "* Cluster size\n",
     "* Cluster density\n",

diff --git a/splink/internals/graph_metrics.py b/splink/internals/graph_metrics.py
@@ -25,7 +25,7 @@ def _truncated_edges_sql(
     return sql_info
 
 
-def _node_degree_sql(
+def _node_degree_centralisation_sql(
     df_predict: SplinkDataFrame,
     df_clustered: SplinkDataFrame,
     composite_uid_edges_l: str,
@@ -34,7 +34,8 @@ def _node_degree_sql(
     threshold_match_probability: float,
 ) -> List[Dict[str, str]]:
     """
-    Generates sql for computing node degree per node, at a given edge threshold.
+    Generates sql for computing node degree and node centralisation (i.e.
+    normalised node degree) per node, at a given edge threshold.
 
     This includes nodes with no edges, as identified via the clusters table.
 
@@ -77,17 +78,26 @@ def _node_degree_sql(
     # join clusters table to capture edge-less nodes
     # want all clusters included so left join
     sql = f"""
+        WITH all_nodes AS (
         SELECT
             c.{composite_uid_clusters} AS composite_unique_id,
             c.cluster_id AS cluster_id,
-            COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree
+            COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree,
+            COUNT(*) OVER(PARTITION BY c.cluster_id) AS cluster_size
         FROM
             {df_clustered.physical_name} c
         LEFT JOIN
             {all_nodes_table_name} n
         ON
             c.{composite_uid_clusters} = n.node
-        GROUP BY composite_unique_id, cluster_id
+        GROUP BY composite_unique_id, cluster_id)
+
+        SELECT 
+            composite_unique_id,
+            cluster_id,
+            node_degree,
+            node_degree / (cluster_size - 1) AS node_centralisation
+        FROM all_nodes
     """
     sql_info = {"sql": sql, "output_table_name": "__splink__graph_metrics_nodes"}
     sqls.append(sql_info)

diff --git a/splink/internals/linker_components/clustering.py b/splink/internals/linker_components/clustering.py
@@ -8,7 +8,7 @@
 from splink.internals.edge_metrics import compute_edge_metrics
 from splink.internals.graph_metrics import (
     GraphMetricsResults,
-    _node_degree_sql,
+    _node_degree_centralisation_sql,
     _size_density_centralisation_sql,
 )
 from splink.internals.misc import (
@@ -349,17 +349,18 @@ def _compute_metrics_nodes(
 
         Node metrics produced:
         * node_degree (absolute number of neighbouring nodes)
+        * node_centralisation (proportion of neighbours wrt maximum possible number)
 
         Output table has a single row per input node, along with the cluster id (as
-        assigned in `linker.cluster_pairwise_at_threshold()`) and the metric
-        node_degree:
-
-        |-------------------------------------------------|
-        | composite_unique_id | cluster_id  | node_degree |
-        |---------------------|-------------|-------------|
-        | s1-__-10001         | s1-__-10001 | 6           |
-        | s1-__-10002         | s1-__-10001 | 4           |
-        | s1-__-10003         | s1-__-10003 | 2           |
+        assigned in `linker.cluster_pairwise_at_threshold()`) and the metrics
+        node_degree and node_centralisation:
+
+        |-----------------------------------------------------------------------|
+        | composite_unique_id | cluster_id  | node_degree | node_centralisation |
+        |---------------------|-------------|-------------|---------------------|
+        | s1-__-10001         | s1-__-10001 | 6           | 0.9                 |
+        | s1-__-10002         | s1-__-10001 | 4           | 0.6                 |
+        | s1-__-10003         | s1-__-10003 | 2           | 0.3                 |
         ...
         """
         uid_cols = (
@@ -371,7 +372,7 @@ def _compute_metrics_nodes(
         composite_uid_clusters = _composite_unique_id_from_nodes_sql(uid_cols)
 
         pipeline = CTEPipeline()
-        sqls = _node_degree_sql(
+        sqls = _node_degree_centralisation_sql(
             df_predict,
             df_clustered,
             composite_uid_edges_l,