add node centrality to graph metrics #2618

Merged · 11 commits · Feb 13, 2025
22 changes: 22 additions & 0 deletions docs/topic_guides/evaluation/clusters/graph_metrics.md
@@ -41,6 +41,28 @@ However, erroneous links (false positives) could also be the reason for _high_ n

It is important to consider [cluster size](#cluster-size) when looking at node degree. By definition, larger clusters contain more nodes to form links between, allowing nodes within them to attain higher degrees compared to those in smaller clusters. Consequently, low node degree within larger clusters can carry greater significance.

Bear in mind that the degree of a single node in a cluster isn't necessarily representative of the overall connectedness of the cluster. This is where [cluster centralisation](#cluster-centralisation) can help.

### Node Centrality

##### Definition

Node centrality is the **proportion of all possible edges connected to a node**. It can also be interpreted as a normalised node degree: the proportion of the other nodes in the cluster that a given node is directly linked to. Centrality ranges from 0 to 1. A centrality of 1 means a node is connected to all other nodes in its cluster.
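Equivalently, for a node in a cluster of $n$ nodes:

$$
\text{node centrality} = \frac{\text{node degree}}{n - 1}
$$

(A cluster consisting of a single node has no possible edges; such a node is assigned a centrality of 0.)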

##### Example

In the cluster below, node B is connected to all four other nodes (giving a centrality of 1), whereas node A is connected to only 1 of the 4 other nodes (giving a centrality of 0.25).

![](../../../img/clusters/basic_graph_records.drawio.png){:width="80%"}
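To make the arithmetic concrete, here is a minimal sketch using `networkx` (independent of Splink); the edge list is an assumption standing in for the diagram, but any extra edges among C, D and E would not change the values for A and B:

```python
import networkx as nx

# Hypothetical edge list standing in for the diagram above:
# B is linked to every other node, while A is linked only to B.
edges = [("A", "B"), ("B", "C"), ("B", "D"), ("B", "E"), ("C", "D")]
G = nx.Graph(edges)

# networkx's degree_centrality is node_degree / (number of nodes - 1)
centrality = nx.degree_centrality(G)
print(centrality["B"])  # 1.0  -- connected to all 4 other nodes
print(centrality["A"])  # 0.25 -- connected to 1 of the 4 other nodes
```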

##### Application in Data Linkage

High node centrality is generally considered good, as it means the node is directly connected to many of the other nodes in its cluster. Low node centrality (particularly relative to the rest of the nodes in the cluster) can be indicative of a false link (false positive).

Unlike node degree, centrality takes the cluster size into account and, being normalised, is more appropriate for comparing nodes across clusters.

Node centrality can also be useful because the node with the highest centrality in a cluster can be chosen to represent that cluster (sometimes known as a "golden record"). This is not appropriate in all cases, but the most connected node within a cluster will likely have much in common with the other nodes, as the sketch below illustrates.
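As a sketch (assuming `compute_graph_metrics()` has been run, and that `linker`, `df_predict` and `df_clustered` already exist from earlier prediction and clustering steps), the most central node per cluster could be extracted like this:

```python
# Sketch: pick the most central node in each cluster as a candidate
# "golden record". `linker`, `df_predict` and `df_clustered` are assumed
# to come from earlier prediction/clustering steps.
graph_metrics = linker.clustering.compute_graph_metrics(
    df_predict, df_clustered, threshold_match_probability=0.95
)
node_metrics = graph_metrics.nodes.as_pandas_dataframe()

# idxmax returns, per cluster, the row index of the highest centrality
golden_records = node_metrics.loc[
    node_metrics.groupby("cluster_id")["node_centrality"].idxmax()
]
```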

Bear in mind that the centrality of a single node in a cluster isn't necessarily representative of the overall connectedness of the cluster. This is where [cluster centralisation](#cluster-centralisation) can help.

<hr>
@@ -65,6 +65,7 @@
"The metrics computed by `compute_graph_metrics()` include all those mentioned in the [Graph metrics](./graph_metrics.md) chapter, namely:\n",
"\n",
"* Node degree\n",
"* Node centrality\n",
"* 'Is bridge'\n",
"* Cluster size\n",
"* Cluster density\n",
25 changes: 22 additions & 3 deletions splink/internals/graph_metrics.py
@@ -25,7 +25,7 @@ def _truncated_edges_sql(
return sql_info


def _node_degree_sql(
def _node_degree_centralisation_sql(
df_predict: SplinkDataFrame,
df_clustered: SplinkDataFrame,
composite_uid_edges_l: str,
@@ -34,7 +34,8 @@
threshold_match_probability: float,
) -> List[Dict[str, str]]:
"""
Generates sql for computing node degree per node, at a given edge threshold.
Generates sql for computing node degree and node centrality (i.e.
normalised node degree) per node, at a given edge threshold.

This includes nodes with no edges, as identified via the clusters table.

@@ -80,7 +81,8 @@
SELECT
c.{composite_uid_clusters} AS composite_unique_id,
c.cluster_id AS cluster_id,
COUNT(*) FILTER (WHERE neighbour IS NOT NULL) AS node_degree
COUNT(*) FILTER (WHERE n.neighbour IS NOT NULL) AS node_degree,
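-- the window count is evaluated after GROUP BY, so it counts one
-- grouped row per node in the cluster, i.e. the cluster size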
COUNT(*) OVER(PARTITION BY c.cluster_id) AS cluster_size
FROM
{df_clustered.physical_name} c
LEFT JOIN
@@ -89,8 +91,25 @@
c.{composite_uid_clusters} = n.node
GROUP BY composite_unique_id, cluster_id
"""
node_degree_table_name = "__splink__graph_metrics_node_degree"
sql_info = {"sql": sql, "output_table_name": node_degree_table_name}
sqls.append(sql_info)

# calculate node centrality
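# centrality = node_degree / (cluster_size - 1); a singleton cluster has no
# possible edges, so its only node is assigned a centrality of 0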
sql = f"""
SELECT
composite_unique_id,
cluster_id,
node_degree,
CASE
WHEN cluster_size > 1 THEN (1.0 * node_degree) / (cluster_size - 1)
ELSE 0
END AS node_centrality
FROM {node_degree_table_name}
"""
sql_info = {"sql": sql, "output_table_name": "__splink__graph_metrics_nodes"}
sqls.append(sql_info)

return sqls


23 changes: 12 additions & 11 deletions splink/internals/linker_components/clustering.py
@@ -8,7 +8,7 @@
from splink.internals.edge_metrics import compute_edge_metrics
from splink.internals.graph_metrics import (
GraphMetricsResults,
_node_degree_sql,
_node_degree_centralisation_sql,
_size_density_centralisation_sql,
)
from splink.internals.misc import (
@@ -349,17 +349,18 @@ def _compute_metrics_nodes(

Node metrics produced:
* node_degree (absolute number of neighbouring nodes)
* node_centrality (proportion of neighbours with respect to the maximum possible number)

Output table has a single row per input node, along with the cluster id (as
assigned in `linker.cluster_pairwise_at_threshold()`) and the metric
node_degree:

|-------------------------------------------------|
| composite_unique_id | cluster_id | node_degree |
|---------------------|-------------|-------------|
| s1-__-10001 | s1-__-10001 | 6 |
| s1-__-10002 | s1-__-10001 | 4 |
| s1-__-10003 | s1-__-10003 | 2 |
assigned in `linker.cluster_pairwise_at_threshold()`) and the metrics
node_degree and node_centrality:

|-------------------------------------------------------------------|
| composite_unique_id | cluster_id  | node_degree | node_centrality |
|---------------------|-------------|-------------|-----------------|
| s1-__-10001         | s1-__-10001 | 6           | 0.9             |
| s1-__-10002         | s1-__-10001 | 4           | 0.6             |
| s1-__-10003         | s1-__-10003 | 2           | 0.3             |
...
"""
uid_cols = (
@@ -371,7 +372,7 @@
composite_uid_clusters = _composite_unique_id_from_nodes_sql(uid_cols)

pipeline = CTEPipeline()
sqls = _node_degree_sql(
sqls = _node_degree_centralisation_sql(
df_predict,
df_clustered,
composite_uid_edges_l,
61 changes: 33 additions & 28 deletions tests/test_graph_metrics.py
@@ -190,42 +190,42 @@ def test_metrics(dialect, test_helpers):
+ [{"cluster_id": 5, "unique_id": i} for i in range(24, 24 + 1)]
)

expected_node_degrees = [
expected_node_metrics = [
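# each tuple is (unique_id, expected node_degree, expected node_centrality)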
# cluster 1
# max degree 3
# centralisation = (1 + 2 + 1)/(3 * 2)
(1, 3),
(2, 2),
(3, 1),
(4, 2),
(1, 3, 1.0),
(2, 2, 2.0 / 3),
(3, 1, 1.0 / 3),
(4, 2, 2.0 / 3),
# cluster 2
# centralisation = (2 + 1 + 2 + 1 + 2)/(5 * 4)
(5, 3),
(6, 1),
(7, 2),
(8, 1),
(9, 2),
(10, 1),
(5, 3, 0.6),
(6, 1, 0.2),
(7, 2, 0.4),
(8, 1, 0.2),
(9, 2, 0.4),
(10, 1, 0.2),
# cluster 3
# centralisation = NULL
(11, 1),
(12, 1),
(11, 1, 1.0),
(12, 1, 1.0),
# cluster 4
# centralisation = (3 + 2 + 1 + 3 + 1 + 4 + 3 + 4 + 3 + 4)/(10*9)
(13, 6),
(14, 3),
(15, 4),
(16, 5),
(17, 3),
(18, 5),
(19, 2),
(20, 3),
(21, 2),
(22, 3),
(23, 2),
(13, 6, 0.6),
(14, 3, 0.3),
(15, 4, 0.4),
(16, 5, 0.5),
(17, 3, 0.3),
(18, 5, 0.5),
(19, 2, 0.2),
(20, 3, 0.3),
(21, 2, 0.2),
(22, 3, 0.3),
(23, 2, 0.2),
# cluster 5
# centralisation = NULL
(24, 0),
(24, 0, 0.0),
]

# pass in dummy frame to linker
@@ -280,13 +280,18 @@

df_nm = cm.nodes.as_pandas_dataframe()

for unique_id, expected_node_degree in expected_node_degrees:
for unique_id, expected_degree, expected_centrality in expected_node_metrics:
relevant_row = df_nm[df_nm["composite_unique_id"] == unique_id]
calculated_node_degree = relevant_row["node_degree"].iloc[0]
assert calculated_node_degree == expected_node_degree, (
f"Expected node degree {expected_node_degree} for node {unique_id}, "
assert calculated_node_degree == expected_degree, (
f"Expected node degree {expected_degree} for node {unique_id}, "
f"but found node degree {calculated_node_degree}"
)
calculated_node_centrality = relevant_row["node_centrality"].iloc[0]
assert float(calculated_node_centrality) == approx(expected_centrality), (
f"Expected node centrality {expected_centrality} for node {unique_id}, "
f"but found node centrality {calculated_node_centrality}"
)


def make_edge_row(
Expand Down