Merge pull request #2635 from moj-analytical-services/add_to_cookbook

add nested example
moj-analytical-services · Feb 24, 2025 · c22d22d · c22d22d
2 parents 23068a8 + d1326c2
commit c22d22d
Showing 1 changed file with 232 additions and 0 deletions.
diff --git a/docs/demos/examples/duckdb_no_test/cookbook.ipynb b/docs/demos/examples/duckdb_no_test/cookbook.ipynb
@@ -957,6 +957,238 @@
     "\n",
     "pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Nested linkage\n",
+    "\n",
+    "In this example, we want to deduplicate persons but only within each company.\n",
+    "\n",
+    "The problem is that the companies themselves may be duplicates, so we proceed by deduplicating the companies first and then deduplicating persons nested within each company we resolved in step 1.\n",
+    "\n",
+    "Note that this example does not include full model training code, just a simple, illustrative model spec. The aim is to demonstrate the nested linkage process."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import splink.comparison_library as cl\n",
+    "from splink import DuckDBAPI, Linker, SettingsCreator, block_on\n",
+    "from splink.clustering import cluster_pairwise_predictions_at_threshold\n",
+    "\n",
+    "# Example data with companies and persons.\n",
+    "# Each row is written as a tuple in the column order given by record_columns\n",
+    "# and zipped into a dict, so every record fits on one line and near-duplicate\n",
+    "# rows are easy to compare by eye.\n",
+    "record_columns = (\n",
+    "    \"unique_id\",\n",
+    "    \"client_id\",\n",
+    "    \"company_name\",\n",
+    "    \"postcode\",\n",
+    "    \"person_firstname\",\n",
+    "    \"person_surname\",\n",
+    ")\n",
+    "record_rows = [\n",
+    "    (1001, \"GGN1\", \"Green Garden Nurseries Ltd\", \"NR1 1AB\", \"John\", \"Smith\"),\n",
+    "    (1002, \"GGN1\", \"Green Gardens Ltd\", \"NR1 1AB\", \"Sarah\", \"Jones\"),\n",
+    "    (1003, \"GGN2\", \"Green Garden Nurseries Ltd\", \"NR1 1AB\", \"John\", \"Smith\"),\n",
+    "    (3001, \"GW1\", \"Garden World\", \"LS2 3EF\", \"Emma\", \"Wilson\"),\n",
+    "    (3002, \"GW1\", \"Garden World UK\", \"LS2 3EF\", \"Emma\", \"Wilson\"),\n",
+    "    (3003, \"GW2\", \"Garden World\", \"LS2 3EF\", \"Emma\", \"Wilson\"),\n",
+    "    (3004, \"GW2\", \"Garden World\", \"LS2 3EF\", \"James\", \"Taylor\"),\n",
+    "]\n",
+    "company_person_records_list = [\n",
+    "    dict(zip(record_columns, row)) for row in record_rows\n",
+    "]\n",
+    "\n",
+    "# Keep the name `company_person_records`: the DuckDB load step later in this\n",
+    "# cell selects FROM company_person_records.\n",
+    "company_person_records = pd.DataFrame(company_person_records_list)\n",
+    "# Announce the plan: two linkage phases, companies first and then persons\n",
+    "banner_lines = (\n",
+    "    \"========== NESTED COMPANY-PERSON LINKAGE EXAMPLE ==========\",\n",
+    "    \"This example demonstrates a two-phase linkage process:\",\n",
+    "    \"1. First, link and cluster to find duplicate companies (client_id)\",\n",
+    "    \"2. Then, deduplicate persons ONLY within each company cluster\",\n",
+    ")\n",
+    "for banner_line in banner_lines:\n",
+    "    print(banner_line)\n",
+    "\n",
+    "# Initialize database: remove any file left by a previous run, then open a\n",
+    "# connection to a new on-disk DuckDB database at the same path\n",
+    "db_path = \"nested_linkage.ddb\"\n",
+    "if os.path.exists(db_path):\n",
+    "    os.remove(db_path)\n",
+    "con = duckdb.connect(db_path)\n",
+    "\n",
+    "# Load data into DuckDB.\n",
+    "# NOTE(review): on a fresh database the FROM clause presumably resolves to the\n",
+    "# pandas DataFrame of the same name defined earlier in this cell - confirm.\n",
+    "load_sql = (\n",
+    "    \"CREATE OR REPLACE TABLE company_person_records AS \"\n",
+    "    \"SELECT * FROM company_person_records\"\n",
+    ")\n",
+    "con.execute(load_sql)\n",
+    "\n",
+    "print(\"\\n--- PHASE 1: COMPANY LINKAGE ---\")\n",
+    "print(\"Company records to be linked:\")\n",
+    "company_records_relation = con.table(\"company_person_records\")\n",
+    "company_records_relation.show()\n",
+    "\n",
+    "# STEP 1: Find duplicate client_ids\n",
+    "\n",
+    "# Configure company linkage.\n",
+    "# The contact's name is compared alongside the company details because\n",
+    "# duplicate client_ids are likely to share the same contact. At this stage the\n",
+    "# entity being resolved is still the client, not the person.\n",
+    "fuzzy_company_columns = (\"person_firstname\", \"person_surname\", \"company_name\")\n",
+    "company_comparisons = [\n",
+    "    cl.ExactMatch(\"client_id\"),\n",
+    "    *(cl.JaroWinklerAtThresholds(column) for column in fuzzy_company_columns),\n",
+    "    cl.ExactMatch(\"postcode\"),\n",
+    "]\n",
+    "company_blocking_rules = [\n",
+    "    block_on(column) for column in (\"postcode\", \"company_name\")\n",
+    "]\n",
+    "\n",
+    "company_settings = SettingsCreator(\n",
+    "    link_type=\"dedupe_only\",\n",
+    "    unique_id_column_name=\"unique_id\",\n",
+    "    probability_two_random_records_match=0.001,\n",
+    "    comparisons=company_comparisons,\n",
+    "    blocking_rules_to_generate_predictions=company_blocking_rules,\n",
+    "    retain_matching_columns=True,\n",
+    ")\n",
+    "\n",
+    "# Score record pairs on the file-backed connection, keeping pairs whose match\n",
+    "# probability passes the 0.5 threshold\n",
+    "db_api = DuckDBAPI(connection=con)\n",
+    "company_linker = Linker(\"company_person_records\", company_settings, db_api)\n",
+    "company_inference = company_linker.inference\n",
+    "company_predictions = company_inference.predict(threshold_match_probability=0.5)\n",
+    "\n",
+    "print(\"\\nCompany pairwise matches:\")\n",
+    "company_matches_relation = company_predictions.as_duckdbpyrelation()\n",
+    "company_matches_relation.show()\n",
+    "\n",
+    "# Cluster companies: the nodes are the distinct client_ids and the edges are\n",
+    "# the scored record pairs re-expressed as pairs of client_ids\n",
+    "company_nodes = con.sql(\"SELECT DISTINCT client_id FROM company_person_records\")\n",
+    "predictions_table = company_predictions.physical_name\n",
+    "company_edges = con.sql(f\"\"\"\n",
+    "    SELECT\n",
+    "        client_id_l as n_1,\n",
+    "        client_id_r as n_2,\n",
+    "        match_probability\n",
+    "    FROM {predictions_table}\n",
+    "\"\"\")\n",
+    "\n",
+    "# Perform company clustering with a 0.5 match-probability threshold\n",
+    "edge_columns = {\n",
+    "    \"edge_id_column_name_left\": \"n_1\",\n",
+    "    \"edge_id_column_name_right\": \"n_2\",\n",
+    "}\n",
+    "company_clusters = cluster_pairwise_predictions_at_threshold(\n",
+    "    company_nodes,\n",
+    "    company_edges,\n",
+    "    db_api=db_api,\n",
+    "    node_id_column_name=\"client_id\",\n",
+    "    threshold_match_probability=0.5,\n",
+    "    **edge_columns,\n",
+    ")\n",
+    "\n",
+    "# Add company cluster IDs to original records.\n",
+    "# The clustering result is registered on the connection under a fixed name so\n",
+    "# the SQL below can join against it.\n",
+    "company_clusters_ddb = company_clusters.as_duckdbpyrelation()\n",
+    "con.register(\"company_clusters_ddb\", company_clusters_ddb)\n",
+    "\n",
+    "# CREATE OR REPLACE (the same form used to load company_person_records) keeps\n",
+    "# this step from failing if the table already exists on this connection.\n",
+    "# The LEFT JOIN keeps every input record, even one without a cluster row.\n",
+    "sql = \"\"\"\n",
+    "CREATE OR REPLACE TABLE records_with_company_cluster AS\n",
+    "SELECT cr.*,\n",
+    "       cc.cluster_id as company_cluster_id\n",
+    "FROM company_person_records cr\n",
+    "LEFT JOIN company_clusters_ddb cc\n",
+    "ON cr.client_id = cc.client_id\n",
+    "\"\"\"\n",
+    "con.execute(sql)\n",
+    "print(\"Records with company cluster:\")\n",
+    "con.table(\"records_with_company_cluster\").show()\n",
+    "\n",
+    "# Not needed for the linkage itself, just to see what's happening:\n",
+    "# list the client_ids and company names that ended up in each cluster\n",
+    "print(\"\\nCompany clustering results:\")\n",
+    "con.sql(\"\"\"\n",
+    "SELECT\n",
+    "    company_cluster_id,\n",
+    "    array_agg(DISTINCT client_id) as client_ids,\n",
+    "    array_agg(DISTINCT company_name) as company_names\n",
+    "FROM records_with_company_cluster\n",
+    "GROUP BY company_cluster_id\n",
+    "\"\"\").show()\n",
+    "\n",
+    "print(\"\\n--- PHASE 2: PERSON LINKAGE WITHIN COMPANIES ---\")\n",
+    "print(\"Now linking persons, but only within their company clusters\")\n",
+    "\n",
+    "# STEP 2: Link persons within company clusters\n",
+    "# A second connection isolates this step: the phase 1 database file is\n",
+    "# attached under the alias linkage_db and the enriched records are copied\n",
+    "# into the new connection's own table.\n",
+    "attach_sql = \"attach 'nested_linkage.ddb' as linkage_db\"\n",
+    "copy_sql = (\n",
+    "    \"create table records_with_company_cluster as \"\n",
+    "    \"select * from linkage_db.records_with_company_cluster\"\n",
+    ")\n",
+    "con2 = duckdb.connect()\n",
+    "con2.sql(attach_sql)\n",
+    "con2.execute(copy_sql)\n",
+    "db_api2 = DuckDBAPI(connection=con2)\n",
+    "\n",
+    "# Configure person linkage within company clusters.\n",
+    "# The model only has to tell people apart inside one company cluster, so two\n",
+    "# name comparisons are enough for this illustration.\n",
+    "person_name_columns = (\"person_firstname\", \"person_surname\")\n",
+    "person_comparisons = [\n",
+    "    cl.JaroWinklerAtThresholds(column) for column in person_name_columns\n",
+    "]\n",
+    "# Critical: Block on company_cluster_id to only compare within company\n",
+    "person_blocking_rules = [block_on(\"company_cluster_id\")]\n",
+    "\n",
+    "person_settings = SettingsCreator(\n",
+    "    link_type=\"dedupe_only\",\n",
+    "    probability_two_random_records_match=0.01,\n",
+    "    comparisons=person_comparisons,\n",
+    "    blocking_rules_to_generate_predictions=person_blocking_rules,\n",
+    "    retain_matching_columns=True,\n",
+    ")\n",
+    "\n",
+    "# Link persons within company clusters\n",
+    "person_linker = Linker(\"records_with_company_cluster\", person_settings, db_api2)\n",
+    "person_inference = person_linker.inference\n",
+    "person_predictions = person_inference.predict(threshold_match_probability=0.5)\n",
+    "\n",
+    "print(\"\\nPerson pairwise matches (within company clusters):\")\n",
+    "person_matches_relation = person_predictions.as_duckdbpyrelation()\n",
+    "person_matches_relation.show(max_width=1000)\n",
+    "\n",
+    "# Turn the pairwise person matches into clusters and show them by cluster_id\n",
+    "person_clustering = person_linker.clustering\n",
+    "person_clusters = person_clustering.cluster_pairwise_predictions_at_threshold(\n",
+    "    person_predictions, threshold_match_probability=0.5\n",
+    ")\n",
+    "\n",
+    "person_clusters_relation = person_clusters.as_duckdbpyrelation()\n",
+    "person_clusters_relation.sort(\"cluster_id\").show(max_width=1000)\n"
+   ]
   }
  ],
  "metadata": {