Commit a800990: Merge branch 'master' into master

ADBond authored Feb 11, 2025
2 parents a498ed0 + 0fd67de
Showing 16 changed files with 399 additions and 295 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pytest_duckdb.yml
@@ -8,6 +8,7 @@ on:
- "splink/**"
- "tests/**"
- "pyproject.toml"
- "poetry.lock"

jobs:
test:
1 change: 1 addition & 0 deletions .github/workflows/pytest_postgres.yml
@@ -8,6 +8,7 @@ on:
- "splink/**"
- "tests/**"
- "pyproject.toml"
- "poetry.lock"

jobs:
test:
1 change: 1 addition & 0 deletions .github/workflows/pytest_spark.yml
@@ -8,6 +8,7 @@ on:
- "splink/**"
- "tests/**"
- "pyproject.toml"
- "poetry.lock"

jobs:
test:
1 change: 1 addition & 0 deletions .github/workflows/pytest_sqlite.yml
@@ -8,6 +8,7 @@ on:
- "splink/**"
- "tests/**"
- "pyproject.toml"
- "poetry.lock"

jobs:
test:
1 change: 1 addition & 0 deletions .github/workflows/run_demos_examples.yml
@@ -10,6 +10,7 @@ on:
- "docs/demos/examples/**"
- "!docs/demos/examples/examples_index.md"
- "pyproject.toml"
- "poetry.lock"
workflow_dispatch:

jobs:
1 change: 1 addition & 0 deletions .github/workflows/run_demos_tutorials.yml
@@ -9,6 +9,7 @@ on:
- splink/**
- docs/demos/tutorials/**
- pyproject.toml
- "poetry.lock"

workflow_dispatch:

83 changes: 83 additions & 0 deletions docs/demos/examples/duckdb/cookbook.ipynb
@@ -874,6 +874,89 @@
"\n",
"linker.inference.predict().as_duckdbpyrelation().show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using a DuckDB UDF in a comparison level"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import difflib\n",
"\n",
"import duckdb\n",
"\n",
"import splink.comparison_level_library as cll\n",
"import splink.comparison_library as cl\n",
"from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets\n",
"\n",
"\n",
"def custom_partial_ratio(s1, s2):\n",
" \"\"\"Custom function to compute partial ratio similarity between two strings.\"\"\"\n",
" s1, s2 = str(s1), str(s2)\n",
" matcher = difflib.SequenceMatcher(None, s1, s2)\n",
" return matcher.ratio()\n",
"\n",
"\n",
"df = splink_datasets.fake_1000\n",
"\n",
"con = duckdb.connect()\n",
"con.create_function(\n",
" \"custom_partial_ratio\",\n",
" custom_partial_ratio,\n",
" [duckdb.typing.VARCHAR, duckdb.typing.VARCHAR],\n",
" duckdb.typing.DOUBLE,\n",
")\n",
"db_api = DuckDBAPI(connection=con)\n",
"\n",
"\n",
"fuzzy_email_comparison = {\n",
" \"output_column_name\": \"email_fuzzy\",\n",
" \"comparison_levels\": [\n",
" cll.NullLevel(\"email\"),\n",
" cll.ExactMatchLevel(\"email\"),\n",
" {\n",
" \"sql_condition\": \"custom_partial_ratio(email_l, email_r) > 0.8\",\n",
" \"label_for_charts\": \"Fuzzy match (≥ 0.8)\",\n",
" },\n",
" cll.ElseLevel(),\n",
" ],\n",
"}\n",
"\n",
"settings = SettingsCreator(\n",
" link_type=\"dedupe_only\",\n",
" comparisons=[\n",
" cl.ExactMatch(\"first_name\"),\n",
" cl.ExactMatch(\"surname\"),\n",
" cl.ExactMatch(\"dob\"),\n",
" cl.ExactMatch(\"city\").configure(term_frequency_adjustments=True),\n",
" fuzzy_email_comparison,\n",
" ],\n",
" blocking_rules_to_generate_predictions=[\n",
" block_on(\"first_name\"),\n",
" block_on(\"surname\"),\n",
" ],\n",
" max_iterations=2,\n",
")\n",
"\n",
"linker = Linker(df, settings, db_api)\n",
"\n",
"linker.training.estimate_probability_two_random_records_match(\n",
" [block_on(\"first_name\", \"surname\")], recall=0.7\n",
")\n",
"\n",
"linker.training.estimate_u_using_random_sampling(max_pairs=1e5)\n",
"\n",
"linker.training.estimate_parameters_using_expectation_maximisation(block_on(\"dob\"))\n",
"\n",
"pairwise_predictions = linker.inference.predict(threshold_match_weight=-10)\n"
]
}
],
"metadata": {
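For readability outside the notebook JSON, the core pattern in the new cookbook cell is: register an ordinary Python function as a DuckDB UDF on a connection, hand that connection to `DuckDBAPI`, and reference the UDF from a custom comparison level's `sql_condition`. The sketch below condenses the cell above and introduces no API beyond what that cell already uses.

```python
import difflib

import duckdb

import splink.comparison_level_library as cll
from splink import DuckDBAPI


def custom_partial_ratio(s1, s2):
    """Similarity of two strings via difflib's SequenceMatcher."""
    matcher = difflib.SequenceMatcher(None, str(s1), str(s2))
    return matcher.ratio()


# Register the Python function as a scalar UDF on a DuckDB connection,
# then pass that same connection to Splink's DuckDB backend.
con = duckdb.connect()
con.create_function(
    "custom_partial_ratio",
    custom_partial_ratio,
    [duckdb.typing.VARCHAR, duckdb.typing.VARCHAR],
    duckdb.typing.DOUBLE,
)
db_api = DuckDBAPI(connection=con)

# The registered UDF can now appear in the SQL of a custom comparison level.
fuzzy_email_comparison = {
    "output_column_name": "email_fuzzy",
    "comparison_levels": [
        cll.NullLevel("email"),
        cll.ExactMatchLevel("email"),
        {
            "sql_condition": "custom_partial_ratio(email_l, email_r) > 0.8",
            "label_for_charts": "Fuzzy match (partial ratio > 0.8)",
        },
        cll.ElseLevel(),
    ],
}
```

Because the UDF is evaluated row by row in Python, expect it to be slower than DuckDB's built-in string similarity functions; the pattern is most useful when no built-in function captures the comparison you need.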
24 changes: 13 additions & 11 deletions docs/index.md
@@ -62,31 +62,33 @@ Here is a list of some of our known users and their use cases:

- [Ministry of Justice](https://www.gov.uk/government/organisations/ministry-of-justice) created [linked datasets (combining courts, prisons and probation data)](https://www.adruk.org/our-work/browse-all-projects/data-first-harnessing-the-potential-of-linked-administrative-data-for-the-justice-system-169/) for use by researchers as part of the [Data First programme](https://www.gov.uk/guidance/ministry-of-justice-data-first)
- [Office for National Statistics](https://www.ons.gov.uk/)'s [Business Index](https://unece.org/sites/default/files/2023-04/ML2023_S1_UK_Breton_A.pdf) (formerly the Inter Departmental Business Register), [Demographic Index](https://uksa.statisticsauthority.gov.uk/wp-content/uploads/2023/02/EAP182-Quality-work-for-Demographic-Index-MDQA.pdf) and the [2021 Census](https://github.com/Data-Linkage/Splink-census-linkage/blob/main/SplinkCaseStudy.pdf). See also [this article](https://www.government-transformation.com/data/interview-modernizing-public-sector-insight-through-automated-linkage).
- [Ministry of Defence](https://www.gov.uk/government/organisations/ministry-of-defence) recently launched their [Veteran's Card system](https://www.gov.uk/government/news/hm-armed-forces-veteran-cards-will-officially-launch-in-the-new-year-following-a-successful-assessment-from-the-central-digital-and-data-office) which uses Splink to verify applicants against historic records. This project was shortlisted for the [Civil Service Awards](https://www.civilserviceawards.com/creative-solutions-award/)
- [Ministry of Defence](https://www.gov.uk/government/organisations/ministry-of-defence) launched their [Veteran's Card system](https://www.gov.uk/government/news/hm-armed-forces-veteran-cards-will-officially-launch-in-the-new-year-following-a-successful-assessment-from-the-central-digital-and-data-office) which uses Splink to verify applicants against historic records. This project was shortlisted for the [Civil Service Awards](https://www.civilserviceawards.com/creative-solutions-award/)
- [UK Health Security Agency](https://www.gov.uk/government/organisations/uk-health-security-agency) [used Splink](https://www.gov.uk/government/publications/bloodborne-viruses-opt-out-testing-in-emergency-departments/appendix-for-emergency-department-bloodborne-virus-opt-out-testing-12-month-interim-report-2023#:~:text=Appendix%202D%3A%20public%20health%20evaluation%20data%20linkage%20methodology) to link HIV testing data to national health records to [evaluate the impact of emergency department opt-out bloodborne virus testing](https://www.gov.uk/government/publications/bloodborne-viruses-opt-out-testing-in-emergency-departments/public-health-evaluation-of-bbv-opt-out-testing-in-eds-in-england-24-month-interim-report).
- The Department for Education uses Splink to match records from certain data providers to existing learners and reduce the volume of clerical work required for corrections
- [SAIL Databank](https://saildatabank.com/), in collaboration with [Secure eResearch Platform (SeRP)](https://serp.ac.uk/), uses Splink to produce linked cohorts for a wide range of population-level research applications
- [Lewisham Council](https://lewisham.gov.uk/) (London) [identified and auto-enrolled over 500 additional eligible families](https://lewisham.gov.uk/articles/news/extra-funding-for-lewisham-schools-in-pilot-data-project) to receive Free School Meals
- [Integrated Corporate Services](https://icsdigital.blog.gov.uk/2024/05/24/introducing-ics-digital/) have used Splink to match address data in historical datasets, substantially improving match rates.
- [London Office of Technology and Innovation](https://loti.london/) created a dashboard to help [better measure and reduce rough sleeping](https://loti.london/projects/rough-sleeping-insights-project/) across London
- [Competition and Markets Authority](https://www.gov.uk/government/organisations/competition-and-markets-authority) identified ['Persons with Significant Control' and estimated ownership groups](https://assets.publishing.service.gov.uk/media/626ab6c4d3bf7f0e7f9d5a9b/220426_Annex_-State_of_Competition_Appendices_FINAL.pdf) across companies
- [Office for Health Improvement and Disparities](https://www.gov.uk/government/organisations/office-for-health-improvement-and-disparities) linked Health and Justice data to [assess the pathways between probation and specialist alcohol and drug treatment services](https://www.gov.uk/government/statistics/pathways-between-probation-and-addiction-treatment-in-england#:~:text=Details,of%20Health%20and%20Social%20Care) as part of the [Better Outcomes through Linked Data programme](https://www.gov.uk/government/publications/ministry-of-justice-better-outcomes-through-linked-data-bold)
- [Gateshead Council](https://www.gateshead.gov.uk/), in partnership with the [National Innovation Centre for Data](https://www.nicd.org.uk/) are creating a [single view of debt](https://nicd.org.uk/knowledge-hub/an-end-to-end-guide-to-overcoming-unique-identifier-challenges-with-splink)
- [SAIL Databank](https://saildatabank.com/), in collaboration with [Secure eResearch Platform (SeRP)](https://serp.ac.uk/), uses Splink to produce linked cohorts for a wide range of population-level research applications
- [Integrated Corporate Services](https://icsdigital.blog.gov.uk/2024/05/24/introducing-ics-digital/), have used Splink to match address data in historical datasets, substantially improving match rates.


=== "Public Sector (International)"

- The German Federal Statistical Office ([Destatis](https://www.destatis.de/EN/Home/_node.html)) uses Splink to conduct projects in linking register-based census data.
- The [European Medicines Agency](https://www.ema.europa.eu/en/homepage) uses Splink to detect duplicate adverse event reports for veterinary medicines
- The Defense Health Agency (US Department of Defense) used Splink to identify duplicated hospital records across over 200 million data points in the military hospital data system
- [Chilean Ministry of Health](https://www.gob.cl/en/ministries/ministry-of-health/) and [University College London](https://www.ucl.ac.uk/) have [assessed the access to immunisation programs among the migrant population](https://ijpds.org/article/view/2348)
- [Florida Cancer Registry](https://www.floridahealth.gov/diseases-and-conditions/cancer/cancer-registry/index.html), published a [feasibility study](https://scholar.googleusercontent.com/scholar?q=cache:sADwxy-D75IJ:scholar.google.com/+splink+florida&hl=en&as_sdt=0,5) which showed Splink was faster and more accurate than alternatives
- [UNHCR](unhcr.org) uses Splink to analyse and enhance the quality of datasets by identifying and addressing potential duplicates.
- [Catalyst Cooperative](https://catalyst.coop)'s [Public Utility Data Liberation Project](https://github.com/catalyst-cooperative/pudl) links public financial and operational data from electric utilities for use by US climate advocates, policymakers, and researchers seeking to accelerate the transition away from fossil fuels.
- 🇦🇺 The Australian Bureau of Statistics (ABS) used Splink to build the 2024 National Linkage Spine underpinning the [National Disability Data Asset](https://www.abs.gov.au/about/data-services/data-integration/integrated-data/national-disability-data-asset) and will use Splink for the 2025 [Person Linkage Spine](https://www.abs.gov.au/about/data-services/data-integration/person-linkage-spine) build. They are also planning to use Splink for the Post Enumeration Survey as part of the 2026 Census quality assurance process.
- 🇩🇪 The German Federal Statistical Office ([Destatis](https://www.destatis.de/EN/Home/_node.html)) uses Splink to conduct projects in linking register-based census data.
- 🇪🇺 The [European Medicines Agency](https://www.ema.europa.eu/en/homepage) uses Splink to detect duplicate adverse event reports for veterinary medicines
- 🇺🇸 The Defense Health Agency (US Department of Defense) used Splink to identify duplicated hospital records across over 200 million data points in the military hospital data system
- 🌐 [UNHCR](unhcr.org) uses Splink to analyse and enhance the quality of datasets by identifying and addressing potential duplicates.
- 🇨🇦 The Data Integration Unit at the [Ontario Ministry of Children, Community, and Social Services](https://www.ontario.ca/page/ministry-children-community-and-social-services) are using Splink as their main data-integration tool for all intra- and inter-ministerial data-linking projects.
- 🇨🇱🇬🇧 [Chilean Ministry of Health](https://www.gob.cl/en/ministries/ministry-of-health/) and [University College London](https://www.ucl.ac.uk/) have [assessed the access to immunisation programs among the migrant population](https://ijpds.org/article/view/2348)
- 🇺🇸 [Florida Cancer Registry](https://www.floridahealth.gov/diseases-and-conditions/cancer/cancer-registry/index.html), published a [feasibility study](https://scholar.googleusercontent.com/scholar?q=cache:sADwxy-D75IJ:scholar.google.com/+splink+florida&hl=en&as_sdt=0,5) which showed Splink was faster and more accurate than alternatives
- 🇺🇸 [Catalyst Cooperative](https://catalyst.coop)'s [Public Utility Data Liberation Project](https://github.com/catalyst-cooperative/pudl) links public financial and operational data from electric utilities for use by US climate advocates, policymakers, and researchers seeking to accelerate the transition away from fossil fuels.

=== "Academia"

- [Stanford University](https://www.stanford.edu/) investigated the impact of [receiving government assistance has on political attitudes](https://www.cambridge.org/core/journals/american-political-science-review/article/abs/does-receiving-government-assistance-shape-political-attitudes-evidence-from-agricultural-producers/39552BC5A496EAB6CB484FCA51C6AF21)
- Researchers from [Harvard Medical School](https://hms.harvard.edu/), [Vanderbilt University Medical Center](https://www.vumc.org/) and [Brigham and Women's Hospital](https://www.brighamandwomens.org/) published a study on [augmenting death ascertainment in electronic health records using publicly available internet media sources](https://www.medrxiv.org/content/medrxiv/early/2025/01/27/2025.01.24.25321042.full.pdf).
- [Bern University](https://arbor.bfh.ch/) researched how [Active Learning can be applied to Biomedical Record Linkage](https://ebooks.iospress.nl/doi/10.3233/SHTI230545)

=== "Other"
24 changes: 12 additions & 12 deletions docs/topic_guides/performance/optimising_spark.md
@@ -22,13 +22,13 @@ It is assumed readers have already read the more general [guide to linking big d
For a cluster with 10 CPUs, that outputs about 8GB of data in parquet format, the following setup may be appropriate:

```python
from splink import SparkAPI

spark.conf.set("spark.default.parallelism", "50")
spark.conf.set("spark.sql.shuffle.partitions", "50")

linker = Linker(
person_standardised_nodes,
settings,
db_api=spark_api,
db_api = SparkAPI(
spark_session=spark,
break_lineage_method="parquet",
num_partitions_on_repartition=80,
)
@@ -45,17 +45,17 @@ Splink will automatically break lineage in sensible places. We have found in pra

You can do this using the `break_lineage_method` parameter as follows:

```
linker = Linker(
person_standardised_nodes,
settings,
db_api=db_api,
break_lineage_method="parquet"
)
```python
from splink import SparkAPI

db_api = SparkAPI(
spark_session=spark,
break_lineage_method="parquet",
num_partitions_on_repartition=80,
)
```

Other options are `checkpoint` and `persist`. For different Spark setups, particularly if you have fast local storage, you may find these options perform better.
Other options are `checkpoint` and `persist`, plus a [few others](https://github.com/moj-analytical-services/splink/blob/2ed9f8bf2a21fffafa14e3bb848aa69370043e33/splink/internals/spark/database_api.py#L34) for databricks. For different Spark setups, particularly if you have fast local storage, you may find these options perform better.
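If you do opt for the `checkpoint` method, a minimal sketch might look like the following. It assumes the same `SparkAPI` signature as above and an existing `spark` session; the checkpoint directory path is illustrative, and whether you need to set it yourself may depend on your cluster configuration.

```python
from splink import SparkAPI

# Spark checkpointing materialises intermediate results in a checkpoint
# directory, so point it at storage visible to all executors.
# The path below is hypothetical; substitute your own.
spark.sparkContext.setCheckpointDir("hdfs:///tmp/splink_checkpoints")

db_api = SparkAPI(
    spark_session=spark,
    break_lineage_method="checkpoint",
    num_partitions_on_repartition=80,
)
```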

## Spark Parallelism

1 change: 0 additions & 1 deletion mkdocs.yml
@@ -262,7 +262,6 @@ extra_css:
- css/neoteroi-mkdocs.css
extra_javascript:
- javascripts/mathjax.js
- https://polyfill.io/v3/polyfill.min.js?features=es6
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
- https://cdn.jsdelivr.net/npm/vega@5
- https://cdn.jsdelivr.net/npm/vega-lite@5