From 0f0f186a87a33b5f80c413bae9106c68cf18b9f9 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Fri, 6 Dec 2024 14:30:22 +0000
Subject: [PATCH 1/5] update LLM generation scripts

---
 scripts/create_llm_prompt_long.py  | 61 +++++++++++++++++++++++-------
 scripts/create_llm_prompt_short.py | 43 ++++++++++++++++++++-
 2 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/scripts/create_llm_prompt_long.py b/scripts/create_llm_prompt_long.py
index 3f1e2ffb8f..f742a2c47b 100644
--- a/scripts/create_llm_prompt_long.py
+++ b/scripts/create_llm_prompt_long.py
@@ -1,6 +1,7 @@
 import inspect
 import logging
 import os
+import urllib.request

 import nbformat

@@ -137,9 +138,7 @@ def extract_all_function_docstrings_from_module(module):
     return docstrings


-def save_docstrings_with_append(
-    docstrings, docstring_filename="llm_context_long.txt", append_filenames=None
-):
+def save_docstrings_with_append(docstrings, docstring_filename, append_filenames=None):
     append_content = ""
     if append_filenames:
         for filename in append_filenames:
@@ -196,7 +195,7 @@ def save_docstrings_with_append(

     print(  # NOQA: T201
         "\nDocstrings extracted, saved, and organized into sections in "
-        "llm_context_long.txt"
+        f"{docstring_filename}"
     )


@@ -217,8 +216,32 @@ def extract_and_append_md_content(md_files, docstring_filename):
             print(f"Warning: File {full_path} not found.")  # NOQA: T201


+def fetch_url_content(url):
+    with urllib.request.urlopen(url) as response:
+        return response.read().decode("utf-8")
+
+
+additional_instructions_to_llms = """
+If the user asks for Splink code, when writing Splink code use ONLY
+functions and methods which you've seen used in the context provided. Do NOT guess at
+other Splink methods or functions.
+
+If you don't know how to do something, put a comment referring them to
+https://moj-analytical-services.github.io/splink
+
+If the user asks for data cleaning, do it in their chosen backend (spark, duckdb)
+rather than pandas. In general avoid pandas unless specifically asked for.
+
+In most cases, you should assume the incoming data is not clean and suggest some simple
+data cleaning steps.
+""" + # Main execution if __name__ == "__main__": + # Clear the output file at start + output_filename = "llm_context_long.txt" + open(output_filename, "w").close() + # Extract docstrings for all public methods in specified Linker submodules linker_docstrings = extract_method_docstrings( linker, @@ -277,7 +300,7 @@ def extract_and_append_md_content(md_files, docstring_filename): print("Extracting and saving docstrings...") # NOQA: T201 save_docstrings_with_append( all_docstrings, - "llm_context_long.txt", + output_filename, append_filenames=[ "../docs/api_docs/settings_dict_guide.md", "../docs/api_docs/datasets.md", @@ -288,8 +311,8 @@ def extract_and_append_md_content(md_files, docstring_filename): demos_examples_dir = "../docs/demos/examples" demos_tutorials_dir = "../docs/demos/tutorials" - extract_and_append_notebook_content(demos_examples_dir, "llm_context_long.txt") - extract_and_append_notebook_content(demos_tutorials_dir, "llm_context_long.txt") + extract_and_append_notebook_content(demos_examples_dir, output_filename) + extract_and_append_notebook_content(demos_tutorials_dir, output_filename) # New part: Append content from specified Markdown files mds_to_append = [ @@ -304,9 +327,21 @@ def extract_and_append_md_content(md_files, docstring_filename): "/docs/topic_guides/performance/performance_evaluation.md", "/docs/api_docs/settings_dict_guide.md", ] - extract_and_append_md_content(mds_to_append, "llm_context_long.txt") - - print( # NOQA: T201 - "Docstrings extracted, saved, and all specified content " - "appended to llm_context_long.txt" - ) + extract_and_append_md_content(mds_to_append, output_filename) + + # Fetch and append content from the URL + url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md" + splink_tips = fetch_url_content(url) + with open(output_filename, "a", encoding="utf-8") as f: + f.write("\n\nSplink Tips:\n") + f.write(splink_tips) + + # Append additional instructions to the output file + with open(output_filename, "a", encoding="utf-8") as f: + f.write("\n\nIMPORTANT Instructions to LLMs:\n") + f.write(additional_instructions_to_llms) + + print( + "Docstrings extracted, saved, and all specified content including tips and " + f"instructions appended to {output_filename}" + ) # NOQA: T201 diff --git a/scripts/create_llm_prompt_short.py b/scripts/create_llm_prompt_short.py index fff08bb7ce..bd8de781fd 100644 --- a/scripts/create_llm_prompt_short.py +++ b/scripts/create_llm_prompt_short.py @@ -1,4 +1,5 @@ import os +import urllib.request import nbformat @@ -24,6 +25,11 @@ def extract_and_append_notebook_code(base_dir, output_filename): for file in files: if file.endswith(".ipynb") and not file.endswith("-checkpoint.ipynb"): notebook_path = os.path.join(root, file) + # Skip files with athena or sqlite in path + if any(x in notebook_path.lower() for x in ["athena", "sqlite"]): + print(f"Skipping {notebook_path} due to athena/sqlite...") # noqa: T201 + continue + if ".ipynb_checkpoints" not in notebook_path: print(f"Processing {notebook_path}...") # noqa: T201 code = extract_notebook_code(notebook_path) @@ -53,9 +59,30 @@ def extract_and_append_md_content(md_files, output_filename): print(f"Warning: File {full_path} not found.") # noqa: T201 +def fetch_url_content(url): + with urllib.request.urlopen(url) as response: + return response.read().decode("utf-8") + + +additional_instructions_to_llms = """ +If the user asks for Splink code, when writing Splink code use ONLY +functions 
and methods which you've seen used in the context provided. Do NOT guess at
+other Splink methods or functions.
+
+If you don't know how to do something, put a comment referring them to
+https://moj-analytical-services.github.io/splink
+
+If the user asks for data cleaning, do it in their chosen backend (spark, duckdb)
+rather than pandas. In general avoid pandas unless specifically asked for.
+
+In most cases, you should assume the incoming data is not clean and suggest some simple
+data cleaning steps.
+"""

 # Main execution
 if __name__ == "__main__":
     output_filename = "llm_context_short.txt"
+    open(output_filename, "w").close()

     # Extract and save Python code from notebooks in the specified directories
     demos_examples_dir = "../docs/demos/examples"
@@ -71,7 +98,19 @@ def extract_and_append_md_content(md_files, output_filename):
     ]
     extract_and_append_md_content(mds_to_append, output_filename)

+    # Fetch and append content from the URL
+    url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md"
+    splink_tips = fetch_url_content(url)
+    with open(output_filename, "a", encoding="utf-8") as f:
+        f.write("\n\nSplink Tips:\n")
+        f.write(splink_tips)
+
+    # Append additional instructions to the output file
+    with open(output_filename, "a", encoding="utf-8") as f:
+        f.write("\n\IMPORTANT Instructions to LLMs:\n")
+        f.write(additional_instructions_to_llms)
+
     print(  # noqa: T201
-        "Python code from notebooks and markdown content extracted and saved to "
-        "extracted_python_code_and_markdown.txt"
+        "Python code from notebooks, markdown content, Splink tips, and additional"
+        " instructions extracted and saved to llm_context_short.txt"
     )

From db3c724c8af75083c7f17ad7f93a61ecfe73ae8a Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Fri, 6 Dec 2024 14:57:37 +0000
Subject: [PATCH 2/5] add page

---
 docs/topic_guides/llms/prompting_llms.md | 99 ++++++++++++++++++++++++
 mkdocs.yml                               |  1 +
 2 files changed, 100 insertions(+)
 create mode 100644 docs/topic_guides/llms/prompting_llms.md

diff --git a/docs/topic_guides/llms/prompting_llms.md b/docs/topic_guides/llms/prompting_llms.md
new file mode 100644
index 0000000000..52bc3eca68
--- /dev/null
+++ b/docs/topic_guides/llms/prompting_llms.md
@@ -0,0 +1,99 @@
+# Using LLMs such as ChatGPT to help you write Splink code

We provide two files that summarise the Splink docs to help you use Splink with LLMs:

[Concise Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_short.txt) - around 22,000 tokens

[Comprehensive Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_long.txt) - around 72,000 tokens

At present, only the concise one is short enough for ChatGPT. The longer one fits in the current Claude long-context model (200k tokens).

### Recommended use

To use these files, we suggest copying and pasting their contents into the LLM, and then appending your question.

For instance, you may paste their contents and then ask:

```
I have a dataset I want to dedupe using duckdb

It has columns full_name, dob, occupation, zip

Do some preparatory data cleaning and run a splink script to train a model and dedupe
```

Which results in (ChatGPT `o1` model)
+Click to see example response + +```python +import duckdb +from splink import Linker, SettingsCreator, DuckDBAPI, block_on +import splink.comparison_library as cl + +# Load and clean data in DuckDB +con = duckdb.connect() +con.execute(""" + CREATE TABLE raw_data AS SELECT * FROM read_csv('your_data.csv', HEADER=TRUE) +""") + +# Example cleaning steps: lowercasing fields, removing punctuation from full_name, standardizing dob format +con.execute(""" + CREATE TABLE cleaned AS + SELECT + regexp_replace(lower(full_name), '[^a-z ]', '', 'g') as full_name, + regexp_replace(dob, '[^0-9-]', '', 'g') as dob, + lower(occupation) as occupation, + regexp_replace(lower(zip), '[^a-z0-9 ]', '', 'g') as zip + FROM raw_data +""") + +df = con.table("cleaned") + +settings = SettingsCreator( + link_type="dedupe_only", + blocking_rules_to_generate_predictions=[ + block_on("full_name"), + block_on("dob"), + block_on("zip"), + ], + comparisons=[ + cl.NameComparison("full_name"), + cl.DateOfBirthComparison("dob", input_is_string=True), + cl.ExactMatch("occupation").configure(term_frequency_adjustments=True), + cl.LevenshteinAtThresholds("zip", 2), + ], + retain_intermediate_calculation_columns=True, +) + +db_api = DuckDBAPI() +linker = Linker(df, settings, db_api=db_api) + +deterministic_rules = [ + "l.full_name = r.full_name and l.dob = r.dob", + "l.zip = r.zip and levenshtein(l.full_name, r.full_name) < 2" +] + +linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7) +linker.training.estimate_u_using_random_sampling(max_pairs=1e6) +linker.training.estimate_parameters_using_expectation_maximisation(block_on("full_name")) +linker.training.estimate_parameters_using_expectation_maximisation(block_on("dob")) + +predictions = linker.inference.predict(threshold_match_probability=0.9) +clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(predictions, threshold_match_probability=0.95) +``` +
+

!!! warning

    Whilst these prompts can be helpful to give you some initial code, there's **no guarantee** it will be correct.

    However, in many cases it can provide a useful starting point.

### How these prompts are generated

These files are generated with the following scripts, which you may wish to modify for your purposes:

[Create LLM Prompt Long Script](https://github.com/moj-analytical-services/splink/blob/master/scripts/create_llm_prompt_long.py)

[Create LLM Prompt Short Script](https://github.com/moj-analytical-services/splink/blob/master/scripts/create_llm_prompt_short.py)
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index 446e3ed634..ac45bb8d8c 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -225,6 +225,7 @@ nav:
       - Model Evaluation:
         - accuracy chart from labels table: "charts/accuracy_analysis_from_labels_table.ipynb"
         - threshold selection tool: charts/threshold_selection_tool_from_labels_table.ipynb
+      - LLM prompts: "topic_guides/llms/prompting_llms.md"
   - Contributing:
     - "dev_guides/index.md"
     - Contributing to Splink:

From 7d53c4e6942b8802a96660e0c992c847e96792b7 Mon Sep 17 00:00:00 2001
From: Robin Linacre
Date: Fri, 6 Dec 2024 14:59:37 +0000
Subject: [PATCH 3/5] update tutorial intro

---
 .../tutorials/00_Tutorial_Introduction.ipynb  | 26 ++++++++++++-------
 docs/getting_started.md                       |  1 +
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/docs/demos/tutorials/00_Tutorial_Introduction.ipynb b/docs/demos/tutorials/00_Tutorial_Introduction.ipynb
index 636bab6ff4..2395141ce7 100644
--- a/docs/demos/tutorials/00_Tutorial_Introduction.ipynb
+++ b/docs/demos/tutorials/00_Tutorial_Introduction.ipynb
@@ -46,15 +46,7 @@
    "\n",
    "Throughout the tutorial, we use the duckdb backend, which is the recommended option for smaller datasets of up to around 1 million records on a normal laptop.\n",
    "\n",
-   "You can find these tutorial notebooks in the `docs/demos/tutorials/` folder of the [splink repo](https://github.com/moj-analytical-services/splink/tree/master/docs/demos/tutorials), or click the Colab links to run in your browser.\n",
-   "\n",
-   "\n",
-   "\n",
-   "\n",
-   "\n",
-   "\n",
-   "\n",
-   "\n"
+   "You can find these tutorial notebooks in the `docs/demos/tutorials/` folder of the [splink repo](https://github.com/moj-analytical-services/splink/tree/master/docs/demos/tutorials), or click the Colab links to run in your browser."
   ]
  },
 {
@@ -71,6 +63,20 @@
    "\n",
    "If you'd like to learn more about record linkage theory, an interactive introduction is available [here](https://www.robinlinacre.com/intro_to_probabilistic_linkage/)."
   ]
+ },
+ {
+  "cell_type": "markdown",
+  "id": "8c28bba7",
+  "metadata": {
+   "vscode": {
+    "languageId": "plaintext"
+   }
+  },
+  "source": [
+   "## LLM prompts\n",
+   "\n",
+   "If you're using an LLM to suggest Splink code, see [here](./topic_guides/llms/prompting_llms.md) for suggested prompts and context."
+  ]
  }
 ],
 "metadata": {
@@ -99,4 +105,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/docs/getting_started.md b/docs/getting_started.md
index 043f04007d..11980fb81c 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -102,6 +102,7 @@ To get a basic Splink model up and running, use the following code. It demonstra
     df_clusters = clusters.as_pandas_dataframe(limit=5)
     ```

+If you're using an LLM to suggest Splink code, see [here](./topic_guides/llms/prompting_llms.md) for suggested prompts and context.
## Tutorials From 72f24c7ba1f2e40f10d5296f3a9fee6e53ea5c87 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Fri, 6 Dec 2024 15:03:53 +0000 Subject: [PATCH 4/5] fix lint --- scripts/create_llm_prompt_long.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/create_llm_prompt_long.py b/scripts/create_llm_prompt_long.py index f742a2c47b..252e2f58c6 100644 --- a/scripts/create_llm_prompt_long.py +++ b/scripts/create_llm_prompt_long.py @@ -338,10 +338,10 @@ def fetch_url_content(url): # Append additional instructions to the output file with open(output_filename, "a", encoding="utf-8") as f: - f.write("\n\nIMPORTANT Instructions to LLMs:\n") + f.write("IMPORTANT Instructions to LLMs:") f.write(additional_instructions_to_llms) - print( + print( # NOQA: T201 "Docstrings extracted, saved, and all specified content including tips and " f"instructions appended to {output_filename}" ) # NOQA: T201 From b76c18edc224e51386ace591acf14ca51884dd84 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Fri, 6 Dec 2024 15:05:17 +0000 Subject: [PATCH 5/5] fix lint --- scripts/create_llm_prompt_short.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_llm_prompt_short.py b/scripts/create_llm_prompt_short.py index bd8de781fd..6d3b234c25 100644 --- a/scripts/create_llm_prompt_short.py +++ b/scripts/create_llm_prompt_short.py @@ -107,7 +107,7 @@ def fetch_url_content(url): # Append additional instructions to the output file with open(output_filename, "a", encoding="utf-8") as f: - f.write("\n\IMPORTANT Instructions to LLMs:\n") + f.write("IMPORTANT Instructions to LLMs:") f.write(additional_instructions_to_llms) print( # noqa: T201