From 58d6d9c1d1be8ec722e83b83f7b27244e12877c3 Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Wed, 3 Apr 2024 12:35:42 -0700 Subject: [PATCH 01/21] Make spellcheck work cross-platform --- .gitignore | 2 + .../changing_splink/contributing_to_docs.md | 14 +++-- scripts/conda/development_environment.yaml | 1 + ...elopment_environment_lock_Linux-x86_64.txt | 3 + scripts/pyspelling/spellchecker.sh | 56 +++++++++---------- 5 files changed, 43 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index cc66da9e8c..834ecea95e 100644 --- a/.gitignore +++ b/.gitignore @@ -179,3 +179,5 @@ cython_debug/ splink_db splink_db_log spark-warehouse + +scripts/pyspelling/dictionary.dic diff --git a/docs/dev_guides/changing_splink/contributing_to_docs.md b/docs/dev_guides/changing_splink/contributing_to_docs.md index 45dc9530ab..cf7f22d0cb 100644 --- a/docs/dev_guides/changing_splink/contributing_to_docs.md +++ b/docs/dev_guides/changing_splink/contributing_to_docs.md @@ -16,21 +16,27 @@ Once you've finished updating Splink documentation we ask that you run our spell ## Spellchecking docs -When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. Please note, the spellchecker _only works on markdown files_ and currently only works on systems which support `Homebrew` package manager. Instructions for other operating systems will be released later. +When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. Please note, the spellchecker _only works on markdown files_. + +If you are a Mac user with the `Homebrew` package manager installed, the script below will automatically install +the required system dependency, `aspell`. +If you've created your development environment [using conda](./development_quickstart.md), `aspell` will have been installed as part of that +process. +Instructions for installing `aspell` through other means may be added here in the future. To run the spellchecker on either a single markdown file or folder of markdown files, you can use the following script: ```sh -source scripts/pyspelling/spellchecker.sh +./scripts/pyspelling/spellchecker.sh ``` Omitting the file/folder path will run the spellchecker on all markdown files contained in the `docs` folder. We recommend running the spellchecker only on files that you have created or edited. The spellchecker uses the Python package [PySpelling](https://facelessuser.github.io/pyspelling/) and its underlying spellchecking tool, Aspell. Running the above script will automatically install these packages along with any other necessary dependencies. -The spellchecker compares words to a [standard British English dictionary](https://github.com/LibreOffice/dictionaries/blob/master/en/en_GB.aff) and a custom dictionary (`scripts/pyspelling/custom_dictionary.txt`) of words. If no spelling mistakes are found, you will see the following terminal printout: +The spellchecker compares words to a standard British English dictionary and a custom dictionary (`scripts/pyspelling/custom_dictionary.txt`) of words. 
If no spelling mistakes are found, you will see the following terminal printout: -```sh +``` Spelling check passed :) diff --git a/scripts/conda/development_environment.yaml b/scripts/conda/development_environment.yaml index a8813c35c3..7c92719dcf 100644 --- a/scripts/conda/development_environment.yaml +++ b/scripts/conda/development_environment.yaml @@ -6,6 +6,7 @@ dependencies: - pip - openjdk<18 # Required for Spark; Spark 3.4 only supports older Java - postgresql + - aspell - pip: - poetry==1.4.2 # TODO: Unpin. Current poetry.lock requires older poetry. # NOTE: These are the python packages to install alongside poetry, not within the diff --git a/scripts/conda/development_environment_lock_Linux-x86_64.txt b/scripts/conda/development_environment_lock_Linux-x86_64.txt index 0cb54a1f1f..5ef7c7d942 100644 --- a/scripts/conda/development_environment_lock_Linux-x86_64.txt +++ b/scripts/conda/development_environment_lock_Linux-x86_64.txt @@ -18,6 +18,7 @@ https://conda.anaconda.org/conda-forge/noarch/fonts-conda-ecosystem-1-0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda https://conda.anaconda.org/conda-forge/linux-64/alsa-lib-1.2.11-hd590300_1.conda https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda +https://conda.anaconda.org/conda-forge/linux-64/gettext-0.21.1-h27087fc_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/giflib-5.2.1-h0b41bf4_3.conda https://conda.anaconda.org/conda-forge/linux-64/graphite2-1.3.13-h59595ed_1003.conda https://conda.anaconda.org/conda-forge/linux-64/icu-73.2-h59595ed_0.conda @@ -55,12 +56,14 @@ https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.cond https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.15-h0b41bf4_0.conda https://conda.anaconda.org/conda-forge/linux-64/libxml2-2.12.6-h232c23b_1.conda https://conda.anaconda.org/conda-forge/linux-64/pcre2-10.43-hcad00b1_0.conda +https://conda.anaconda.org/conda-forge/linux-64/perl-5.32.1-7_hd590300_perl5.conda https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda https://conda.anaconda.org/conda-forge/linux-64/xorg-fixesproto-5.0-h7f98852_1002.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/xorg-libsm-1.2.4-h7391055_0.conda https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda +https://conda.anaconda.org/conda-forge/linux-64/aspell-0.60.8-pl5321hcb278e6_1.conda https://conda.anaconda.org/conda-forge/linux-64/freetype-2.12.1-h267a509_2.conda https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda https://conda.anaconda.org/conda-forge/linux-64/libglib-2.80.0-hf2295e7_1.conda diff --git a/scripts/pyspelling/spellchecker.sh b/scripts/pyspelling/spellchecker.sh index afd032c240..52ed2c08ef 100644 --- a/scripts/pyspelling/spellchecker.sh +++ b/scripts/pyspelling/spellchecker.sh @@ -1,44 +1,46 @@ #!/bin/bash -line_block="==============" +cd "$(dirname "$0")" + +set -e -package_name="aspell" -pyspelling_yaml="scripts/pyspelling/pyspelling.yml" -default_path_to_spellcheck="docs" +line_block="==============" # Use either the first command line arg or the default path to spellcheck -path_to_spellcheck="${1:-$default_path_to_spellcheck}" +path_to_spellcheck="${1:-docs}" echo "Path to spellcheck: $path_to_spellcheck" -# Function to check if necessary packages are installed -source 
scripts/utils/ensure_packages_installed.sh -ensure_homebrew_packages_installed aspell yq +source ../utils/ensure_packages_installed.sh +if ! command -v aspell &> /dev/null +then + ensure_homebrew_packages_installed aspell +fi -cwd=$(pwd) +# Go up to the root of the repo +cd ../.. # Set up venv, install pyspelling and download dictionary files -if [[ "$VIRTUAL_ENV" != "$cwd/spellcheck-venv" ]]; then +if [[ "$VIRTUAL_ENV" != "$(pwd)/spellcheck-venv" ]]; then # If already in a venv then deactivate if [ -n "$VIRTUAL_ENV" ]; then deactivate fi - # Set up venv - python3 -m venv spellcheck-venv - source spellcheck-venv/bin/activate - # Install pyspelling - echo "$line_block Installing pyspelling $line_block" - python -m pip install pyspelling - # Download dictionary files into correct directory - echo "$line_block Downloading dictionay files to Library/Spelling $line_block" - curl -LJ https://github.com/LibreOffice/dictionaries/raw/master/en/en_GB.dic -o ~/Library/Spelling/en_GB.dic - curl -LJ https://github.com/LibreOffice/dictionaries/blob/master/en/en_GB.aff -o ~/Library/Spelling/en_GB.aff -fi + if ! [ -d spellcheck-venv ]; then + echo "$line_block Creating spellchecking venv $line_block" + python3 -m venv spellcheck-venv + source spellcheck-venv/bin/activate + echo "$line_block Installing pyspelling $line_block" + python -m pip install pyspelling + else + source spellcheck-venv/bin/activate + fi +fi # Finally, validate the path or file that the user has entered to be spellchecked if [ -d "$path_to_spellcheck" ]; then # Checks if a directory has been entered and adds a recursive search for markdown files - source_to_spellcheck="$path_to_spellcheck"/**/*.md + source_to_spellcheck="$path_to_spellcheck"'/**/*.md' elif [ -f "$path_to_spellcheck" ]; then # Checks that the file extension is .md if [[ $path_to_spellcheck == *.md ]]; then @@ -54,11 +56,7 @@ else return 0 2>/dev/null fi -pyspelling_run="Running pyspelling spellchecker on docs" -echo "$line_block $pyspelling_run $line_block" - -# Update pyspelling.yml with a new source path -yq e ".matrix[0].sources = [\"$source_to_spellcheck|!docs/includes/**/*.md\"]" -i "$pyspelling_yaml" +echo "$line_block Running pyspelling spellchecker on docs $line_block" -echo $source_to_spellcheck -pyspelling -c ./$pyspelling_yaml +echo "$source_to_spellcheck" +pyspelling -c ./scripts/pyspelling/pyspelling.yml -S "$source_to_spellcheck"'|!docs/includes/**/*.md' From 75e752fe6dce6ace92e1cf5ee267fe7ac2abe6e6 Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Tue, 9 Apr 2024 15:55:37 -0700 Subject: [PATCH 02/21] Make spellchecker script executable --- scripts/pyspelling/spellchecker.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/pyspelling/spellchecker.sh diff --git a/scripts/pyspelling/spellchecker.sh b/scripts/pyspelling/spellchecker.sh old mode 100644 new mode 100755 From 99838f3c2764267a017505008f48d5848d8f671c Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Tue, 9 Apr 2024 15:59:20 -0700 Subject: [PATCH 03/21] Include task in pyspelling call Co-authored-by: Tom Hepworth <45356472+ThomasHepworth@users.noreply.github.com> --- scripts/pyspelling/spellchecker.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/pyspelling/spellchecker.sh b/scripts/pyspelling/spellchecker.sh index 52ed2c08ef..5ceaf96261 100755 --- a/scripts/pyspelling/spellchecker.sh +++ b/scripts/pyspelling/spellchecker.sh @@ -59,4 +59,7 @@ fi echo "$line_block Running pyspelling spellchecker on docs $line_block" echo 
"$source_to_spellcheck" -pyspelling -c ./scripts/pyspelling/pyspelling.yml -S "$source_to_spellcheck"'|!docs/includes/**/*.md' +pyspelling \ + -c ./scripts/pyspelling/pyspelling.yml \ + -n "Markdown docs" \ + -S "$source_to_spellcheck"'|!docs/includes/**/*.md' From 71844d5746796eeb907f70ea1404fce6de8a379d Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Tue, 9 Apr 2024 16:02:02 -0700 Subject: [PATCH 04/21] Include sentence to encourage contributions Co-authored-by: Zoe Slade --- docs/dev_guides/changing_splink/contributing_to_docs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev_guides/changing_splink/contributing_to_docs.md b/docs/dev_guides/changing_splink/contributing_to_docs.md index cf7f22d0cb..a5756e2a56 100644 --- a/docs/dev_guides/changing_splink/contributing_to_docs.md +++ b/docs/dev_guides/changing_splink/contributing_to_docs.md @@ -16,7 +16,7 @@ Once you've finished updating Splink documentation we ask that you run our spell ## Spellchecking docs -When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. Please note, the spellchecker _only works on markdown files_. +When updating Splink documentation, we ask that you run our spellchecker before submitting a pull request. This is to help ensure quality and consistency across the documentation. If for whatever reason you can't run the spellchecker on your system, please don't let this prevent you from contributing to the documentation. Please note, the spellchecker _only works on markdown files_. If you are a Mac user with the `Homebrew` package manager installed, the script below will automatically install the required system dependency, `aspell`. From ab173751301a110a39d05562ce247cb0af2a33fa Mon Sep 17 00:00:00 2001 From: Tom Hepworth Date: Tue, 23 Apr 2024 16:01:36 +0100 Subject: [PATCH 05/21] Update documentation on settings validation in response to code changes --- .../extending_settings_validator.md | 175 ++++++------------ .../settings_validation_overview.md | 31 ++-- 2 files changed, 77 insertions(+), 129 deletions(-) diff --git a/docs/dev_guides/settings_validation/extending_settings_validator.md b/docs/dev_guides/settings_validation/extending_settings_validator.md index 61530a26ff..784dd112ba 100644 --- a/docs/dev_guides/settings_validation/extending_settings_validator.md +++ b/docs/dev_guides/settings_validation/extending_settings_validator.md @@ -1,71 +1,38 @@ -## Expanding the Settings Validator +# Enhancing the Settings Validator -If a validation check is currently missing, you might want to expand the existing validation codebase. +## Overview of Current Validation Checks -Before adding any code, it's essential to determine whether the checks you want to include fit into any of the general validation categories already in place. +Below is a summary of the key validation checks currently implemented by our settings validator. For detailed information, please refer to the source code: -In summary, the following validation checks are currently carried out: +- **Blocking Rules and Comparison Levels Validation**: Ensures that the user’s blocking rules and comparison levels are correctly [imported from the designated library](https://github.com/moj-analytical-services/splink/pull/1579), and that they contain the necessary details for effective use within the Splink. 
+- **Column Existence Verification**: [Verifies the presence of columns](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) specified in the user’s settings across all input dataframes, preventing errors due to missing data fields. +- **Miscellaneous Checks**: Conducts a range of additional checks aimed at providing clear and informative error messages, facilitating smoother user experiences when deviations from typical Splink usage are detected. -* Verifying that the user's blocking rules and comparison levels have been [imported from the correct library](https://github.com/moj-analytical-services/splink/pull/1579) and contain sufficient information for Splink model usage. -* [Performing column lookups](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) to ensure that columns specified in the user's settings dictionary exist within **all** of the user's input dataframes. -* Various miscellaneous checks designed to generate more informative error messages for the user if they happen to employ Splink in an unintended manner. +### Extending Validation Logic -If you plan to introduce checks that differ from those currently in place, it's advisable to create a new script within `splink/settings_validation`. +If you are introducing new validation checks that deviate from the existing ones, please incorporate them as functions within a new script located in the [`splink/settings_validation` directory](https://github.com/moj-analytical-services/splink/tree/master/splink/settings_validation). This ensures that all validation logic is centrally managed and easily maintainable.
-## Splink Exceptions and Warnings +## Error handling and logging -While working on extending the settings validation tools suite, it's important to consider how we notify users when they've included invalid settings or features. +Error handling and logging in the settings validator takes the following forms: -Exception handling and warnings should be integrated into your validation functions to either halt the program or inform the user when errors occur, raising informative error messages as needed. +- **Raising INFO level logs** - These are raised when the settings validator detects an issue with the user's settings dictionary. These logs are intended to provide the user with information on how to rectify the issue, but should not halt the program. +- **Raising single exceptions** - Raise a built-in Python or Splink exception in response to finding an error. +- **Concurrently raising multiple exceptions** - In some instances, it makes sense to raise multiple errors simultaneously, so as not to disrupt the program. This is achieved using the `ErrorLogger` class. -### Warnings in Splink +The first two use standard Python logging and exception handling. The third is a custom class, covered in more detail below. -Warnings should be employed when you want to alert the user that an included setting might lead to unintended consequences, allowing the user to decide if it warrants further action. +You should look to use whichever makes the most sense given your requirements. -This could be applicable in scenarios such as: +### Raising multiple exceptions concurrently -* Parsing SQL where the potential for failure or incorrect column parsing exists. -* Situations where the user is better positioned to determine whether the issue should be treated as an error, like when dealing with exceptionally high values for [probability_two_random_records_match](https://github.com/moj-analytical-services/splink/blob/master/splink/files/settings_jsonschema.json#L29). +Raising multiple exceptions simultaneously provides users with faster and more manageable feedback, avoiding the tedious back-and-forth that typically occurs when errors are reported and addressed one at a time. -Implementing warnings is straightforward and involves creating a logger instance within your script, followed by a warning call. +To enable the logging of multiple errors in a single check, the [`ErrorLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py) class can be utilised. This is designed to operate similarly to a list, allowing the storing of errors using the `append` method. -??? note "Warnings in practice:" - ```py - import logging - logger = logging.getLogger(__name__) - - logger.warning("My warning message") - ``` - - Which will print: - - > `My warning message` - - to both the console and your log file. - -### Splink Exceptions - -Exceptions should be raised when you want the program to halt due to an unequivocal error. - -In addition to the built-in exception types, such as [`SyntaxError`](https://docs.python.org/3/library/exceptions.html#SyntaxError), we have several Splink-specific exceptions available for use. - -These exceptions serve to raise issues specific to Splink or to customise exception behaviour. For instance, you can specify a message prefix by modifying the constructor of an exception, as exemplified in the [`ComparisonSettingsException`](https://github.com/moj-analytical-services/splink/blob/f7c155c27ccf3c906c92180411b527a4cfd1111b/splink/exceptions.py#L14). 
- -It's crucial to also consider how to inform the user that such behaviour is not permitted. For guidelines on crafting effective error messages, refer to [How to Write Good Error Messages](https://uxplanet.org/how-to-write-good-error-messages-858e4551cd4). - -For a comprehensive list of exceptions native to Splink, visit [the exceptions.py script](https://github.com/moj-analytical-services/splink/blob/master/splink/exceptions.py). - -#### Raising Multiple Exceptions - -Raising multiple errors sequentially without disrupting the program, is a feature we commonly wish to implement across the validation steps. - -In numerous instances, it makes sense to wait until all checks have been performed before raising exceptions captured to the user in one go. - -To enable the logging of multiple errors in a singular check, or across multiple checks, an [`ErrorLogger`](https://github.com/moj-analytical-services/splink/blob/settings_validation_refactor_and_improved_logging/splink/exceptions.py#L34) class is available for use. - -The `ErrorLogger` operates in a similar way to working with a list, allowing you to add additional errors using the `append` method. Once you've logged all of your errors, you can raise them with the `raise_and_log_all_errors` method. +Once all errors have been logged, you can raise them with the `raise_and_log_all_errors` method. This will raise an exception of your choice and report all stored errors to the user. ??? note "`ErrorLogger` in practice" ```py @@ -86,102 +53,82 @@ The `ErrorLogger` operates in a similar way to working with a list, allowing you
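    # Illustrative usage: collect several errors, then raise and report them together
    from splink.exceptions import ErrorLogger

    # Create an error logger instance
    error_logger = ErrorLogger()

    # Store errors as they are identified, just as you would append to a list
    error_logger.append(SyntaxError("The syntax is wrong"))
    error_logger.append(NameError("Invalid name entered"))

    # Raise an exception of your choice, reporting all stored errors to the user
    error_logger.raise_and_log_all_errors()
    ```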
-## Expanding our Miscellaneous Checks
+## Expanding miscellaneous checks

-Miscellaneous checks should typically be added as standalone functions. These functions can then be integrated into the linker's startup process for validation.
+Miscellaneous checks should be added as standalone functions within an appropriate script inside `splink/settings_validation`. These functions can then be integrated into the linker's startup process for validation.

-In most cases, you have more flexibility in how you structure your solutions. You can place the checks in a script that corresponds to the specific checks being performed, or, if one doesn't already exist, create a new script with a descriptive name.
+An example of a miscellaneous check is the [`validate_dialect`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L26) function. This assesses whether the settings dialect aligns with the linker's dialect.

-A prime example of a miscellaneous check is [`validate_dialect`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L31), which assesses whether the settings dialect aligns with the linker's dialect.
+This is then injected into the `_validate_settings` method within our linker, as seen [here](https://github.com/moj-analytical-services/splink/blob/master/splink/linker.py#L500).
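+A minimal sketch of what such a standalone check might look like is given below, assuming nothing more than access to the two dialect strings; the function and argument names are illustrative rather than Splink's actual API:
+
+```py
+def check_dialects_match(settings_dialect: str, linker_dialect: str) -> None:
+    # Hypothetical standalone check: halt with an informative message
+    # if the settings dialect does not align with the linker's dialect
+    if settings_dialect != linker_dialect:
+        raise ValueError(
+            f"Settings dialect '{settings_dialect}' does not match the "
+            f"linker dialect '{linker_dialect}'."
+        )
+```
+
+A function of this shape can then be called from `_validate_settings` during linker setup, before any further processing of the settings dictionary takes place.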
-## Additional Comparison and Blocking Rule Checks
+## Additional comparison and blocking rule checks

-If your checks pertain to comparisons or blocking rules, most of these checks are currently implemented within the [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) script.
+Comparison and blocking rule checks can be found within the [`valid_types.py`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L26) script.

-Currently, comparison and blocking rule checks are organised in a modular format.
+These checks currently interface with the `ErrorLogger` class, which is used to store and raise multiple errors simultaneously (see above).

-To expand the current suite of tests, you should:
+If you wish to expand the current set of tests, it is advised that you incorporate any new checks into either [`log_comparison_errors`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py#L64) or `_validate_settings` (mentioned above). A rough sketch of one such check is given below.
-The above steps are set to change as we are looking to refactor our settings object. +## Checking for the existence of user specified columns -
+Column and SQL validation is performed within [`log_invalid_columns.py`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py). -## Checking that columns exist +The aim of this script is to check that the columns specified by the user exist within the input dataframe(s). If any invalid columns are found, the script will log this with the user. Should you need to include extra checks to assess the validity of columns supplied by a user, your primary focus should be on the [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py) script. -There are currently three classes employed to construct the current log strings. These can be extended to perform additional column checks. +There are two main classes within this script that can be used or extended to perform additional column checks: ??? note "`InvalidCols`" - `InvalidCols` is a `NamedTuple`, used to construct the bulk of our log strings. This accepts a list of columns and the type of error, producing a complete log string when requested. + [`InvalidCols`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_validation_log_strings.py) is a `NamedTuple`, used to construct the bulk of our log strings. This accepts a list of columns and the type of error, producing a complete log string when requested. + + For simplicity, there are three partial implementations to cover the most common cases: + - `MissingColumnsLogGenerator` - missing column identified. + - `InvalidTableNamesLogGenerator` - table name entered by the user is missing or invalid. + - `InvalidColumnSuffixesLogGenerator` - `_l` and `_r` suffixes are missing or invalid. - In practice, this is used as follows: + In practice, this can be used as follows: ```py - # Store the invalid columns and why they're invalid - my_invalid_cols = InvalidCols("invalid_cols", ["first_col", "second_col"]) + # Store our invalid columns + my_invalid_cols = MissingColumnsLogGenerator(["first_col", "second_col"]) # Construct the corresponding log string my_invalid_cols.construct_log_string() ``` -??? note "`InvalidColValidator`" - `InvalidColValidator` houses a series of validation checks to evaluate whether the column(s) contained within either a SQL string or a user's raw input string, are present within the underlying dataframes. - - To achieve this, it employs a range of cleaning functions to standardise our column inputs and conducts a series of checks on these cleaned columns. It utilises `InvalidCols` tuples to log any identified invalid columns. - - It inherits from our the `SettingsValidator` class. - ??? note "`InvalidColumnsLogger`" - The principal logging class for our invalid column checks. - - This class primarily calls our builder functions outlined in `InvalidColValidator`, constructing a series of log strings for output to both the console and the user's log file (if it exists). - - -To extend the column checks, you simply need to add an additional validation method to the [`InvalidColValidator`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L15) class, followed by an extension of the [`InvalidColumnsLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L164). 
- -### A Practical Example of a Column Check - -For an example of column checks in practice, see [`validate_uid`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L195). - -Here, we call `validate_settings_column`, checking whether the unique ID column submitted by the user is valid. The output of this call yields either an `InvalidCols` tuple, or `None`. - -From there, we can use the built-in log constructor [`construct_generic_settings_log_string`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L329C27-L329C27) to construct and print the required logs. Where the output above was `None`, nothing is logged. - -If your checks aren't part of the initial settings check (say you want to assess additional columns found in blocking rules supplied at a later stage by the user), you should add a new method to `InvalidColumnsLogger`, similar in functionality to [`construct_output_logs`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L319). - -However, it is worth noting that not all checks are performed on a simple string columns. Where you require checks to be performed on SQL strings, there's an additional step required, outlined below. - -### Single Column Checks - -To review single columns, [`validate_settings_column`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L144) should be used. This takes in a `setting_id` (analogous to the title you want to give your log string) and a list of columns to be checked. + `InvalidColumnsLogger` takes in a series of cleansed columns from your settings object (see [`SettingsColumnCleaner`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py#L101)) and runs a series of validation checks to assess whether the column(s) are present within the underlying dataframes. -A working example of this in practice can be found in the section above. + Any invalid columns are stored in an `InvalidCols` instance (see above), which is then used to construct a log string. -### Checking Columns in SQL statements + Logs are output to the user at the `INFO` level. -For raw SQL statements, you should make use of the [`validate_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/column_lookups.py#L102) method. +To extend the column checks, you simply need to add an additional validation method to the [`InvalidColumnsLogger`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L171C7-L171C27) class. Checks must be added as a new method and then called within `construct_output_logs`. -This takes in a list of SQL strings and spits out a list of `InvalidCols` tuples, depending on the checks you ask it to perform. +## Single column, multi-column and SQL checks -Should you need more control, the process is similar to that of the single column case, just with an additional parsing step. +### Single and multi-column -Parsing is handled by [`parse_columns_in_sql`](https://github.com/moj-analytical-services/splink/blob/master/splink/parse_sql.py#L45). This will spit out a list of column names that were identified by SQLGlot. +Single and multi-column checks are relatively straightforward. 
Assuming you have a clean set of columns, you can leverage the [`check_for_missing_settings_column`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L56) function.

This expects the following arguments:
* **settings_id**: the name of the settings ID. This is only used for logging and does not necessarily need to match the true ID.
* **settings_column_to_check**: the column(s) you wish to validate.
* **valid_input_dataframe_columns**: the cleaned columns from **all** of your input dataframes.

### Checking columns in SQL statements

Checking SQL statements is a little more complex, given the need to parse SQL in order to extract your column names.

To do this, you can leverage the [`check_for_missing_or_invalid_columns_in_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L73) function.

This expects the following arguments:
* **sql_dialect**: The SQL dialect used by the linker.
* **sql_strings**: A list of SQL strings.
* **valid_input_dataframe_columns**: The list of columns identified in your input dataframe(s).
* **additional_validation_checks**: Functions used to check for other issues with the parsed SQL string, namely, table name and column suffix validation.

_NB: for nested SQL statements, you'll need to add an additional loop. See [`check_comparison_for_missing_or_invalid_sql_strings`](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py#L140) for more details._

diff --git a/docs/dev_guides/settings_validation/settings_validation_overview.md b/docs/dev_guides/settings_validation/settings_validation_overview.md
index 68cb5c9fd8..e8f9013561 100644
--- a/docs/dev_guides/settings_validation/settings_validation_overview.md
+++ b/docs/dev_guides/settings_validation/settings_validation_overview.md
@@ -1,6 +1,6 @@
## Settings Validation

-A common issue within Splink is users providing invalid settings dictionaries. To prevent this, the settings validator scans through a settings dictionary and provides user-friendly feedback on what needs to be fixed.
+A common problem within Splink comes from users providing invalid settings dictionaries.
To prevent this, we've built a settings validator to scan through a given settings dictionary and provide user-friendly feedback on what needs to be fixed.

At a high level, this includes:

@@ -21,9 +21,9 @@ You can modify the schema by manually editing the [json schema](https://github.c

Modifications can be used to (amongst other uses):

-* Set or remove default values for schema keys.
-* Set the required data type for a given key.
-* Expand or refine previous titles and descriptions to help with clarity.
+- Set or remove default values for schema keys.
+- Set the required data type for a given key.
+- Expand or refine previous titles and descriptions to help with clarity.

Any updates you wish to make to the schema should be discussed with the wider team, to ensure it won't break backwards compatibility and makes sense as a design decision.

@@ -33,23 +33,24 @@ Detailed information on the arguments that can be supplied to the json schema ca

## Settings Validator

-The settings validation code currently resides in the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) directory of Splink. This code is responsible for executing a secondary series of tests to determine whether all values within the settings dictionary will generate valid SQL.
+As long as an input is of the correct data type, it will pass our initial schema checks. This can then mean that user inputs that would generate invalid SQL can slip through, only to be caught by the database engine, [commonly resulting in uninformative errors](https://github.com/moj-analytical-services/splink/issues/1362) that leave the user unsure of how to resolve them.

-Numerous inputs pass our initial schema checks before breaking other parts of the codebase. These breaks are typically due to the construction of invalid SQL, that is then passed to the database engine, [commonly resulting in uninformative errors](https://github.com/moj-analytical-services/splink/issues/1362).
+The settings validation code (found within the [settings validation](https://github.com/moj-analytical-services/splink/tree/master/splink/settings_validation) directory of Splink) is another layer of validation, executing a series of checks to determine whether values in the user's settings dictionary will generate invalid SQL.

Frequently encountered problems include:

-* Usage of invalid column names. For example, specifying a [`unique_id_column_name`](https://github.com/moj-analytical-services/splink/blob/settings_validation_docs/splink/files/settings_jsonschema.json#L61) that doesn't exist in the underlying dataframe(s). Such names satisfy the schema requirements as long as they are strings.
-* Users not updating default values in the settings schema, even when these values are inappropriate for their provided input dataframes.
-* Importing comparisons and blocking rules from incorrect sections of the codebase, or using an inappropriate data type (comparison level vs. comparison).
-* Using Splink for an invalid form of linkage. See the [following discussion](https://github.com/moj-analytical-services/splink/issues/1362).
+- **Invalid column names**. For example, specifying a [`unique_id_column_name`](https://github.com/moj-analytical-services/splink/blob/settings_validation_docs/splink/files/settings_jsonschema.json#L61) that doesn't exist in the underlying dataframe(s).
Such names satisfy the schema requirements as long as they are strings.
+- **Using the settings dictionary's default values**.
+- **Importing comparisons and blocking rules for the wrong dialect**.
+- **Using an inappropriate custom data type** - (comparison level vs. comparison within our comparisons).
+- **Using Splink for an invalid form of linkage** - See the [following discussion](https://github.com/moj-analytical-services/splink/issues/1362).

-Currently, the [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) scripts are setup in a modular fashion, to allow each to inherit the checks it needs.
-The folder is comprised of three scripts, each of which inspects the settings dictionary at different stages of its journey:
+All code relating to [settings validation](https://github.com/moj-analytical-services/splink/tree/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation) can be found within one of the following scripts:

-* [valid_types.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/valid_types.py) - This script includes various miscellaneous checks for comparison levels, blocking rules, and linker objects. These checks are primarily performed within settings.py.
-* [settings_validator.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/settings_validator.py) - This script includes the core `SettingsValidator` class and contains a series of methods that retrieve information on fields within the user's settings dictionary that contain information on columns to be used in training and prediction. Additionally, it provides supplementary cleaning functions to assist in the removal of quotes, prefixes, and suffixes that may be present in a given column name.
-* [column_lookups.py](https://github.com/moj-analytical-services/splink/blob/32e66db1c8c0bed54682daf9a6fea8ef4ed79ab4/splink/settings_validation/column_lookups.py) - This script contains helper functions that generate a series of log strings outlining invalid columns identified within your settings dictionary. It primarily consists of methods that run validation checks on either raw SQL or input columns and assesses their presence in **all** dataframes supplied by the user.
+- [valid_types.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/valid_types.py) - This script includes various miscellaneous checks for comparison levels, blocking rules, and linker objects. These checks are primarily performed within settings.py.
+- [settings_column_cleaner.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py) - Includes a set of functions for cleaning and extracting data, designed to sanitise user inputs in the settings dictionary and retrieve necessary SQL or column identifiers.
+- [log_invalid_columns.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/log_invalid_columns.py) - Pulls the information extracted in [settings_column_cleaner.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_column_cleaner.py) and generates any log strings outlining invalid columns or SQL identified within the settings dictionary.
Any generated error logs are reported to the user when initialising a linker object at the `INFO` level. +- [settings_validation_log_strings.py](https://github.com/moj-analytical-services/splink/blob/master/splink/settings_validation/settings_validation_log_strings.py) - a home for any error messages or logs generated by the settings validator. For information on expanding the range of checks available to the validator, see [Extending the Settings Validator](./extending_settings_validator.md). From ebba34b48fc8f840a4de12bcca2af64c459e8423 Mon Sep 17 00:00:00 2001 From: Sam Lindsay Date: Wed, 24 Apr 2024 22:51:16 +0100 Subject: [PATCH 06/21] Update predict.py --- splink/predict.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/predict.py b/splink/predict.py index 3a7f7555a1..efc9e18def 100644 --- a/splink/predict.py +++ b/splink/predict.py @@ -55,7 +55,7 @@ def predict_from_comparison_vectors_sqls( thres_prob_as_weight = prob_to_match_weight(threshold_match_probability) else: thres_prob_as_weight = None - if threshold_match_probability or threshold_match_weight: + if threshold_match_probability is not None or threshold_match_weight is not None: thresholds = [ thres_prob_as_weight, threshold_match_weight, From 3d7cf00893ee78b6f7937555284c4f2a1749852a Mon Sep 17 00:00:00 2001 From: aymonwuolanne Date: Wed, 1 May 2024 10:56:08 +1000 Subject: [PATCH 07/21] Fixing spurious error messages with Databricks enable_splink --- splink/databricks/enable_splink.py | 34 ++++++++++++++---------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/splink/databricks/enable_splink.py b/splink/databricks/enable_splink.py index 586fb28abe..754452c319 100644 --- a/splink/databricks/enable_splink.py +++ b/splink/databricks/enable_splink.py @@ -1,4 +1,5 @@ import logging +import os from splink.spark.jar_location import similarity_jar_location @@ -32,37 +33,34 @@ def enable_splink(spark): optionClass = getattr(sc._jvm.scala, "Option$") optionModule = getattr(optionClass, "MODULE$") - # Note(bobby): So dirty + dbr_version = float(os.environ.get("DATABRICKS_RUNTIME_VERSION")) + try: - # This will fix the exception when running on Databricks Runtime 14.x+ - lib = JavaJarId( - JarURI, - ManagedLibraryId.defaultOrganization(), - NoVersionModule.simpleString(), - optionModule.apply(None), - optionModule.apply(None), - optionModule.apply(None), - ) - except Exception as e: - logger.warn("failed to initialize for 14.x+", e) - try: - # This will fix the exception when running on Databricks Runtime 13.x + if dbr_version >= 14: lib = JavaJarId( JarURI, ManagedLibraryId.defaultOrganization(), NoVersionModule.simpleString(), optionModule.apply(None), optionModule.apply(None), + optionModule.apply(None), ) - except Exception as ex: - logger.warn("failed to initialize for 13.x", ex) - - # This will work for < 13.x + elif dbr_version >= 13: lib = JavaJarId( JarURI, ManagedLibraryId.defaultOrganization(), NoVersionModule.simpleString(), + optionModule.apply(None), + optionModule.apply(None), ) + else: + lib = JavaJarId( + JarURI, + ManagedLibraryId.defaultOrganization(), + NoVersionModule.simpleString(), + ) + except Exception as e: + logger.warn("failed to enable similarity jar functions for Databricks", e) libSeq = converters.asScalaBufferConverter((lib,)).asScala().toSeq() From 7dccd662306066930a259ea11622c07a154306a3 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 1 May 2024 07:00:44 +0100 Subject: [PATCH 08/21] format --- splink/databricks/enable_splink.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/databricks/enable_splink.py b/splink/databricks/enable_splink.py index 754452c319..2a8d87d964 100644 --- a/splink/databricks/enable_splink.py +++ b/splink/databricks/enable_splink.py @@ -34,7 +34,7 @@ def enable_splink(spark): optionModule = getattr(optionClass, "MODULE$") dbr_version = float(os.environ.get("DATABRICKS_RUNTIME_VERSION")) - + try: if dbr_version >= 14: lib = JavaJarId( From 268f77eef5afa10d572480db37001682c32a9fd7 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 2 May 2024 16:52:03 +0100 Subject: [PATCH 09/21] remove ref to github action --- docs/dev_guides/changing_splink/contributing_to_docs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev_guides/changing_splink/contributing_to_docs.md b/docs/dev_guides/changing_splink/contributing_to_docs.md index 45dc9530ab..a604e02411 100644 --- a/docs/dev_guides/changing_splink/contributing_to_docs.md +++ b/docs/dev_guides/changing_splink/contributing_to_docs.md @@ -38,7 +38,7 @@ Spelling check passed :) otherwise, PySpelling will printout the spelling mistakes found in each file. -Correct spellings of words not found in a standard dictionary (e.g. Splink) can be recorded as such by adding them to `scripts/pyspelling/custom_dictionary.txt`. (Don't worry about adding them in alphabetical order or accidental duplication as this will be handled automatically by a GitHub Action future.) +Correct spellings of words not found in a standard dictionary (e.g. "Splink") can be recorded as such by adding them to `scripts/pyspelling/custom_dictionary.txt`. Please correct any mistakes found or update the custom dictionary to ensure the spellchecker passes before putting in a pull request containing updates to the documentation. From e252813fde21538766fecc96a431493561bedec5 Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Mon, 6 May 2024 09:10:58 -0700 Subject: [PATCH 10/21] Reword script instructions Co-authored-by: Zoe Slade --- docs/dev_guides/changing_splink/contributing_to_docs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev_guides/changing_splink/contributing_to_docs.md b/docs/dev_guides/changing_splink/contributing_to_docs.md index a5756e2a56..faa28acbfe 100644 --- a/docs/dev_guides/changing_splink/contributing_to_docs.md +++ b/docs/dev_guides/changing_splink/contributing_to_docs.md @@ -24,7 +24,7 @@ If you've created your development environment [using conda](./development_quick process. Instructions for installing `aspell` through other means may be added here in the future. -To run the spellchecker on either a single markdown file or folder of markdown files, you can use the following script: +To run the spellchecker on either a single markdown file or folder of markdown files, you can run the following bash script: ```sh ./scripts/pyspelling/spellchecker.sh From 6c0437ce5bd1308764de8e76add3ff5aeb1e1276 Mon Sep 17 00:00:00 2001 From: probjects Date: Thu, 9 May 2024 22:54:48 +0100 Subject: [PATCH 11/21] Fix Splink 4 blog post link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70ee44ca7a..2620ffa475 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Documentation](https://img.shields.io/badge/API-documentation-blue)](https://moj-analytical-services.github.io/splink/) > [!IMPORTANT] -> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. 
We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/03/19/splink4.html). +> Development has begun on Splink 4 on the `splink4_dev` branch. Splink 3 is in maintenance mode and we are no longer accepting new features. We welcome contributions to Splink 4. Read more on our latest [blog](https://moj-analytical-services.github.io/splink/blog/2024/04/02/splink-3-updates-and-splink-4-development-announcement---april-2024.html). # Fast, accurate and scalable probabilistic data linkage From 700d2877702c67df67c8370f9acb5856ae91b674 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 6 Jun 2024 10:47:00 +0100 Subject: [PATCH 12/21] add marie curie --- docs/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/index.md b/docs/index.md index ce34716c67..7d380b0ad7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -81,6 +81,8 @@ Here is a list of some of our known users and their use cases: - [Stanford University](https://www.stanford.edu/) investigated the impact of [receiving government assistance has on political attitudes](https://www.cambridge.org/core/journals/american-political-science-review/article/abs/does-receiving-government-assistance-shape-political-attitudes-evidence-from-agricultural-producers/39552BC5A496EAB6CB484FCA51C6AF21) - [Bern University](https://arbor.bfh.ch/) researched how [Active Learning can be applied to Biomedical Record Linkage](https://ebooks.iospress.nl/doi/10.3233/SHTI230545) +=== "Other" + - [Marie Curie](https://podcasts.apple.com/gb/podcast/unlocking-data-at-marie-curie/id1724979056?i=1000649964922) have used Splink to build a single customer view on fundraising data which has been a "huge success [...] the tooling is just so much better. [...] The power of being able to select, plug in, configure and train a tool versus writing code. It's just mind boggling actually." Sadly, we don't hear about the majority of our users or what they are working on. If you have a use case and it is not shown here please [add it to the list](https://github.com/moj-analytical-services/splink/edit/master/docs/index.md)! From 303d22a0c3fc1d73d65339432631068f76abea25 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Thu, 6 Jun 2024 11:12:04 +0100 Subject: [PATCH 13/21] update summary --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 7d380b0ad7..ca166dad80 100644 --- a/docs/index.md +++ b/docs/index.md @@ -82,7 +82,7 @@ Here is a list of some of our known users and their use cases: - [Bern University](https://arbor.bfh.ch/) researched how [Active Learning can be applied to Biomedical Record Linkage](https://ebooks.iospress.nl/doi/10.3233/SHTI230545) === "Other" - - [Marie Curie](https://podcasts.apple.com/gb/podcast/unlocking-data-at-marie-curie/id1724979056?i=1000649964922) have used Splink to build a single customer view on fundraising data which has been a "huge success [...] the tooling is just so much better. [...] The power of being able to select, plug in, configure and train a tool versus writing code. It's just mind boggling actually." + - [Marie Curie](https://podcasts.apple.com/gb/podcast/unlocking-data-at-marie-curie/id1724979056?i=1000649964922) have used Splink to build a single customer view on fundraising data which has been a "huge success [...] the tooling is just so much better. [...] The power of being able to select, plug in, configure and train a tool versus writing code. It's just mind boggling actually." 
Amongst other benefits, the system is expected to "dramatically reduce manual reporting efforts previously required". See also the blog post [here](https://esynergy.co.uk/our-work/marie-curie/). Sadly, we don't hear about the majority of our users or what they are working on. If you have a use case and it is not shown here please [add it to the list](https://github.com/moj-analytical-services/splink/edit/master/docs/index.md)! From 9760008d03ea3e5dad316c76e6043f82b7450abe Mon Sep 17 00:00:00 2001 From: David French Date: Fri, 7 Jun 2024 14:49:16 +0100 Subject: [PATCH 14/21] Fix bug giving warning messages in term_frequencies.py https://github.com/moj-analytical-services/splink/discussions/2197 --- splink/term_frequencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index 7a4de7de82..dd8e00ef39 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -237,7 +237,7 @@ def tf_adjustment_chart( most_freq = True if not n_most_freq else df["most_freq_rank"] < n_most_freq mask = selected | least_freq | most_freq - vals_not_included = [val for val in vals_to_include if val not in df["value"]] + vals_not_included = [val for val in vals_to_include if val not in df["value"].values] if vals_not_included: warnings.warn( f"Values {vals_not_included} from `vals_to_include` were not found in " From 5e328ed00713a87e501d7ab346725b7f16f63e55 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 11 Jun 2024 22:17:36 +0100 Subject: [PATCH 15/21] fix lint --- splink/term_frequencies.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index dd8e00ef39..43008d3aa3 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -237,7 +237,9 @@ def tf_adjustment_chart( most_freq = True if not n_most_freq else df["most_freq_rank"] < n_most_freq mask = selected | least_freq | most_freq - vals_not_included = [val for val in vals_to_include if val not in df["value"].values] + vals_not_included = [ + val for val in vals_to_include if val not in df["value"].values + ] if vals_not_included: warnings.warn( f"Values {vals_not_included} from `vals_to_include` were not found in " From 66b4331bd582b654dbaffee88635574185b93606 Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Tue, 11 Jun 2024 22:19:22 +0100 Subject: [PATCH 16/21] remove unnecessary comment --- splink/term_frequencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/splink/term_frequencies.py b/splink/term_frequencies.py index 43008d3aa3..4e7b081e3b 100644 --- a/splink/term_frequencies.py +++ b/splink/term_frequencies.py @@ -195,7 +195,7 @@ def comparison_level_to_tf_chart_data(cl: dict): def tf_adjustment_chart( linker: Linker, col, n_most_freq, n_least_freq, vals_to_include, as_dict ): - # Data for chart + c = linker._settings_obj._get_comparison_by_output_column_name(col) c = c._as_detailed_records From 7b4385d7ddb7d8bdfd2fa759a3b2548d805e6a1d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 12 Jun 2024 08:49:45 +0100 Subject: [PATCH 17/21] update autoblack --- .github/workflows/autoblack.yml | 37 ++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 82dc56c8af..d076215b0a 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -18,28 +18,23 @@ jobs: with: python-version: ${{ env.PYTHON_VERSION }} - - 
name: Load cached Poetry installation - uses: actions/cache@v2 - with: - path: ~/.local # the path depends on the OS - key: poetry-0 # increment to reset cache - - name: Install Poetry - uses: snok/install-poetry@v1 - with: - version: '1.7.0' - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v2 + - name: Install poetry using pipx + run: | + pipx install poetry + pipx ensurepath + + - uses: actions/setup-python@v5 with: - path: .venv - key: venv-lint-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-01 - - name: Install linting dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root --only linting + python-version: "3.12.1" + cache: "poetry" + + - name: Install dependencies + run: | + poetry config virtualenvs.in-project true + poetry install --no-interaction --no-root --only linting + + - name: Run Black autoformatter + run: poetry run black . - name: Run Black autoformatter run: | From 6b5630df7c130c29bd320ad14b4c33365696f86b Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 12 Jun 2024 08:51:07 +0100 Subject: [PATCH 18/21] update black --- .github/workflows/autoblack.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index d076215b0a..44e470a0fa 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -36,10 +36,6 @@ jobs: - name: Run Black autoformatter run: poetry run black . - - name: Run Black autoformatter - run: | - source .venv/bin/activate - python3 -m black . - name: If needed, commit black changes to the pull request run: | git config user.name "$(git log -n 1 --pretty=format:%an)" From a34ef2c928e0789e67b995a6609a248711a2b34f Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 12 Jun 2024 08:52:30 +0100 Subject: [PATCH 19/21] update black --- .github/workflows/autoblack.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml index 44e470a0fa..c58cc71805 100644 --- a/.github/workflows/autoblack.yml +++ b/.github/workflows/autoblack.yml @@ -43,7 +43,5 @@ jobs: # short-circuit if we have no changes, otherwise attempt to commit and push # should only fail on forks, in which case contributors will need to manually run black, commit, and push git diff-index --quiet HEAD || (echo "Attempting to commit changes" && git commit -am 'lint with black' && git push -f) - - name: Run black --check . - run: | - source .venv/bin/activate - python3 -m black --check . + - name: Run black --check + run: poetry run black --check . 
From 45a731f8bd1907594c750de0546f2327b75f5b3d Mon Sep 17 00:00:00 2001 From: Robin Linacre Date: Wed, 12 Jun 2024 08:58:59 +0100 Subject: [PATCH 20/21] update lint workflow --- .github/workflows/lint.yml | 39 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f8bf684143..d8e85cfd0c 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,35 +10,26 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - - name: Load cached Poetry installation - uses: actions/cache@v2 - with: - path: ~/.local # the path depends on the OS - key: poetry-0 # increment to reset cache - - name: Install Poetry - uses: snok/install-poetry@v1 - with: - version: '1.7.0' - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v2 + - name: Install poetry using pipx + run: | + pipx install poetry + pipx ensurepath + + - uses: actions/setup-python@v5 with: - path: .venv - key: venv-lint-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-00 - - name: Install linting dependencies - if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --no-root --only linting + python-version: "3.12.1" + cache: "poetry" - - name: Lint Python files with ruff + - name: Install dependencies run: | - source .venv/bin/activate - ruff --show-source . + poetry config virtualenvs.in-project true + poetry install --no-interaction --no-root --only linting + + - name: Lint Python files with ruff + run: poetry run ruff --show-source . From 48fa5e509afa5b9ee575c7d39cf9fe8d3d1a31cd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 22:29:58 +0000 Subject: [PATCH 21/21] Bump urllib3 from 1.26.18 to 1.26.19 Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.18 to 1.26.19. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/1.26.19/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.18...1.26.19) --- updated-dependencies: - dependency-name: urllib3 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/poetry.lock b/poetry.lock index c24752edb9..cdf690ac5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "altair" @@ -1578,13 +1578,13 @@ files = [ [[package]] name = "urllib3" -version = "1.26.18" +version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." 
-optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, - {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] @@ -1592,23 +1592,6 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "urllib3" -version = "2.0.7" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = true -python-versions = ">=3.7" -files = [ - {file = "urllib3-2.0.7-py3-none-any.whl", hash = "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e"}, - {file = "urllib3-2.0.7.tar.gz", hash = "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - [[package]] name = "zipp" version = "3.17.0"