From 48c54635d475d7800c6f56f27e00153430b3a733 Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Tue, 19 Nov 2024 14:21:30 +0000 Subject: [PATCH] Add validation data for consultations --- lib/functions/function_get_validation_data.R | 133 ++++++++++++++++++ .../data/pf_consultation_validation_data.csv | 43 ++++++ 2 files changed, 176 insertions(+) create mode 100644 lib/functions/function_get_validation_data.R create mode 100644 lib/validation/data/pf_consultation_validation_data.csv diff --git a/lib/functions/function_get_validation_data.R b/lib/functions/function_get_validation_data.R new file mode 100644 index 0000000..dc4af6a --- /dev/null +++ b/lib/functions/function_get_validation_data.R @@ -0,0 +1,133 @@ +library(httr) +library(here) +library(rvest) +library(dplyr) +library(lubridate) +library(tidyverse) +library(readr) + +#' Extract dates from dispensing data URL +#' +#' @param urls List of URLs +#' @return String, of date in YYYY-MM-DD format if date can be found, otherwise NA. +extract_dates <- function(urls) { + urls %>% + map(~ { + match <- str_match(.x, ".*Dispensing%20Data%20(\\w+)%20(\\d{2})")[, 2:3] + + if (all(!is.na(match))) { + month <- match[1] + year <- paste0("20", match[2]) + + parsed_date <- parse_date_time( + paste("1", month, year), + orders = "dmy", + quiet = TRUE + ) + if (!is.na(parsed_date)) { + return(format(parsed_date, "%Y-%m-%d")) + } + } + return(NA) + }) +} + +get_dispensing_urls <- function(start_date = NULL, end_date = NULL) { + # This is the URL where the data is linked from + url <- "https://www.nhsbsa.nhs.uk/prescription-data/dispensing-data/dispensing-contractors-data" + + # The URL to the data we are interested in follows this URL structure + # https://www.nhsbsa.nhs.uk/sites/default/files/2024-05/Dispensing%20Data%20Jan%2024%20-%20CSV.csv + base_url <- "https://www.nhsbsa.nhs.uk" + + response <- GET(url) + html_content <- content(response, "text", encoding = "UTF-8") + + csv_links <- read_html(html_content) %>% + html_nodes("a") %>% + html_attr("href") %>% + .[grepl("Dispensing%20Data.*\\.csv", .)] + + df <- tibble( + date = as.Date(extract_dates(csv_links) |> unlist()), + url = paste0(base_url, csv_links) + ) + + if (is.null(start_date)) { + start_date <- as.Date(min(df$date)) + } + + if (is.null(end_date)) { + end_date <- as.Date(max(df$date)) + } + + df <- df |> + filter(between(date, as.Date(start_date), as.Date(end_date))) + + setNames(as.list(df$url), as.character(df$date)) +} + +get_dispensing_data <- function(start_date = NULL, end_date = NULL) { + dispensing_urls <- get_dispensing_urls(start_date = start_date, end_date = end_date) + + icb_var_list <- c( + "ICBCode", + "ICB" + ) + + pf_var_list <- c( + "NumberofPharmacyFirstClinicalPathwaysConsultations-AcuteOtitisMedia", + "NumberofPharmacyFirstClinicalPathwaysConsultations-AcuteSoreThroat", + "NumberofPharmacyFirstClinicalPathwaysConsultations-Impetigo", + "NumberofPharmacyFirstClinicalPathwaysConsultations-InfectedInsectBites", + "NumberofPharmacyFirstClinicalPathwaysConsultations-Shingles", + "NumberofPharmacyFirstClinicalPathwaysConsultations-Sinusitis", + "NumberofPharmacyFirstClinicalPathwaysConsultations-UncomplicatedUTI", + "NumberofPharmacyFirstUrgentMedicineSupplyConsultations", + "NumberofPharmacyFirstMinorIllnessReferralConsultations" + ) + + df <- dispensing_urls |> + map(read_csv, + name_repair = janitor::make_clean_names + ) |> + bind_rows(.id = "date") |> + select(date, all_of(janitor::make_clean_names(c(icb_var_list, pf_var_list)))) + + df |> + rename_with(~ str_replace(., "^numberof_pharmacy_first", "n_pf")) |> + rename_with(~ str_replace(., "clinical_pathways_consultations", "consultation")) +} + +# Calculate summary of counts +df_dispensing_data <- get_dispensing_data(start_date = "2024-02-01") + +df_dispensing_data_summary <- df_dispensing_data |> + group_by(date) |> + summarise( + n_pf_consultation_acute_otitis_media = sum(n_pf_consultation_acute_otitis_media, na.rm = TRUE), + n_pf_consultation_acute_sore_throat = sum(n_pf_consultation_acute_sore_throat, na.rm = TRUE), + n_pf_consultation_impetigo = sum(n_pf_consultation_impetigo, na.rm = TRUE), + n_pf_consultation_infected_insect_bites = sum(n_pf_consultation_infected_insect_bites, na.rm = TRUE), + n_pf_consultation_shingles = sum(n_pf_consultation_shingles, na.rm = TRUE), + n_pf_consultation_sinusitis = sum(n_pf_consultation_sinusitis, na.rm = TRUE), + n_pf_consultation_uncomplicated_uti = sum(n_pf_consultation_uncomplicated_uti, na.rm = TRUE), + # n_pf_urgent_medicine_supply_consultations = sum(n_pf_urgent_medicine_supply_consultations, na.rm = TRUE), + # n_pf_minor_illness_referral_consultations = sum(n_pf_minor_illness_referral_consultations, na.rm = TRUE) + ) |> + pivot_longer(cols = c( + n_pf_consultation_acute_otitis_media, + n_pf_consultation_acute_sore_throat, + n_pf_consultation_impetigo, + n_pf_consultation_infected_insect_bites, + n_pf_consultation_shingles, + n_pf_consultation_sinusitis, + n_pf_consultation_uncomplicated_uti, + # n_pf_urgent_medicine_supply_consultations, + # n_pf_minor_illness_referral_consultations + ), + names_to = "consultation_type", + values_to = "count") |> + mutate(consultation_type = str_replace(consultation_type, "^n_pf_consultation_", "")) + +write_csv(df_dispensing_data_summary, here("lib", "validation", "data", "pf_consultation_validation_data.csv")) diff --git a/lib/validation/data/pf_consultation_validation_data.csv b/lib/validation/data/pf_consultation_validation_data.csv new file mode 100644 index 0000000..7bfc6e0 --- /dev/null +++ b/lib/validation/data/pf_consultation_validation_data.csv @@ -0,0 +1,43 @@ +date,consultation_type,count +2024-02-01,acute_otitis_media,18893 +2024-02-01,acute_sore_throat,41014 +2024-02-01,impetigo,6816 +2024-02-01,infected_insect_bites,3412 +2024-02-01,shingles,4078 +2024-02-01,sinusitis,18384 +2024-02-01,uncomplicated_uti,32678 +2024-03-01,acute_otitis_media,22888 +2024-03-01,acute_sore_throat,54756 +2024-03-01,impetigo,6740 +2024-03-01,infected_insect_bites,4180 +2024-03-01,shingles,4069 +2024-03-01,sinusitis,17220 +2024-03-01,uncomplicated_uti,34536 +2024-04-01,acute_otitis_media,19295 +2024-04-01,acute_sore_throat,56925 +2024-04-01,impetigo,7191 +2024-04-01,infected_insect_bites,6099 +2024-04-01,shingles,4470 +2024-04-01,sinusitis,17715 +2024-04-01,uncomplicated_uti,41951 +2024-05-01,acute_otitis_media,20240 +2024-05-01,acute_sore_throat,58022 +2024-05-01,impetigo,6235 +2024-05-01,infected_insect_bites,19010 +2024-05-01,shingles,4691 +2024-05-01,sinusitis,15909 +2024-05-01,uncomplicated_uti,44340 +2024-06-01,acute_otitis_media,15826 +2024-06-01,acute_sore_throat,52288 +2024-06-01,impetigo,6362 +2024-06-01,infected_insect_bites,27294 +2024-06-01,shingles,4626 +2024-06-01,sinusitis,15052 +2024-06-01,uncomplicated_uti,43289 +2024-07-01,acute_otitis_media,17682 +2024-07-01,acute_sore_throat,54859 +2024-07-01,impetigo,6740 +2024-07-01,infected_insect_bites,36370 +2024-07-01,shingles,5103 +2024-07-01,sinusitis,14115 +2024-07-01,uncomplicated_uti,51321