From 58a487f053ea6041f9ec7d3fd73405a68e0ba616 Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Sun, 24 Nov 2024 09:32:52 +0000
Subject: [PATCH 1/4] Add function and data to validate PF medication data

---
 ... => get_pf_consultation_validation_data.R} |  28 +++--
 .../get_pf_medication_validation_data.R       | 117 ++++++++++++++++++
 ...nction_plot_measures.R => plot_measures.R} |   0
 ...nction_tidy_measures.R => tidy_measures.R} |   0
 .../data/pf_medication_validation_data.csv    |  78 ++++++++++++
 reports/pharmacy_first_report.Rmd             |  86 +++++++------
 6 files changed, 257 insertions(+), 52 deletions(-)
 rename lib/functions/{function_get_validation_data.R => get_pf_consultation_validation_data.R} (89%)
 create mode 100644 lib/functions/get_pf_medication_validation_data.R
 rename lib/functions/{function_plot_measures.R => plot_measures.R} (100%)
 rename lib/functions/{function_tidy_measures.R => tidy_measures.R} (100%)
 create mode 100644 lib/validation/data/pf_medication_validation_data.csv

diff --git a/lib/functions/function_get_validation_data.R b/lib/functions/get_pf_consultation_validation_data.R
similarity index 89%
rename from lib/functions/function_get_validation_data.R
rename to lib/functions/get_pf_consultation_validation_data.R
index dc4af6a..6d64c7a 100644
--- a/lib/functions/function_get_validation_data.R
+++ b/lib/functions/get_pf_consultation_validation_data.R
@@ -115,19 +115,21 @@ df_dispensing_data_summary <- df_dispensing_data |>
     # n_pf_urgent_medicine_supply_consultations = sum(n_pf_urgent_medicine_supply_consultations, na.rm = TRUE),
     # n_pf_minor_illness_referral_consultations = sum(n_pf_minor_illness_referral_consultations, na.rm = TRUE)
   ) |>
-  pivot_longer(cols = c(
-    n_pf_consultation_acute_otitis_media,
-    n_pf_consultation_acute_sore_throat,
-    n_pf_consultation_impetigo,
-    n_pf_consultation_infected_insect_bites,
-    n_pf_consultation_shingles,
-    n_pf_consultation_sinusitis,
-    n_pf_consultation_uncomplicated_uti,
-    # n_pf_urgent_medicine_supply_consultations,
-    # n_pf_minor_illness_referral_consultations
-  ),
-  names_to = "consultation_type",
-  values_to = "count") |>
+  pivot_longer(
+    cols = c(
+      n_pf_consultation_acute_otitis_media,
+      n_pf_consultation_acute_sore_throat,
+      n_pf_consultation_impetigo,
+      n_pf_consultation_infected_insect_bites,
+      n_pf_consultation_shingles,
+      n_pf_consultation_sinusitis,
+      n_pf_consultation_uncomplicated_uti,
+      # n_pf_urgent_medicine_supply_consultations,
+      # n_pf_minor_illness_referral_consultations
+    ),
+    names_to = "consultation_type",
+    values_to = "count"
+  ) |>
   mutate(consultation_type = str_replace(consultation_type, "^n_pf_consultation_", ""))
 
 write_csv(df_dispensing_data_summary, here("lib", "validation", "data", "pf_consultation_validation_data.csv"))
diff --git a/lib/functions/get_pf_medication_validation_data.R b/lib/functions/get_pf_medication_validation_data.R
new file mode 100644
index 0000000..e0f7bf4
--- /dev/null
+++ b/lib/functions/get_pf_medication_validation_data.R
@@ -0,0 +1,117 @@
+library("httr")
+library("jsonlite")
+library("crul")
+library("here")
+library("rvest")
+library("dplyr")
+library("lubridate")
+library("tidyverse")
+library("readr")
+
+base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
+package_list_method <- "package_list"
+package_show_method <- "package_show?id="
+action_method <- "datastore_search_sql?"
+
+get_available_datasets <- function(remove_foi = TRUE) {
+  base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
+  package_list_method <- "package_list"
+
+  datasets_response <- fromJSON(paste0(
+    base_endpoint,
+    package_list_method
+  ))$result
+
+  if (remove_foi) {
+    datasets_response <- datasets_response[!grepl("^foi", datasets_response)]
+  }
+
+  datasets_response
+}
+
+get_available_datasets()
+
+get_dataset_table_names <- function(dataset_id, start_date = NULL, end_date = NULL) {
+  available_datasets <- get_available_datasets(remove_foi = FALSE)
+
+  if (!dataset_id %in% available_datasets) {
+    stop("The provided 'dataset_id' is not available. Run 'get_available_datasets()' to see all available datasets.", call. = FALSE)
+  }
+  # Fetch the package metadata; its `resources` list has one entry per table
+  base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
+  package_show_method <- "package_show?id="
+
+  metadata_response <- GET(paste0(base_endpoint, package_show_method, dataset_id))
+  resources_table <- content(metadata_response)$result$resources
+  # One row per resource: table name plus the year-month parsed from that name
+  dataset_tables <- tibble(
+    table_name = map_chr(
+      resources_table,
+      "bq_table_name",
+      .default = NA_character_ # resources without a table name become NA
+    ),
+    date = ym(str_extract(table_name, "\\d{6}")) # first 6-digit run read as year-month
+  ) |>
+    relocate(date)
+  # Default bounds span all dated tables; na.rm keeps NA dates from turning a bound into NA
+  if (is.null(start_date)) {
+    start_date <- as.Date(min(dataset_tables$date, na.rm = TRUE))
+  }
+
+  if (is.null(end_date)) {
+    end_date <- as.Date(max(dataset_tables$date, na.rm = TRUE))
+  }
+  # Rows whose date is NA are dropped here because between() yields NA for them
+  dataset_tables <- dataset_tables |>
+    filter(between(date, as.Date(start_date), as.Date(end_date)))
+
+  dataset_tables
+}
+
+get_dataset_table_names("prescription-cost-analysis-pca-monthly-data", "2024-09-01")
+
+construct_sql_query <- function(table_name, sql_query) {
+  gsub("{FROM_TABLE}", sprintf("FROM `%s`", table_name), sql_query, fixed = TRUE)
+}
+
+get_nhsbsa_data <- function(dataset_id, sql, start_date = NULL, end_date = NULL) {
+  base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
+  action_method <- "datastore_search_sql?"
+
+  table_names <- get_dataset_table_names(dataset_id, start_date, end_date)$table_name
+
+  async_api_calls <- paste0(
+    base_endpoint,
+    action_method,
+    "resource_id=", table_names,
+    "&sql=", URLencode(map_chr(table_names, construct_sql_query, sql))
+  )
+
+  responses <- crul::Async$new(urls = async_api_calls)$get()
+
+  df_tmp <- bind_rows(map(responses, ~ as_tibble(jsonlite::fromJSON(.x$parse("UTF-8"))$result$result$records)))
+
+  df_tmp |>
+    janitor::clean_names() |>
+    mutate(year_month = ym(year_month)) |>
+    select(date = year_month, everything())
+}
+
+sql <- ("
+  SELECT *
+  {FROM_TABLE}
+  WHERE PHARMACY_ADVANCED_SERVICE = 'Pharmacy First Clinical Pathways'
+  ")
+
+df_validate <- get_nhsbsa_data("prescription-cost-analysis-pca-monthly-data", sql, start_date = "2024-02-01")
+
+names(df_validate)
+unique(df_validate$pharmacy_advanced_service)
+
+pf_medication_validation_data <- df_validate |>
+  select(date, snomed_code, pharmacy_advanced_service, bnf_section, bnf_paragraph, items) |>
+  group_by(date, pharmacy_advanced_service, bnf_paragraph) |>
+  summarise(count = sum(items, na.rm = TRUE)) |>
+  ungroup()
+
+write_csv(pf_medication_validation_data, here("lib", "validation", "data", "pf_medication_validation_data.csv"))
diff --git a/lib/functions/function_plot_measures.R b/lib/functions/plot_measures.R
similarity index 100%
rename from lib/functions/function_plot_measures.R
rename to lib/functions/plot_measures.R
diff --git a/lib/functions/function_tidy_measures.R b/lib/functions/tidy_measures.R
similarity index 100%
rename from lib/functions/function_tidy_measures.R
rename to lib/functions/tidy_measures.R
diff --git a/lib/validation/data/pf_medication_validation_data.csv b/lib/validation/data/pf_medication_validation_data.csv
new file mode 100644
index 0000000..4ff59ed
--- /dev/null
+++ b/lib/validation/data/pf_medication_validation_data.csv
@@ -0,0 +1,78 @@
+date,pharmacy_advanced_service,bnf_paragraph,count
+2024-02-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3113
+2024-02-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,7476
+2024-02-01,Pharmacy First Clinical Pathways,Herpesvirus infections,3336
+2024-02-01,Pharmacy First Clinical Pathways,Individually formulated preparations bought in,13
+2024-02-01,Pharmacy First Clinical Pathways,Macrolides,4528
+2024-02-01,Pharmacy First Clinical Pathways,Otitis externa,3831
+2024-02-01,Pharmacy First Clinical Pathways,Penicillins,42008
+2024-02-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,2251
+2024-02-01,Pharmacy First Clinical Pathways,Tetracyclines,1002
+2024-02-01,Pharmacy First Clinical Pathways,Urinary-tract infections,26877
+2024-03-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3386
+2024-03-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,6588
+2024-03-01,Pharmacy First Clinical Pathways,Herpesvirus infections,3451
+2024-03-01,Pharmacy First Clinical Pathways,Individually formulated preparations bought in,32
+2024-03-01,Pharmacy First Clinical Pathways,Macrolides,6201
+2024-03-01,Pharmacy First Clinical Pathways,Otitis externa,5325
+2024-03-01,Pharmacy First Clinical Pathways,Penicillins,56231
+2024-03-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1820
+2024-03-01,Pharmacy First Clinical Pathways,Tetracyclines,1028
+2024-03-01,Pharmacy First Clinical Pathways,Urinary-tract infections,29345
+2024-04-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3834
+2024-04-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,6598
+2024-04-01,Pharmacy First Clinical Pathways,Herpesvirus infections,3818
+2024-04-01,Pharmacy First Clinical Pathways,Individually formulated preparations bought in,5
+2024-04-01,Pharmacy First Clinical Pathways,Macrolides,6530
+2024-04-01,Pharmacy First Clinical Pathways,Otitis externa,4375
+2024-04-01,Pharmacy First Clinical Pathways,Penicillins,58070
+2024-04-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1673
+2024-04-01,Pharmacy First Clinical Pathways,Tetracyclines,1036
+2024-04-01,Pharmacy First Clinical Pathways,Urinary-tract infections,35592
+2024-05-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3224
+2024-05-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,5789
+2024-05-01,Pharmacy First Clinical Pathways,Herpesvirus infections,4065
+2024-05-01,Pharmacy First Clinical Pathways,Macrolides,7864
+2024-05-01,Pharmacy First Clinical Pathways,Otitis externa,5457
+2024-05-01,Pharmacy First Clinical Pathways,Penicillins,68200
+2024-05-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1558
+2024-05-01,Pharmacy First Clinical Pathways,Tetracyclines,935
+2024-05-01,Pharmacy First Clinical Pathways,Urinary-tract infections,38165
+2024-06-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3262
+2024-06-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,5671
+2024-06-01,Pharmacy First Clinical Pathways,Herpesvirus infections,3979
+2024-06-01,Pharmacy First Clinical Pathways,Macrolides,7910
+2024-06-01,Pharmacy First Clinical Pathways,Otitis externa,4541
+2024-06-01,Pharmacy First Clinical Pathways,Penicillins,67691
+2024-06-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1625
+2024-06-01,Pharmacy First Clinical Pathways,Tetracyclines,813
+2024-06-01,Pharmacy First Clinical Pathways,Urinary-tract infections,36995
+2024-07-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3327
+2024-07-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,5022
+2024-07-01,Pharmacy First Clinical Pathways,Herpesvirus infections,4332
+2024-07-01,Pharmacy First Clinical Pathways,Macrolides,9023
+2024-07-01,Pharmacy First Clinical Pathways,Otitis externa,4950
+2024-07-01,Pharmacy First Clinical Pathways,Penicillins,76597
+2024-07-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1664
+2024-07-01,Pharmacy First Clinical Pathways,Tetracyclines,819
+2024-07-01,Pharmacy First Clinical Pathways,Urinary-tract infections,43848
+2024-08-01,Pharmacy First Clinical Pathways,Antibacterial preparations,3628
+2024-08-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,3996
+2024-08-01,Pharmacy First Clinical Pathways,Herpesvirus infections,4372
+2024-08-01,Pharmacy First Clinical Pathways,Individually formulated preparations bought in,9
+2024-08-01,Pharmacy First Clinical Pathways,Macrolides,8538
+2024-08-01,Pharmacy First Clinical Pathways,Otitis externa,5554
+2024-08-01,Pharmacy First Clinical Pathways,Penicillins,70102
+2024-08-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1761
+2024-08-01,Pharmacy First Clinical Pathways,Tetracyclines,572
+2024-08-01,Pharmacy First Clinical Pathways,Urinary-tract infections,45658
+2024-09-01,Pharmacy First Clinical Pathways,Antibacterial preparations,4165
+2024-09-01,Pharmacy First Clinical Pathways,Drugs used in nasal allergy,5464
+2024-09-01,Pharmacy First Clinical Pathways,Herpesvirus infections,4042
+2024-09-01,Pharmacy First Clinical Pathways,Individually formulated preparations bought in,11
+2024-09-01,Pharmacy First Clinical Pathways,Macrolides,7177
+2024-09-01,Pharmacy First Clinical Pathways,Otitis externa,4644
+2024-09-01,Pharmacy First Clinical Pathways,Penicillins,58958
+2024-09-01,Pharmacy First Clinical Pathways,Preparations for minor cuts and abrasions,1939
+2024-09-01,Pharmacy First Clinical Pathways,Tetracyclines,735
+2024-09-01,Pharmacy First Clinical Pathways,Urinary-tract infections,47327
diff --git a/reports/pharmacy_first_report.Rmd b/reports/pharmacy_first_report.Rmd
index 885c500..85125d1 100644
--- a/reports/pharmacy_first_report.Rmd
+++ b/reports/pharmacy_first_report.Rmd
@@ -20,8 +20,8 @@ library(gt)
 
 ```{r load-data, message=FALSE, warning=FALSE}
 # Load plotting function
-source(here::here("lib", "functions", "function_tidy_measures.R"))
-source(here::here("lib", "functions", "function_plot_measures.R"))
+source(here::here("lib", "functions", "tidy_measures.R"))
+source(here::here("lib", "functions", "plot_measures.R"))
 
 # Load data based on environment
 if (Sys.getenv("OPENSAFELY_BACKEND") != "") {
@@ -117,7 +117,7 @@ df_measures$region <- factor(
 df_measures <- df_measures %>%
   mutate(sex = factor(sex, levels = c("female", "male"), labels = c("Female", "Male")))
 
-df_measures$age_band[is.na(df_measures$age_band)] <- "Missing"  
+df_measures$age_band[is.na(df_measures$age_band)] <- "Missing"
 
 gradient_palette <- c("#001F4D", "#0056B3", "#007BFF", "#66B3E2", "#A4D8E1", "grey")
 region_palette <- c("red", "navy", "#018701", "#ffa600ca", "purple", "brown", "#f4a5b2", "cyan", "green", "grey")
@@ -213,7 +213,6 @@ clinical_pathways_table %>%
     style = cell_text(weight = "bold"),
     locations = cells_column_labels(columns = everything())
   )
-
 ```
 
 ### Codelists
@@ -226,7 +225,6 @@ The following two SNOMED codes were used to identify Pharmacy First consultation
 For clarity, we combined these codes for the presentation of the results.
 
 ```{r echo=FALSE}
-
 # Create pharmacy first service codes dataframe
 pharmacy_first_table <- data.frame(
   codelist = c(
@@ -262,7 +260,6 @@ pharmacy_first_table %>%
 To categorise clinical events related to Pharmacy First services used the Pharmacy First [Clinical Pathways Codelist](https://www.opencodelists.org/codelist/opensafely/pharmacy-first-clinical-pathway-conditions/7ec97762/#full-list).
 
 ```{r echo=FALSE, message=FALSE}
-
 clinical_codes_table <- data.frame(
   condition = c(
     "Acute otitis media",
@@ -337,7 +334,6 @@ ethnicity_table <- data.frame(
 #     heading.title.font.size = "large",
 #     heading.subtitle.font.size = "small"
 #   )
-
 ```
 
 # Results
@@ -786,9 +782,9 @@ df_opensafely <- df_measures %>%
 
 # Add a new column to each data frame to identify the source
 df_opensafely <- df_opensafely %>%
-  mutate(source = 'OS')
+  mutate(source = "OS")
 df_bsa_validation <- df_bsa_validation %>%
-  mutate(source = 'BSA')
+  mutate(source = "BSA")
 
 # Drop the original 'count' column from the BSA data to allow for easy consistent grouping by 'count'
 df_validation_condition_counts <- df_bsa_validation %>%
@@ -809,14 +805,14 @@ df_pivoted <- df_validation_condition_counts %>%
 df_pivoted <- df_pivoted %>%
   mutate(
     consultation_type = recode(consultation_type,
-    "sinusitis" = "Acute Sinusitis",
-    "infected_insect_bites" = "Infected Insect Bite",
-    "uncomplicated_uti" = "UTI",
-    "acute_otitis_media" = "Acute Otitis Media",
-    "acute_sore_throat" = "Acute Pharyngitis",
-    "shingles" = "Herpes Zoster",
-    "impetigo" = "Impetigo",)
-
+      "sinusitis" = "Acute Sinusitis",
+      "infected_insect_bites" = "Infected Insect Bite",
+      "uncomplicated_uti" = "UTI",
+      "acute_otitis_media" = "Acute Otitis Media",
+      "acute_sore_throat" = "Acute Pharyngitis",
+      "shingles" = "Herpes Zoster",
+      "impetigo" = "Impetigo",
+    )
   )
 # Removing date column as this will prevent grouping (date is already pivot columns)
 df_pivoted <- df_pivoted %>%
@@ -885,10 +881,12 @@ tab_pf_conditions_validation
 ```
 
 ```{r, message=FALSE, warning=FALSE, echo = FALSE, fig.width=8}
-df_long <- df_pivoted %>% pivot_longer(cols=c('02-2024_OS','02-2024_BSA','03-2024_OS','03-2024_BSA','04-2024_OS','04-2024_BSA','05-2024_OS','05-2024_BSA','06-2024_OS','06-2024_BSA','07-2024_OS','07-2024_BSA'),
-  names_to=c('month', 'source'),
+df_long <- df_pivoted %>% pivot_longer(
+  cols = c("02-2024_OS", "02-2024_BSA", "03-2024_OS", "03-2024_BSA", "04-2024_OS", "04-2024_BSA", "05-2024_OS", "05-2024_BSA", "06-2024_OS", "06-2024_BSA", "07-2024_OS", "07-2024_BSA"),
+  names_to = c("month", "source"),
   names_sep = "_",
-  values_to='count')
+  values_to = "count"
+)
 # Changing format of date to use label_date_short to keep dates consistent for figures
 df_long$month <- as.Date(paste0("01-", df_long$month), format = "%d-%m-%Y")
 
@@ -896,13 +894,16 @@ df_long$month <- as.Date(paste0("01-", df_long$month), format = "%d-%m-%Y")
 validation_total_counts_figure <- ggplot(df_long, aes(x = month, y = count, color = consultation_type, group = consultation_type)) +
   geom_point() +
   geom_line(size = 0.5) +
-  facet_wrap(~ source, scales = "free_y") +
-  labs(title = "Clinical Conditions Count by Month (NHS BSA vs OpenSAFELY Data)",
-       x = "Month", y = "Count", color = "Clinical Condition") +
+  facet_wrap(~source, scales = "free_y") +
+  labs(
+    title = "Clinical Conditions Count by Month (NHS BSA vs OpenSAFELY Data)",
+    x = "Month", y = "Count", color = "Clinical Condition"
+  ) +
   theme(
-    plot.title = element_text(hjust = 0.5)) +
+    plot.title = element_text(hjust = 0.5)
+  ) +
   scale_x_date(
-      labels = scales::label_date_short()
+    labels = scales::label_date_short()
   )
 
 validation_total_counts_figure
@@ -913,29 +914,36 @@ validation_total_counts_figure
 df_descriptive_stats <- df_descriptive_stats %>%
   mutate(
     measure = recode(measure,
-    "pf_with_pfmed" = "PF Med",
-    "pf_with_pfcondition" = "PF Condition",
-    "pf_with_pfmed_and_pfcondition" = "PF Med & PF Condition",
-    ))
+      "pf_with_pfmed" = "PF Med",
+      "pf_with_pfcondition" = "PF Condition",
+      "pf_with_pfmed_and_pfcondition" = "PF Med & PF Condition",
+    )
+  )
 
 descriptive_stats_figure <- ggplot(df_descriptive_stats, aes(x = interval_start, y = ratio, color = measure, group = measure)) +
   geom_point() +
   geom_line(size = 0.5) +
   # facet_wrap(~ measure, scales = "free_y") +
-  labs(title = "Breakdown of PF consultations with linked PF conditions and medications",
-      color = "PF consultation with:") +
+  labs(
+    title = "Breakdown of PF consultations with linked PF conditions and medications",
+    color = "PF consultation with:"
+  ) +
   theme(
-    plot.title = element_text(hjust = 0.5)) +
+    plot.title = element_text(hjust = 0.5)
+  ) +
   scale_x_date(
-      labels = scales::label_date_short()
+    labels = scales::label_date_short()
+  ) +
+  scale_y_continuous(
+    labels = scales::percent,
+    limits = c(0, 1)
   ) +
-  scale_y_continuous(labels = scales::percent,
-  limits = c(0,1)) + 
-  theme(axis.title.x = element_blank(),
-    axis.title.y = element_blank())
+  theme(
+    axis.title.x = element_blank(),
+    axis.title.y = element_blank()
+  )
 
 descriptive_stats_figure
-
 ```
 
-# References
\ No newline at end of file
+# References

From 5c6ba556655df4304c628502b65c5855485194ba Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Sun, 24 Nov 2024 09:33:17 +0000
Subject: [PATCH 2/4] Add function to explore table schema for NHSBSA tables

---
 .../get_dataset_nhsbsa_table_schema.R         | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 lib/functions/get_dataset_nhsbsa_table_schema.R

diff --git a/lib/functions/get_dataset_nhsbsa_table_schema.R b/lib/functions/get_dataset_nhsbsa_table_schema.R
new file mode 100644
index 0000000..04ee65d
--- /dev/null
+++ b/lib/functions/get_dataset_nhsbsa_table_schema.R
@@ -0,0 +1,50 @@
+library("httr")
+library("jsonlite")
+library("crul")
+library("here")
+library("rvest")
+library("dplyr")
+library("lubridate")
+library("tidyverse")
+library("readr")
+
+# This may be useful for writing SQL queries but reading the schema from
+# JSON in the metadata seems more tricky and maybe there's another way
+get_dataset_table_schema <- function(dataset_id) {
+  base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
+  package_show_method <- "package_show?id="
+
+  available_datasets <- get_available_datasets(remove_foi = FALSE)
+
+  if (!dataset_id %in% available_datasets) {
+    stop("The provided 'dataset_id' is not available. Run 'get_available_datasets()' to see all available datasets.", call. = FALSE)
+  }
+
+  metadata_response <- GET(paste0(base_endpoint, package_show_method, dataset_id))
+  resources <- content(metadata_response)$result$resources
+  
+  schema_raw <- resources[[1]]$schema
+
+  # There seems to be a lot of odd strings in the JSON
+  # that needs to be fixed before we can read it
+  schema_fixed <- schema_raw
+  schema_fixed <- gsub("u\'", '"', schema_fixed)
+  schema_fixed <- gsub("u\"", '"', schema_fixed)
+  schema_fixed <- gsub("':", '":', schema_fixed)
+  schema_fixed <- gsub("',", '",', schema_fixed)
+  schema_fixed <- gsub("'}", '"}', schema_fixed)
+  schema_fixed <- gsub("']", '"]', schema_fixed)
+  schema_fixed <- gsub("-", "", schema_fixed)
+  schema_fixed <- gsub("-", "", schema_fixed)
+  tidyjson::json(schema_fixed)
+  schema_list <- fromJSON(schema_fixed, flatten = TRUE)
+
+  tibble(schema_list$fields) |>
+    select(name, title, type, description)
+}
+
+get_available_datasets()[3:10]
+get_dataset_table_schema("contractor-details")
+get_dataset_table_schema("dental-activity-delivered-by-newly-qualified-foundation-dentists")
+get_dataset_table_schema("secondary-care-medicines-data")
+map(get_available_datasets()[3:10], get_dataset_table_schema)

From 103af265845ec1918aac82dc3f23602cb886ecf3 Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Sun, 24 Nov 2024 12:30:13 +0000
Subject: [PATCH 3/4] Remove empty datasets from `get_available_datasets()`

---
 lib/functions/get_pf_medication_validation_data.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/functions/get_pf_medication_validation_data.R b/lib/functions/get_pf_medication_validation_data.R
index e0f7bf4..7536ff5 100644
--- a/lib/functions/get_pf_medication_validation_data.R
+++ b/lib/functions/get_pf_medication_validation_data.R
@@ -13,7 +13,7 @@ package_list_method <- "package_list"
 package_show_method <- "package_show?id="
 action_method <- "datastore_search_sql?"
 
-get_available_datasets <- function(remove_foi = TRUE) {
+get_available_datasets <- function() {
   base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
   package_list_method <- "package_list"
 
@@ -22,9 +22,10 @@ get_available_datasets <- function(remove_foi = TRUE) {
     package_list_method
   ))$result
 
-  if (remove_foi) {
-    datasets_response <- datasets_response[!grepl("^foi", datasets_response)]
-  }
+  # Remove datasets with FOI and starting with a number
+  # There does not seem to be any data that we can query from these tables
+  datasets_response <- datasets_response[!grepl("foi", datasets_response)]
+  datasets_response <- datasets_response[!grepl("^[0-9]", datasets_response)]
 
   datasets_response
 }
@@ -32,7 +33,7 @@ get_available_datasets <- function(remove_foi = TRUE) {
 get_available_datasets()
 
 get_dataset_table_names <- function(dataset_id, start_date = NULL, end_date = NULL) {
-  available_datasets <- get_available_datasets(remove_foi = FALSE)
+  available_datasets <- get_available_datasets()
 
   if (!dataset_id %in% available_datasets) {
     stop("The provided 'dataset_id' is not available. Run 'get_available_datasets()' to see all available datasets.", call. = FALSE)

From e18ce660b371cab2374057644882a246f28458a0 Mon Sep 17 00:00:00 2001
From: Milan Wiedemann <milan.wiedemann@gmail.com>
Date: Sun, 24 Nov 2024 12:30:45 +0000
Subject: [PATCH 4/4] Improve `get_dataset_table_schema()`

---
 .../get_dataset_nhsbsa_table_schema.R         | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/lib/functions/get_dataset_nhsbsa_table_schema.R b/lib/functions/get_dataset_nhsbsa_table_schema.R
index 04ee65d..5cb502f 100644
--- a/lib/functions/get_dataset_nhsbsa_table_schema.R
+++ b/lib/functions/get_dataset_nhsbsa_table_schema.R
@@ -8,13 +8,15 @@ library("lubridate")
 library("tidyverse")
 library("readr")
 
+source(here("lib/functions/get_pf_medication_validation_data.R"))
+
 # This may be useful for writing SQL queries but reading the schema from
 # JSON in the metadata seems more tricky and maybe there's another way
 get_dataset_table_schema <- function(dataset_id) {
   base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/"
   package_show_method <- "package_show?id="
 
-  available_datasets <- get_available_datasets(remove_foi = FALSE)
+  available_datasets <- get_available_datasets()
 
   if (!dataset_id %in% available_datasets) {
     stop("The provided 'dataset_id' is not available. Run 'get_available_datasets()' to see all available datasets.", call. = FALSE)
@@ -22,7 +24,7 @@ get_dataset_table_schema <- function(dataset_id) {
 
   metadata_response <- GET(paste0(base_endpoint, package_show_method, dataset_id))
   resources <- content(metadata_response)$result$resources
-  
+
   schema_raw <- resources[[1]]$schema
 
   # There seems to be a lot of odd strings in the JSON
@@ -36,15 +38,17 @@ get_dataset_table_schema <- function(dataset_id) {
   schema_fixed <- gsub("']", '"]', schema_fixed)
   schema_fixed <- gsub("-", "", schema_fixed)
   schema_fixed <- gsub("-", "", schema_fixed)
-  tidyjson::json(schema_fixed)
+
   schema_list <- fromJSON(schema_fixed, flatten = TRUE)
 
   tibble(schema_list$fields) |>
     select(name, title, type, description)
 }
 
-get_available_datasets()[3:10]
-get_dataset_table_schema("contractor-details")
-get_dataset_table_schema("dental-activity-delivered-by-newly-qualified-foundation-dentists")
-get_dataset_table_schema("secondary-care-medicines-data")
-map(get_available_datasets()[3:10], get_dataset_table_schema)
+nhsbsa_table_schemas <- map(
+  set_names(get_available_datasets()),
+  safely(get_dataset_table_schema)
+)
+
+nhsbsa_table_schemas_results <- map(nhsbsa_table_schemas, "result")
+nhsbsa_table_schemas_errors <- map(nhsbsa_table_schemas, "error")