From 5c6ba556655df4304c628502b65c5855485194ba Mon Sep 17 00:00:00 2001 From: Milan Wiedemann Date: Sun, 24 Nov 2024 09:33:17 +0000 Subject: [PATCH] Add function to explore table schema for NHSBA tables --- .../get_dataset_nhsbsa_table_schema.R | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 lib/functions/get_dataset_nhsbsa_table_schema.R diff --git a/lib/functions/get_dataset_nhsbsa_table_schema.R b/lib/functions/get_dataset_nhsbsa_table_schema.R new file mode 100644 index 0000000..04ee65d --- /dev/null +++ b/lib/functions/get_dataset_nhsbsa_table_schema.R @@ -0,0 +1,50 @@ +library("httr") +library("jsonlite") +library("crul") +library("here") +library("rvest") +library("dplyr") +library("lubridate") +library("tidyverse") +library("readr") + +# This may be useful for writing SQL queries but reading the schema from +# JSON in the metadata seems more tricky and maybe there's another way +get_dataset_table_schema <- function(dataset_id) { + base_endpoint <- "https://opendata.nhsbsa.net/api/3/action/" + package_show_method <- "package_show?id=" + + available_datasets <- get_available_datasets(remove_foi = FALSE) + + if (!dataset_id %in% available_datasets) { + stop("The provided 'dataset_id' is not available. Run 'get_available_datasets()' to see all available datasets.", call. = FALSE) + } + + metadata_response <- GET(paste0(base_endpoint, package_show_method, dataset_id)) + resources <- content(metadata_response)$result$resources + + schema_raw <- resources[[1]]$schema + + # There seems to be a lot of odd strings in the JSON + # that needs to be fixed before we can read it + schema_fixed <- schema_raw + schema_fixed <- gsub("u\'", '"', schema_fixed) + schema_fixed <- gsub("u\"", '"', schema_fixed) + schema_fixed <- gsub("':", '":', schema_fixed) + schema_fixed <- gsub("',", '",', schema_fixed) + schema_fixed <- gsub("'}", '"}', schema_fixed) + schema_fixed <- gsub("']", '"]', schema_fixed) + schema_fixed <- gsub("-", "", schema_fixed) + schema_fixed <- gsub("-", "", schema_fixed) + tidyjson::json(schema_fixed) + schema_list <- fromJSON(schema_fixed, flatten = TRUE) + + tibble(schema_list$fields) |> + select(name, title, type, description) +} + +get_available_datasets()[3:10] +get_dataset_table_schema("contractor-details") +get_dataset_table_schema("dental-activity-delivered-by-newly-qualified-foundation-dentists") +get_dataset_table_schema("secondary-care-medicines-data") +map(get_available_datasets()[3:10], get_dataset_table_schema)