diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..ed69db7 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,4 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.github$ +LICENSE.md diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml new file mode 100644 index 0000000..a3ac618 --- /dev/null +++ b/.github/workflows/check-standard.yaml @@ -0,0 +1,49 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..ed7650c --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..27d4528 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,50 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: test-coverage
+
+jobs:
+  test-coverage:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::covr
+          needs: coverage
+
+      - name: Test coverage
+        run: |
+          covr::codecov(
+            quiet = FALSE,
+            clean = FALSE,
+            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
+          )
+        shell: Rscript {0}
+
+      - name: Show testthat output
+        if: always()
+        run: |
+          ## --------------------------------------------------------------------
+          find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
+        shell: bash
+
+      - name: Upload test results
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: coverage-test-failures
+          path: ${{ runner.temp }}/package
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..55e750c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+README.Rmd
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..bf88dd2
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: puremoe
+Type: Package
+Title: PubMed Unified REtrieval for Multi-Output Exploration
+Version: 1.0.0
+Author: Jason Timm [aut, cre]
+Maintainer: Jason Timm
+Description: An R package for accessing a variety of PubMed data, including abstracts, bibliometrics, pubtations, and full-text records, through a single, user-friendly interface.
+License: MIT + file LICENSE
+Encoding: UTF-8
+LazyData: false
+Depends:
+    R (>= 3.5)
+Imports:
+    rentrez,
+    textshape,
+    xml2,
+    data.table,
+    httr,
+    pbapply,
+    jsonlite,
+    parallel,
+    rappdirs
+Suggests:
+    knitr,
+    rmarkdown
+RoxygenNote: 7.3.1
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1a747f5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2022
+COPYRIGHT HOLDER: Jason Timm
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..cc0c2a0
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2022 Jason Timm
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..9e6b519
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,32 @@
+# Generated by roxygen2: do not edit by hand
+
+export(data_mesh_embeddings)
+export(data_mesh_thesaurus)
+export(data_mesh_trees)
+export(data_pharm_action)
+export(data_pmc_list)
+export(get_records)
+export(search_pubmed)
+import(data.table)
+importFrom(data.table,fread)
+importFrom(data.table,rbindlist)
+importFrom(data.table,setDT)
+importFrom(httr,GET)
+importFrom(httr,content)
+importFrom(jsonlite,stream_in)
+importFrom(parallel,clusterExport)
+importFrom(parallel,detectCores)
+importFrom(parallel,makeCluster)
+importFrom(parallel,stopCluster)
+importFrom(pbapply,pblapply)
+importFrom(rappdirs,user_data_dir)
+importFrom(rentrez,entrez_fetch)
+importFrom(rentrez,entrez_search)
+importFrom(utils,download.file)
+importFrom(utils,read.csv)
+importFrom(utils,untar)
+importFrom(xml2,read_xml)
+importFrom(xml2,xml_children)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_find_first)
+importFrom(xml2,xml_text)
diff --git a/R/data_mesh_embeddings.R b/R/data_mesh_embeddings.R
new file mode 100644
index 0000000..17786d1
--- /dev/null
+++ b/R/data_mesh_embeddings.R
@@ -0,0 +1,69 @@
+#' Download and Process MeSH and SCR Embeddings
+#'
+#' This function downloads MeSH and SCR embeddings data from the specified URLs and processes it for use.
+#' The data is saved locally in RDS format. If the files do not exist, they will be downloaded and processed.
+#'
+#' @return A data frame containing the processed MeSH and SCR embeddings data, or NULL if a download fails.
+#'
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_embeddings()
+#' }
+#' }
+data_mesh_embeddings <- function() {
+
+  # Define the URLs for MeSH and SCR embeddings data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_embeddings.rds?raw=true'
+  sf2 <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_scr_embeddings.rds?raw=true'
+
+  # Define local file paths for storing the processed data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_embeddings.rds')
+  df2 <- file.path(rappdirs::user_data_dir('puremoe'), 'data_scr_embeddings.rds')
+
+  # Check if the directory for data storage exists, and create it if not
+  if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+    dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+  }
+
+  # Download and process MeSH embeddings data if it doesn't exist
+  if (!file.exists(df)) {
+    message('Downloading the MeSH embeddings ...')
+    out <- tryCatch({
+      utils::download.file(sf, df)
+    }, error = function(e) paste("Error"))
+
+    if (out == 'Error') {
+      message('Download not completed ... Try options(timeout = 600)')
+      file.remove(df)
+    }
+  }
+
+  # Download and process SCR embeddings data if it doesn't exist
+  if (!file.exists(df2)) {
+    message('Downloading the SCR embeddings ...')
+    out <- tryCatch({
+      utils::download.file(sf2, df2)
+    }, error = function(e) paste("Error"))
+
+    if (out == 'Error') {
+      message('Download not completed ... Try options(timeout = 600)')
+      file.remove(df2)
+    }
+  }
+
+  # If both files exist, read and combine them
+  if (all(file.exists(df), file.exists(df2))) {
+    a1 <- readRDS(df)
+    a2 <- readRDS(df2)
+
+    result <- rbind(a1, a2)
+    return(result)
+  }
+
+  return(NULL)
+}
diff --git a/R/data_mesh_thesaurus.R b/R/data_mesh_thesaurus.R
new file mode 100644
index 0000000..4c5ba2a
--- /dev/null
+++ b/R/data_mesh_thesaurus.R
@@ -0,0 +1,57 @@
+#' Download and Combine MeSH and Supplemental Thesauruses
+#'
+#' This function downloads and combines the MeSH (Medical Subject Headings) Thesaurus
+#' and a supplemental concept thesaurus for use in biomedical research and analysis.
+#' The data is sourced from specified URLs and stored locally for subsequent use.
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally.
+#' @return A data.table containing the combined MeSH and supplemental thesaurus data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @importFrom data.table rbindlist
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_thesaurus()
+#' }
+#' }
+data_mesh_thesaurus <- function(force_download = FALSE) {
+
+  # URLs for the MeSH thesaurus and supplemental thesaurus data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_thesaurus.rds?raw=true'
+  sf2 <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_scr_thesaurus.rds?raw=true'
+
+  # Local file paths for storing the downloaded data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_thesaurus.rds')
+  df2 <- file.path(rappdirs::user_data_dir('puremoe'), 'data_scr_thesaurus.rds')
+
+  # Check for the existence of the files or force download
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the MeSH thesaurus data
+    message('Downloading the MeSH thesaurus ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Repeat the process for the supplemental concept thesaurus
+  if (!file.exists(df2) | force_download) {
+    message('Downloading the supplemental concept thesaurus ...')
+    utils::download.file(sf2, df2, mode = "wb")
+  }
+
+  # Read the downloaded RDS files
+  a1 <- readRDS(df)
+  a2 <- readRDS(df2)
+
+  # Ensure the column names are consistent between the two data sets
+  colnames(a2) <- colnames(a1)
+
+  # Combine the data using data.table's rbindlist
+  data.table::rbindlist(list(a1, a2))
+}
diff --git a/R/data_mesh_trees.R b/R/data_mesh_trees.R
new file mode 100644
index 0000000..5d41fee
--- /dev/null
+++ b/R/data_mesh_trees.R
@@ -0,0 +1,43 @@
+#' Download and Load MeSH Trees Data
+#'
+#' This function downloads and loads the MeSH (Medical Subject Headings) Trees data
+#' from a specified URL. The data is stored locally for future use. If the data already
+#' exists locally, the download can be skipped unless `force_download` is set to `TRUE`.
+#'
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally.
+#' @return A data frame containing the MeSH Trees data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_trees()
+#' }
+#' }
+data_mesh_trees <- function(force_download = FALSE) {
+
+  # Define the URL for the MeSH trees data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_trees.rds?raw=true'
+
+  # Determine the local file path for storing the data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_trees.rds')
+
+  # Check if the file exists or if forced download is requested
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the MeSH trees data
+    message('Downloading MeSH trees ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Read and return the downloaded RDS file
+  a1 <- readRDS(df)
+  return(a1)
+}
diff --git a/R/data_pharm_action.R b/R/data_pharm_action.R
new file mode 100644
index 0000000..f921065
--- /dev/null
+++ b/R/data_pharm_action.R
@@ -0,0 +1,44 @@
+#' Download and Load Pharmacological Actions Data
+#'
+#' This function downloads and loads pharmacological actions data from a specified URL.
+#' The data is stored locally in the user's data directory. If the data file does not
+#' exist locally or if `force_download` is TRUE, it will be downloaded. The function
+#' returns the data as a data frame.
+#'
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally. Default is FALSE.
+#' @return A data frame containing pharmacological actions data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_pharm_action()
+#' }
+#' }
+#' @export
+data_pharm_action <- function(force_download = FALSE) {
+
+  # URL for the pharmacological actions data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_pharm_action.rds?raw=true'
+
+  # Local file path for storing the data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_pharm_action.rds')
+
+  # Check if the data file exists, and download it if it doesn't or if forced
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the pharmacological actions data
+    message('Downloading pharmacological actions ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Read and return the downloaded RDS file
+  a1 <- readRDS(df)
+  return(a1)
+}
diff --git a/R/data_pmc_list.R b/R/data_pmc_list.R
new file mode 100644
index 0000000..0992628
--- /dev/null
+++ b/R/data_pmc_list.R
@@ -0,0 +1,66 @@
+#' Download and Process PMC Open Access File List
+#'
+#' This function downloads the PubMed Central (PMC) open access file list from the
+#' National Center for Biotechnology Information (NCBI) and processes it for use.
+#' The list is saved locally. If the file does not exist or if `force_install` is TRUE,
+#' it will be downloaded and processed.
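+#'
+#' @details The file list covers every article in the PMC Open Access subset, so
+#'   the initial download and processing can take a few minutes.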
+#'
+#' @param force_install Logical, if TRUE, forces the re-download and processing of
+#'   the file even if it already exists locally. Default is FALSE.
+#' @return A data frame containing the processed PMC open access file list.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom data.table fread
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_pmc_list()
+#' }
+#' }
+#' @export
+data_pmc_list <- function(force_install = FALSE) {
+
+  # URL for the PMC open access file list
+  sf <- 'https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt'
+
+  # Local file path for storing the processed data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'oa_file_list.rds')
+
+  # Define PMID and PMCID variables (silences R CMD check notes on data.table NSE)
+  PMID <- NULL
+  PMCID <- NULL
+
+  # Check if the file exists, and download and process it if it doesn't or if forced
+  if (!file.exists(df) | force_install) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    message('Downloading "pub/pmc/oa_file_list.txt" ...')
+    suppressWarnings({
+      # Read the file using data.table's fread
+      pmc <- data.table::fread(sf, sep = '\t')
+
+      # Set column names
+      colnames(pmc) <- c('fpath', 'journal', 'PMCID', 'PMID', 'license_type')
+
+      # Process PMCID and PMID columns
+      pmc[, PMID := gsub('^PMID:', '', PMID)]
+      pmc[, PMCID := gsub('^PMC', '', PMCID)]
+
+      # Replace empty strings with NA
+      pmc[pmc == ''] <- NA
+
+      # Save the processed data as an RDS file
+      saveRDS(pmc, df)
+    })
+  }
+
+  # Read and return the processed RDS file
+  pmc <- readRDS(df)
+  return(pmc)
+}
diff --git a/R/get_records.R b/R/get_records.R
new file mode 100644
index 0000000..9199141
--- /dev/null
+++ b/R/get_records.R
@@ -0,0 +1,86 @@
+#' Retrieve Data from NLM/PubMed Databases Based on PMIDs
+#'
+#' This function retrieves different types of data (like PubMed records, affiliations, iCites data, etc.) from PubMed based on provided PMIDs. It supports parallel processing for efficiency.
+#' @param pmids A vector of PMIDs for which data is to be retrieved.
+#' @param endpoint A character vector specifying the type of data to retrieve ('pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext').
+#' @param cores Number of cores to use for parallel processing (default is 3).
+#' @param ncbi_key (Optional) NCBI API key for authenticated access.
+#' @param sleep Duration (in seconds) to pause after each batch.
+#' @return A data.table containing combined results from the specified endpoint.
+#' @importFrom parallel makeCluster stopCluster detectCores clusterExport
+#' @importFrom pbapply pblapply
+#' @importFrom data.table rbindlist
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' pmids <- c("38136652", "31345328", "32496629")
+#' results <- get_records(pmids, endpoint = "pubmed_abstracts", cores = 1)
+#' }
+#' }
+#'
+get_records <- function(pmids,
+                        endpoint = c('pubtations',
+                                     'icites',
+                                     'pubmed_affiliations',
+                                     'pubmed_abstracts',
+                                     'pmc_fulltext'),
+                        cores = 3,
+                        sleep = 1,
+                        ncbi_key = NULL) {
+
+  # Input validation
+  if (!(is.character(pmids) || is.numeric(pmids)) || length(pmids) == 0) {
+    stop("pmids must be a non-empty vector of characters or numbers")
+  }
+
+  if (!is.character(endpoint) || length(endpoint) != 1 ||
+      !endpoint %in% c('pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext')) {
+    stop("Invalid endpoint. Must be one of 'pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext'")
+  }
+
+  if (!is.numeric(cores)) {
+    stop("cores must be numeric")
+  }
+
+  # Set the NCBI API key for authenticated access if provided
+  if (!is.null(ncbi_key)) rentrez::set_entrez_key(ncbi_key)
+
+  # Define batch size and the specific task function based on the chosen endpoint
+  batch_size <- if (endpoint == "pmc_fulltext") {5} else if (endpoint == "pubtations") {99} else {199}
+  task_function <- switch(endpoint,
+                          "icites" = .get_icites,
+                          "pubtations" = .get_pubtations,
+                          "pubmed_affiliations" = .get_affiliations,
+                          "pubmed_abstracts" = .get_records,
+                          "pmc_fulltext" = .get_pmc,
+                          stop("Invalid endpoint"))
+
+  # Split the PMIDs into batches for parallel processing
+  batches <- split(pmids, ceiling(seq_along(pmids) / batch_size))
+
+  if (cores > 1) {
+
+    # Parallel processing: Create a cluster and export necessary variables
+    clust <- parallel::makeCluster(cores)
+    parallel::clusterExport(cl = clust,
+                            varlist = c("task_function", "sleep"),
+                            envir = environment())
+
+    # Apply the task function to each batch with the sleep parameter, using parallel processing
+    results <- pbapply::pblapply(X = batches,
+                                 FUN = function(batch) task_function(batch, sleep),
+                                 cl = clust)
+    parallel::stopCluster(clust) # Stop the cluster after processing
+  } else {
+
+    # Sequential processing: Apply the task function to each batch with the sleep parameter
+    results <- lapply(batches, function(batch) task_function(batch, sleep))
+  }
+
+  # Keep only batches that returned a data.frame (failed batches may return NA)
+  df_only_list <- results[sapply(results, is.data.frame)]
+  # Combine results from all batches into a single data.table
+  combined_results <- data.table::rbindlist(df_only_list)
+  return(combined_results)
+}
diff --git a/R/search_pubmed.R b/R/search_pubmed.R
new file mode 100644
index 0000000..0adcb09
--- /dev/null
+++ b/R/search_pubmed.R
@@ -0,0 +1,80 @@
+#' Search PubMed Records
+#'
+#' Performs a PubMed search based on a query, optionally filtered by publication years.
+#' Returns a unique set of PubMed IDs matching the query.
+#'
+#' @param x Character string, the search query.
+#' @param start_year Integer, the start year of publication date range (used if `use_pub_years` is TRUE).
+#' @param end_year Integer, the end year of publication date range (used if `use_pub_years` is TRUE).
+#' @param retmax Integer, maximum number of records to retrieve, defaults to 9999.
+#' @param use_pub_years Logical, whether to filter search by publication years, defaults to TRUE.
+#' @return Character vector of unique PubMed IDs.
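+#' @details When `use_pub_years` is TRUE, the search is issued once per year in
+#'   `start_year:end_year` and the yearly results are pooled and de-duplicated;
+#'   this helps keep each individual request under the `retmax` cap.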
+#' @importFrom rentrez entrez_search
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' ethnob1 <- search_pubmed("ethnobotany", 2010, 2012)
+#' ethnob2 <- search_pubmed("ethnobotany", use_pub_years = FALSE)
+#' }
+#' }
+#'
+search_pubmed <- function(x,
+                          start_year = NULL,
+                          end_year = NULL,
+                          retmax = 9999,
+                          use_pub_years = TRUE) {
+
+  if(!is.character(x) || length(x) != 1) {
+    stop("x must be a single character string.")
+  }
+
+  if(use_pub_years) {
+    if(is.null(start_year) || is.null(end_year)) {
+      stop("start_year and end_year must be provided when use_pub_years is TRUE.")
+    }
+    if(!is.numeric(start_year) || !is.numeric(end_year) || length(start_year) != 1 || length(end_year) != 1) {
+      stop("start_year and end_year must be single integers.")
+    }
+    if(start_year > end_year) {
+      stop("start_year must be less than or equal to end_year.")
+    }
+
+    all_ids <- vector("list", length = end_year - start_year + 1)
+    names(all_ids) <- as.character(start_year:end_year)
+
+    for (year in start_year:end_year) {
+      query <- paste0(x, " AND ", year, "[Pub Date]")
+      all_ids[[as.character(year)]] <- .perform_search(query, retmax)
+    }
+  } else {
+    all_ids <- list(all_years = .perform_search(x, retmax))
+  }
+
+  return(unique(unlist(all_ids, use.names = FALSE)))
+}
+
+#' Internal Function for PubMed Search
+#'
+#' Handles querying of the PubMed database and returns search results.
+#' This function is used internally by 'search_pubmed'.
+#'
+#' @param query Character string containing the PubMed search query.
+#' @param retmax Integer specifying the maximum number of records to retrieve.
+#' @noRd
+.perform_search <- function(query, retmax) {
+  result <- tryCatch({
+    res <- rentrez::entrez_search(db = "pubmed", term = query, retmax = retmax, use_history = TRUE)
+    if (res$count > 0) res$ids else NULL
+  }, error = function(e) {
+    warning(sprintf("Failed to retrieve data for query '%s': %s", query, e$message))
+    NULL
+  })
+
+  # Pause briefly between queries to respect NCBI rate limits
+  Sys.sleep(0.5)
+  result
+}
diff --git a/R/source_affiliations.R b/R/source_affiliations.R
new file mode 100644
index 0000000..83dc699
--- /dev/null
+++ b/R/source_affiliations.R
@@ -0,0 +1,69 @@
+#' Internal: Extract Author Affiliations from PubMed Records
+#'
+#' Function queries PubMed to extract author affiliations from the fetched records. It processes XML records to obtain detailed information about authors, including their names and affiliations.
+#' @param x A character vector with search terms or IDs for fetching records from PubMed.
+#' @return A data.table consisting of PubMed IDs, author names, and their affiliations.
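+#'   Each record contributes one row per listed author; missing author names or
+#'   affiliations are returned as NA.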
+#' @importFrom xml2 xml_find_all xml_text +#' @importFrom data.table rbindlist +#' @noRd +#' +#' +.get_affiliations <- function (x, sleep) { + + # Fetch records from PubMed based on the input x + records <- .fetch_records(x, sleep) + + # Process each PubMed record to extract author affiliations + z <- lapply(records, function(g){ + + # Extract the PubMed ID from the record + pm <- xml2::xml_text(xml2::xml_find_all(g, ".//MedlineCitation/PMID")) + + # Find all author elements in the record + auts <- xml2::xml_find_all(g, ".//Author") + + # Process each author element + cache <- lapply(auts, function(k){ + # Extract and concatenate the last name and first name of the author + Author <- paste( + xml2::xml_text(xml2::xml_find_all(k, ".//LastName")), + xml2::xml_text(xml2::xml_find_all(k, ".//ForeName")), + sep = ', ') + + # Handle cases where the author name is missing + if(length(Author) == 0){Author <- NA} + + # Extract the affiliation information of the author + Affiliation <- xml2::xml_text(xml2::xml_find_all(k, ".//Affiliation")) + + # Handle cases where the affiliation is missing + if(length(Affiliation) == 0){Affiliation <- NA} + + # Create a data frame with PubMed ID, Author name, and Affiliation + data.frame(pmid = pm, Author, Affiliation) + }) + + # Combine all author information into a single data.table + data.table::rbindlist(cache) + }) + + # Combine all records into one data.table + x0 <- data.table::rbindlist(z) + + # Return the final data.table with author affiliations + return(x0) +} + + + +# #### clean -- +# .clean_affiliations <- function(x){ +# +# x[, Affiliation := sub('^.*?([A-Z])','\\1', Affiliation)] +# x[, Affiliation := trimws(Affiliation)] +# x[, Affiliation := gsub('(^.*[[:punct:] ])(.*@.*$)', '\\1', Affiliation)] +# x[, Affiliation := gsub('(^.*[[:punct:] ])(.*@.*$)', '\\1', Affiliation)] +# x[, Affiliation := gsub('electronic address.*$|email.*$', '', Affiliation, ignore.case = T)] +# x[, Affiliation := ifelse(nchar(Affiliation) < 10, NA, Affiliation)] +# return(x) +# } diff --git a/R/source_icites.R b/R/source_icites.R new file mode 100644 index 0000000..7a09d20 --- /dev/null +++ b/R/source_icites.R @@ -0,0 +1,103 @@ +#' Internal: Fetch Data from iCite Database +#' +#' This internal function is designed to scrape data from the iCite database, a bibliometric tool provided by the NIH. It constructs a URL to query iCite with specified PubMed IDs and retrieves citation metrics and other related data. +#' @param x A vector of PubMed IDs for which data is to be fetched from the iCite database. +#' @return A data.frame consisting of the data retrieved from iCite, formatted as CSV. +#' @importFrom httr GET content +#' @importFrom utils read.csv +#' @noRd +#' + +.fetch_icites <- function(x, sleep){ + + # Construct the URL for the iCite API call, including the PubMed IDs (x) + url0 <- httr::GET(paste0("https://icite.od.nih.gov/api/pubs?pmids=", + paste(x, collapse = ","), + "&format=csv")) + + # Note: There is no error handling here, which could be a point of improvement. + + # Read the content of the response as a CSV. + csv_ <- utils::read.csv(textConnection( + httr::content(url0, + "text", + encoding = "UTF-8")), + encoding = "UTF-8") + + Sys.sleep(sleep) + # Return the CSV content as a data.frame + return(csv_) +} + + + + +#' Process and Structure Data from iCite +#' +#' Function processes and structures the data obtained via `.fetch_icites`. +#' @param x A vector of PubMed IDs for which data has been fetched from the iCite database. 
+#' @return A data.table enhanced with citation network information and cleaned reference and citation data.
+#' @importFrom data.table setDT
+#' @noRd
+#'
+#'
+.get_icites <- function(x, sleep){
+
+  # Fetch data from iCite using the PubMed IDs provided
+  pmiddf <- .fetch_icites(x, sleep)
+
+  # Extract the PubMed IDs for reference
+  gots <- pmiddf$pmid
+
+  # Convert pmiddf to a data.table for efficient data manipulation
+  data.table::setDT(pmiddf)
+
+  # Preserve the raw reference string before placeholder substitution (NA when absent)
+  ref_count <- NULL
+  pmiddf[, ref_count := ifelse(is.na(references), NA_character_, references)]
+
+  # Process 'references' and 'cited_by' columns, replacing empty or NA values
+  # with the placeholder '99'
+  pmiddf[, references := ifelse(nchar(references) == 0 | is.na(references), '99', references)]
+  pmiddf[, cited_by := ifelse(nchar(cited_by) == 0 | is.na(cited_by), '99', cited_by)]
+
+  # Split the 'cited_by' and 'references' columns into lists
+  cited_by <- strsplit(pmiddf$cited_by, split = " ")
+  references <- strsplit(pmiddf$references, split = " ")
+  rs <- strsplit(pmiddf$ref_count, split = " ")
+
+  # Build a data frame for references
+  doc_id <- NULL
+  from <- NULL
+  refs <- data.table::data.table(doc_id = rep(gots, sapply(references, length)),
+                                 from = rep(gots, sapply(references, length)),
+                                 to = unlist(references))
+  # Replace placeholder '99' with NA
+  refs[refs == 99] <- NA
+
+  # Build a data frame for cited_by data
+  cited <- data.frame(doc_id = rep(gots, sapply(cited_by, length)),
+                      from = unlist(cited_by),
+                      to = rep(gots, sapply(cited_by, length)))
+  # Replace placeholder '99' with NA
+  cited[cited == 99] <- NA
+
+  # Combine references and cited_by data
+  f1 <- rbind(refs, cited)
+  # Aggregate the combined data and format as a list within a data.table
+  f2 <- data.table::setDT(f1)[, list(references = list(.SD)), by = doc_id]
+
+  # Add citation network data to pmiddf
+  citation_net <- NULL
+  pmiddf[, citation_net := f2$references]
+  # Calculate and add reference count (0 when no references were listed)
+  pmiddf[, ref_count := sapply(rs, function(r) sum(!is.na(r)))]
+  # Remove the original 'cited_by' and 'references' columns
+  pmiddf[, c('cited_by', 'references') := NULL]
+
+  # Return the processed data table
+  pmiddf[, c(1, 6:25)]
+}
+
diff --git a/R/source_pmc.R b/R/source_pmc.R
new file mode 100644
index 0000000..bb814b0
--- /dev/null
+++ b/R/source_pmc.R
@@ -0,0 +1,79 @@
+#' Scrape Full Text Entries from PubMed Central (PMC)
+#'
+#' This function retrieves full-text articles from PMC using provided PMC identifiers. It downloads and parses XML files to extract article sections and their corresponding text.
+#' @param x A vector of PMC identifiers for which full-text articles are to be retrieved.
+#' @return A data.table with columns for document ID, PMC identifier, section titles, and text content of each section.
+#' @importFrom xml2 read_xml xml_children xml_find_first xml_text
+#' @importFrom utils untar
+#' @noRd
+#'
+#'
+.get_pmc <- function(x, sleep) {
+
+  # Initialize an empty list to store the scraped data
+  flist <- list()
+
+  # Loop over each PMC identifier
+  for(q in seq_along(x)){
+
+    # Construct the file URL for the given PMC identifier
+    fn <- paste0('https://ftp.ncbi.nlm.nih.gov/pub/pmc/', x[q])
+
+    # Create a temporary file to store the downloaded content
+    tmp <- tempfile()
+
+    # Try to download the file, handling errors gracefully
+    dd <- tryCatch(utils::download.file(fn, destfile = tmp),
+                   error = function(e) 'error')
+
+    # If download is successful, proceed with extraction
+    if(dd != 'error'){
+
+      # Find XML files in the downloaded content
+      xmls <- grep('xml$', utils::untar(tmp, list = TRUE), value = TRUE)
+
+      # Extract the XML files to a temporary directory
+      untar(tmp, files = xmls, exdir = tempdir())
+
+      # Read the first XML file and pull the record's PMID
+      x0 <- xml2::read_xml(paste0(tempdir(), '/', xmls)[1])
+      pmid <- xml2::xml_find_first(x0, ".//article-meta//article-id[@pub-id-type='pmid']") |>
+        xml2::xml_text()
+
+      # Check if there are multiple children nodes in the XML
+      if(length(xml2::xml_children(x0)) > 1){
+
+        # Extract the second child node (assuming it contains the relevant content)
+        x1 <- xml2::xml_child(x0, 2)
+
+        # Extract titles of different sections in the article
+        header_titles <- lapply(xml2::xml_children(x1),
+                                function(x) {
+                                  xml2::xml_text(xml2::xml_find_first(x, ".//title"))}
+        )
+
+        # Extract the text of each section
+        text <- lapply(xml2::xml_children(x1), xml2::xml_text)
+
+        # Unlist the section titles
+        section <- unlist(header_titles)
+
+        # Combine the data into a data frame
+        df <- data.frame(pmid,
+                         section,
+                         text = unlist(text),
+                         row.names = NULL)
+
+        # Format the text for readability
+        df$text <- gsub('([a-z]+)([A-Z])', '\\1\n\\2', df$text)
+
+        # Add the data frame to the list
+        flist[[q]] <- df
+      }
+    }
+    Sys.sleep(sleep)
+  }
+
+  # Combine all data frames into one data.table and return
+  return(flist |> data.table::rbindlist())
+}
diff --git a/R/source_pubmed.R b/R/source_pubmed.R
new file mode 100644
index 0000000..5214f82
--- /dev/null
+++ b/R/source_pubmed.R
@@ -0,0 +1,176 @@
+#' Get PubMed Records
+#'
+#' Processes XML records obtained from PubMed. It extracts basic bibliographic information and annotations for each record.
+#' @param x A character vector with search terms or IDs for fetching records from PubMed.
+#' @return A data.table with columns for PubMed IDs, publication year, journal name, article title, abstract, and annotations.
+#' @noRd +.get_records <- function (x, sleep) { + + # Fetch records using .fetch_records function and parse XML content + records <- .fetch_records(x, sleep) + + # Process each record to extract basic information and annotations + parsed_records <- lapply(records, function(x){ + # Extract basic bibliographic information from the record + basic_info <- .extract_basic(x) + # Extract annotations (like MeSH terms) from the record + annotations <- .extract_annotations(x) + + # Combine basic information and annotations into a list + out1 <- list('basic_info' = basic_info, 'annotations' = annotations) + return(out1) + }) + + # Convert the list of basic information into a tidy format + sum0 <- textshape::tidy_list(x = lapply(parsed_records, '[[', 1), + id.name = 'id', + content.name = 'varx') + + # Reshape the data into a wide format using data.table + id <- NULL + sum1 <- data.table::dcast(data = sum0, + formula = id ~ attribute, + value.var = 'varx') + + sum1 <- sum1[order(as.numeric(id))] + + # Select and reorder columns for the final output + sum1 <- sum1[, c('pmid', 'year', 'journal', 'articletitle', 'abstract')] + + # Add annotations to the data table + annotations <- NULL + sum1[, annotations := list(lapply(parsed_records, '[[', 2))] + + # Ensure proper encoding for compatibility + Encoding(rownames(sum1)) <- 'UTF-8' + + # Clean up NA values and return the final data table + cols <- colnames(sum1) + sum1[, c(cols) := lapply(.SD, .clean_nas), .SDcols = cols] + + return(sum1) +} + + + + + +#' Extract Basic Information from PubMed Records +#' +#' An internal function that parses XML records from PubMed. It extracts essential bibliographic information such as PubMed ID, journal title, article title, publication year, and abstract. +#' @param g An XML node set representing a single PubMed record. +#' @return A named vector with basic bibliographic information from a PubMed record. +#' @noRd + + +.extract_basic <- function(g){ + + # Extract the PubMed ID (PMID) from the XML + pm <- xml2::xml_find_all(g, ".//MedlineCitation/PMID") |> xml2::xml_text() + + # Extract the journal title + a1 <- xml2::xml_find_all(g, ".//Title") |> xml2::xml_text() + a1a <- a1[1] # In case there are multiple titles, use the first one + + # Extract the article title + a2 <- xml2::xml_find_all(g, ".//ArticleTitle") |> xml2::xml_text() + + # Extract publication type + #pub_type <- xml2::xml_find_all(g, ".//PublicationType") |> xml2::xml_text() + + # Extract the publication year. If 'Year' is not available, use 'MedlineDate' as a fallback + year <- xml2::xml_find_all(g, ".//PubDate/Year") |> xml2::xml_text() + if(length(year) == 0){ + year <- xml2::xml_find_all(g, ".//PubDate/MedlineDate") |> xml2::xml_text() + } + # Clean up the year to remove any extra characters or ranges + year <- gsub(" .+", "", year) + year <- gsub("-.+", "", year) + + # Extract the abstract text, combining multiple parts if necessary + abstract <- xml2::xml_find_all(g, ".//Abstract/AbstractText") |> xml2::xml_text() + + if(length(abstract) > 1){ + abstract <- paste(abstract, collapse = ' ')} + if(length(abstract) == 0){abstract <- NA} + + abstract <- .reformat_abstract(abstract) + # Construct the output with the extracted information + out <- c('pmid' = pm, + 'journal' = a1a, + #'pubtype' = pub_type, + 'articletitle' = a2, + 'year' = year, + 'abstract' = abstract) + + return(out) +} + + + +#' Extract Annotations from PubMed Records +#' +#' Parses XML records from PubMed to extract annotations such as MeSH terms, chemical names, and keywords. 
+#' @param g An XML node set representing a single PubMed record. +#' @return A data frame with annotations extracted from a PubMed record. +#' @noRd +.extract_annotations <- function(g){ + + # Extract the PubMed ID (PMID) from the XML record + pm <- xml2::xml_find_all(g, ".//MedlineCitation/PMID") |> xml2::xml_text() + + # Extract MeSH terms (Medical Subject Headings) + meshes <- xml2::xml_find_all(g, ".//DescriptorName") |> xml2::xml_text() + + # Extract chemical substances names + chems <- xml2::xml_find_all(g, ".//NameOfSubstance") |> xml2::xml_text() + + # Extract keywords from the record + keys <- xml2::xml_find_all(g, ".//Keyword") |> xml2::xml_text() + + # Combine the extracted data into a single data frame + # Create separate data frames for MeSH terms, chemical substances, and keywords, and then bind them together + df0 <- rbind( + data.frame(pmid = pm, type = 'MeSH', form = if(length(meshes) > 0){meshes} else{NA}), + data.frame(pmid = pm, type = 'Chemistry', form = if(length(chems) > 0){chems} else{NA}), + data.frame(pmid = pm, type = 'Keyword', form = if(length(keys) > 0){keys} else{NA}) + ) + + # Return the combined annotations data frame + return(df0) +} + + + +#' Reformat Abstract Text +#' +#' Internal function to reformat an abstract by inserting newlines before each section title. +#' It handles abstracts with or without section titles and trims whitespace from each section. +#' Returns NA if the input is NA. +#' +#' @param abstract A character string representing the abstract text. +#' +#' @return A character string of the reformatted abstract with newlines before each section title, or NA if the input is NA. +#' +#' @noRd +.reformat_abstract <- function(abstract) { + if (is.na(abstract)) { + return(NA) + } + + if (!is.character(abstract)) { + stop("Abstract must be a character string.", call. = FALSE) + } + + # Regular expression to match section titles (e.g., "Methodology and Results:") + # This pattern matches 1-3 words, each word starting with an uppercase letter or all words being uppercase + pattern_title <- "(^|\\.\\s+)(([A-Z][a-z]*|[A-Z]+)(\\s([A-Z][a-z]*|[A-Z]+)){0,2}):" + + # Use the pattern to insert a newline before each title and split the abstract into sections + split_abstract <- strsplit(gsub(pattern_title, "\n\\2:", abstract), "\n")[[1]] + + # Combine the sections back into a single string + formatted_abstract <- paste(split_abstract, collapse = "\n") + + return(formatted_abstract) +} \ No newline at end of file diff --git a/R/source_pubtations.R b/R/source_pubtations.R new file mode 100644 index 0000000..0806576 --- /dev/null +++ b/R/source_pubtations.R @@ -0,0 +1,115 @@ +#' Extract Named Entities from PubMed's PubTator3 Tool +#' +#' This function retrieves named entity annotations from PubMed's PubTator3 tool. It fetches data using PubMed IDs and processes the JSON response into a structured format. +#' @param x A vector of PubMed IDs for which annotations are to be retrieved from PubTator. +#' @return A data.table, or NA if no data is available, with columns for PubMed ID, title or abstract location, annotation text, start and end positions of annotations, and annotation types. 
+#' @importFrom jsonlite stream_in
+#' @importFrom data.table rbindlist
+#' @noRd
+#'
+.get_pubtations <- function(x, sleep){
+
+  # PubTator3 API reference:
+  # https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocxml?pmids=29355051&full=true
+
+  # Connect to the PubTator3 API and retrieve annotations as BioC JSON
+  con <- url(paste0("https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids=", paste(x, collapse = ',')))
+
+  # Read the JSON data stream, returning NA on failure
+  mydata <- tryCatch(
+    jsonlite::stream_in(con),
+    error = function(e) NA)
+
+  # Process the data if valid, else return NA
+  if(!is.data.frame(mydata)){jj0 <- NA} else{
+    jj <- list()
+
+    # Iterate over each record to extract and format annotations
+    for(i in 1:nrow(mydata)){
+
+      # Extract annotations for titles and abstracts
+      pb1 <- mydata$passages[[i]]$annotations
+      names(pb1) <- c('title', 'abstract')
+
+      # Process title annotations
+      if(any(nrow(pb1$title) == 0, is.null(nrow(pb1$title)))) {
+        pb1$title <- data.frame(tiab = 'title',
+                                id = NA,
+                                text = NA,
+                                locations = NA,
+                                identifier = NA,
+                                type = NA)
+      } else{
+
+        if (!("identifier" %in% names(pb1[["title"]]$infons))) {
+          pb1[["title"]]$infons$identifier <- NA
+        }
+
+        pb1$title <- cbind(tiab = 'title',
+                           pb1$title[, c('id', 'text', 'locations')],
+                           identifier = pb1$title$infons$identifier,
+                           type = pb1$title$infons$type)
+      }
+
+      # Process abstract annotations
+      if(any(nrow(pb1$abstract) == 0, is.null(nrow(pb1$abstract)))) {
+        pb1$abstract <- data.frame(tiab = 'abstract',
+                                   id = NA,
+                                   text = NA,
+                                   locations = NA,
+                                   identifier = NA,
+                                   type = NA)
+      } else{
+
+        if (!("identifier" %in% names(pb1[["abstract"]]$infons))) {
+          pb1[["abstract"]]$infons$identifier <- NA
+        }
+
+        pb1$abstract <- cbind(tiab = 'abstract',
+                              pb1$abstract[, c('id', 'text', 'locations')],
+                              identifier = pb1$abstract$infons$identifier,
+                              type = pb1$abstract$infons$type)
+      }
+
+      # Combine title and abstract annotations
+      jj[[i]] <- rbind(pb1$title, pb1$abstract)
+    }
+
+    # Require a data.frame for every record; otherwise bail out
+    if (!all(sapply(jj, is.data.frame))) {
+      return(NA)
+    } else {
+      names(jj) <- mydata$id
+      jj0 <- jj |> data.table::rbindlist(idcol = 'pmid')
+    }
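+
+    # PubTator encodes each annotation span as an 'offset,length' string in
+    # 'locations'; the steps below strip non-digits and derive integer start
+    # and end positions, with end = start + length.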
+ } + + + # Clean and format location data + jj0$locations <- jj0$locations |> as.character() + jj0$locations <- gsub("[^[:digit:],]", "", jj0$locations) + + # Extract start and end positions of annotations + start <- NULL + end <- NULL + locations <- NULL + jj0[, c('start', 'length') := data.table::tstrsplit(locations, ",", fixed=TRUE)] + jj0[, start := as.integer(start)] + jj0[, end := start + as.integer(length)] + + # Clean up temporary columns + jj0[, length := NULL] + jj0[, locations := NULL] + } + + Sys.sleep(sleep) + + # Return the processed annotations data + return(jj0) +} diff --git a/R/utils-data-table.R b/R/utils-data-table.R new file mode 100644 index 0000000..d2f2964 --- /dev/null +++ b/R/utils-data-table.R @@ -0,0 +1,12 @@ +# data.table is generally careful to minimize the scope for namespace +# conflicts (i.e., functions with the same name as in other packages); +# a more conservative approach using @importFrom should be careful to +# import any needed data.table special symbols as well, e.g., if you +# run DT[ , .N, by='grp'] in your package, you'll need to add +# @importFrom data.table .N to prevent the NOTE from R CMD check. +# See ?data.table::`special-symbols` for the list of such symbols +# data.table defines; see the 'Importing data.table' vignette for more +# advice (vignette('datatable-importing', 'data.table')). +# +#' @import data.table +NULL diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..7e6e7ae --- /dev/null +++ b/R/utils.R @@ -0,0 +1,54 @@ +#' Fetch Batch of PubMed Records as XML +#' +#' This function attempts to fetch batches of PubMed records in XML format. It retries multiple times in case of failures. +#' @param x A vector of PubMed record identifiers to be fetched. +#' @return A character string with XML content of PubMed records, or an error object in case of failure. +#' @importFrom rentrez entrez_fetch +#' @noRd +#' +#' +.fetch_records <- function(x, sleep) { + # Loop to retry fetching records, with a maximum of 15 attempts + for (i in 1:15) { + # Display the current attempt number + #message(i) + + # Try fetching records using rentrez::entrez_fetch + x1 <- try({ + rentrez::entrez_fetch( + db = "pubmed", + id = x, + rettype = "xml", + parsed = FALSE + ) + }) + + # Wait for 5 seconds before the next attempt + Sys.sleep(sleep) + + # Check if the fetch was successful using inherits(), and if so, break the loop + if (!inherits(x1, "try-error")) { + break + } + } + + # Return the fetched XML content or an error object + doc <- xml2::read_xml(x1) + xml2::xml_find_all(doc, "//PubmedArticle") +} + + + +#' Clean Missing or Invalid Values in Data +#' +#' This function standardizes the representation of missing or invalid values in data by replacing specific character representations of missing data (' ', 'NA', 'n/a', 'n/a.') with R's standard `NA`. +#' @param x A vector that may contain missing or invalid values represented in various formats. +#' @return A vector with standardized missing values represented as `NA`. 
+#' @noRd
+#'
+#'
+.clean_nas <- function(x) {
+
+  # Replace specific character representations of missing data with NA
+  ifelse(x %in% c(' ', 'NA', 'n/a', 'n/a.') | is.na(x), NA, x)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5019560
--- /dev/null
+++ b/README.md
@@ -0,0 +1,107 @@
+[![R build
+status](https://github.com/jaytimm/puremoe/workflows/R-CMD-check/badge.svg)](https://github.com/jaytimm/puremoe/actions)
+
+# puremoe
+
+> **P**ubMed **U**nified **RE**trieval for **M**ulti-**O**utput
+> **E**xploration
+
+An R package that provides a single interface for accessing a range of
+NLM/PubMed databases, including
+[PubMed](https://pubmed.ncbi.nlm.nih.gov/) abstract records,
+[iCite](https://icite.od.nih.gov/) bibliometric data,
+[PubTator3](https://www.ncbi.nlm.nih.gov/research/pubtator3/) named
+entity annotations, and full-text entries from [PubMed
+Central](https://www.ncbi.nlm.nih.gov/pmc/) (PMC). This unified
+interface simplifies the data retrieval process, allowing users to
+interact with multiple PubMed services/APIs/output formats through a
+single R function.
+
+The package also includes MeSH thesaurus resources as simple data
+frames, including Descriptor Terms, Descriptor Tree Structures,
+Supplementary Concept Terms, and Pharmacological Actions, as well as
+descriptor-level word embeddings [(Noh & Kavuluru
+2021)](https://www.sciencedirect.com/science/article/pii/S1532046421001969),
+all built via the
+[mesh-resources](https://github.com/jaytimm/mesh-resources) library.
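+
+These resources load via simple accessor functions; a quick sketch (each
+call downloads its data once and caches it locally via `rappdirs`):
+
+``` r
+th <- puremoe::data_mesh_thesaurus()   ## MeSH + supplemental concept terms
+tr <- puremoe::data_mesh_trees()       ## descriptor tree structures
+pa <- puremoe::data_pharm_action()     ## pharmacological actions
+em <- puremoe::data_mesh_embeddings()  ## descriptor-level word embeddings
+```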
+
+## Installation
+
+You can download the development version from GitHub with:
+
+``` r
+devtools::install_github("jaytimm/puremoe")
+```
+
+## Usage
+
+### PubMed search
+
+The package has two basic functions: `search_pubmed` and `get_records`.
+The former fetches PMIDs from the PubMed API based on a user search; the
+latter retrieves record-level data for those PMIDs from a user-specified
+PubMed endpoint – `pubmed_abstracts`, `pubmed_affiliations`, `pubtations`,
+`icites`, or `pmc_fulltext`.
+
+Search syntax is the same as that implemented in standard [PubMed
+search](https://pubmed.ncbi.nlm.nih.gov/advanced/).
+
+``` r
+pmids <- puremoe::search_pubmed('("political ideology"[TiAb])',
+                                use_pub_years = FALSE)
+
+# pmids <- puremoe::search_pubmed('immunity',
+#                                 use_pub_years = TRUE,
+#                                 start_year = 2022,
+#                                 end_year = 2024)
+```
+
+### Get record-level data
+
+``` r
+pubmed <- pmids |>
+  puremoe::get_records(endpoint = 'pubmed_abstracts',
+                       cores = 3,
+                       sleep = 1)
+
+affiliations <- pmids |>
+  puremoe::get_records(endpoint = 'pubmed_affiliations',
+                       cores = 1,
+                       sleep = 0.5)
+
+icites <- pmids |>
+  puremoe::get_records(endpoint = 'icites',
+                       cores = 3,
+                       sleep = 0.25)
+
+pubtations <- pmids |>
+  puremoe::get_records(endpoint = 'pubtations',
+                       cores = 2)
+```
+
+> When the endpoint is `pmc_fulltext`, the `get_records()` function takes a
+> vector of file paths (from the PMC Open Access list) instead of PMIDs.
+
+``` r
+pmclist <- puremoe::data_pmc_list(force_install = FALSE)
+pmc_pmids <- pmclist[PMID %in% pmids]
+
+pmc_fulltext <- pmc_pmids$fpath[1:5] |>
+  puremoe::get_records(endpoint = 'pmc_fulltext', cores = 2)
+```
+
+## Summary
+
+In short: `search_pubmed()` finds PMIDs, `get_records()` retrieves
+record-level data for them, and a handful of `data_*` functions provide
+locally cached MeSH and PMC reference tables.
diff --git a/man/data_mesh_embeddings.Rd b/man/data_mesh_embeddings.Rd
new file mode 100644
index 0000000..c742a42
--- /dev/null
+++ b/man/data_mesh_embeddings.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_embeddings.R
+\name{data_mesh_embeddings}
+\alias{data_mesh_embeddings}
+\title{Download and Process MeSH and SCR Embeddings}
+\usage{
+data_mesh_embeddings()
+}
+\value{
+A data frame containing the processed MeSH and SCR embeddings data, or NULL if a download fails.
+}
+\description{
+This function downloads MeSH and SCR embeddings data from the specified URLs and processes it for use.
+The data is saved locally in RDS format. If the files do not exist, they will be downloaded and processed.
+}
+\examples{
+\donttest{
+if (interactive()) {
+# Code that downloads data or performs other interactive-only operations
+data <- data_mesh_embeddings()
+}
+}
+
+}
diff --git a/man/data_mesh_thesaurus.Rd b/man/data_mesh_thesaurus.Rd
new file mode 100644
index 0000000..dabdfd0
--- /dev/null
+++ b/man/data_mesh_thesaurus.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_thesaurus.R
+\name{data_mesh_thesaurus}
+\alias{data_mesh_thesaurus}
+\title{Download and Combine MeSH and Supplemental Thesauruses}
+\usage{
+data_mesh_thesaurus(force_download = FALSE)
+}
+\arguments{
+\item{force_download}{A logical value indicating whether to force re-downloading
+of the data even if it already exists locally.}
+}
+\value{
+A data.table containing the combined MeSH and supplemental thesaurus data.
+}
+\description{
+This function downloads and combines the MeSH (Medical Subject Headings) Thesaurus
+and a supplemental concept thesaurus for use in biomedical research and analysis.
+The data is sourced from specified URLs and stored locally for subsequent use.
+}
+\examples{
+\donttest{
+if (interactive()) {
+# Code that downloads data or performs other interactive-only operations
+data <- data_mesh_thesaurus()
+}
+}
+}
diff --git a/man/data_mesh_trees.Rd b/man/data_mesh_trees.Rd
new file mode 100644
index 0000000..74431b1
--- /dev/null
+++ b/man/data_mesh_trees.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_trees.R
+\name{data_mesh_trees}
+\alias{data_mesh_trees}
+\title{Download and Load MeSH Trees Data}
+\usage{
+data_mesh_trees(force_download = FALSE)
+}
+\arguments{
+\item{force_download}{A logical value indicating whether to force re-downloading
+of the data even if it already exists locally.}
+}
+\value{
+A data frame containing the MeSH Trees data.
+}
+\description{
+This function downloads and loads the MeSH (Medical Subject Headings) Trees data
+from a specified URL. The data is stored locally for future use. If the data already
+exists locally, the download can be skipped unless `force_download` is set to `TRUE`.
+} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_mesh_trees() +} +} +} diff --git a/man/data_pharm_action.Rd b/man/data_pharm_action.Rd new file mode 100644 index 0000000..621ead0 --- /dev/null +++ b/man/data_pharm_action.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_pharm_action.R +\name{data_pharm_action} +\alias{data_pharm_action} +\title{Download and Load Pharmacological Actions Data} +\usage{ +data_pharm_action(force_download = FALSE) +} +\arguments{ +\item{force_download}{A logical value indicating whether to force re-downloading +of the data even if it already exists locally. Default is FALSE.} +} +\value{ +A data frame containing pharmacological actions data. +} +\description{ +This function downloads and loads pharmacological actions data from a specified URL. +The data is stored locally in the user's data directory. If the data file does not +exist locally or if `force_download` is TRUE, it will be downloaded. The function +returns the data as a data frame. +} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_mesh_embeddings() +} +} +} diff --git a/man/data_pmc_list.Rd b/man/data_pmc_list.Rd new file mode 100644 index 0000000..fe4401b --- /dev/null +++ b/man/data_pmc_list.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_pmc_list.R +\name{data_pmc_list} +\alias{data_pmc_list} +\title{Download and Process PMC Open Access File List} +\usage{ +data_pmc_list(force_install = FALSE) +} +\arguments{ +\item{force_install}{Logical, if TRUE, forces the re-download and processing of +the file even if it already exists locally. Default is FALSE.} +} +\value{ +A data frame containing the processed PMC open access file list. +} +\description{ +This function downloads the PubMed Central (PMC) open access file list from the +National Center for Biotechnology Information (NCBI) and processes it for use. +The list is saved locally. If the file does not exist or if `force_install` is TRUE, +it will be downloaded and processed. +} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_pmc_list() +} +} +} diff --git a/man/dot-extract_basic.Rd b/man/dot-extract_basic.Rd new file mode 100644 index 0000000..c369faa --- /dev/null +++ b/man/dot-extract_basic.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/source_pubmed.R +\name{.extract_basic} +\alias{.extract_basic} +\title{Extract Basic Information from PubMed Records} +\usage{ +.extract_basic(g) +} +\arguments{ +\item{g}{An XML node set representing a single PubMed record.} +} +\value{ +A named vector with basic bibliographic information from a PubMed record. +} +\description{ +An internal function that parses XML records from PubMed. It extracts essential bibliographic information such as PubMed ID, journal title, article title, publication year, and abstract. 
+} diff --git a/man/get_records.Rd b/man/get_records.Rd new file mode 100644 index 0000000..22943f4 --- /dev/null +++ b/man/get_records.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_records.R +\name{get_records} +\alias{get_records} +\title{Retrieve Data from NLM/PubMed databases Based on PMIDs} +\usage{ +get_records( + pmids, + endpoint = c("pubtations", "icites", "pubmed_affiliations", "pubmed_abstracts", "pmc"), + cores = 3, + sleep = 1, + ncbi_key = NULL +) +} +\arguments{ +\item{pmids}{A vector of PMIDs for which data is to be retrieved.} + +\item{endpoint}{A character vector specifying the type of data to retrieve ('pubtations', 'icites', 'affiliations', 'pubmed', 'pmc').} + +\item{cores}{Number of cores to use for parallel processing (default is 3).} + +\item{sleep}{Duration (in seconds) to pause after each batch} + +\item{ncbi_key}{(Optional) NCBI API key for authenticated access.} +} +\value{ +A data.table containing combined results from the specified endpoint. +} +\description{ +This function retrieves different types of data (like PubMed records, affiliations, iCites data, etc.) from PubMed based on provided PMIDs. It supports parallel processing for efficiency. +} +\examples{ +\donttest{ +if (interactive()) { +pmids <- c("38136652", "31345328", "32496629") +results <- get_records(pmids, endpoint = "pubmed_abstracts", cores = 1) +} +} + +} diff --git a/man/search_pubmed.Rd b/man/search_pubmed.Rd new file mode 100644 index 0000000..d8e8882 --- /dev/null +++ b/man/search_pubmed.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_pubmed.R +\name{search_pubmed} +\alias{search_pubmed} +\title{Search PubMed Records} +\usage{ +search_pubmed( + x, + start_year = NULL, + end_year = NULL, + retmax = 9999, + use_pub_years = TRUE +) +} +\arguments{ +\item{x}{Character string, the search query.} + +\item{start_year}{Integer, the start year of publication date range (used if `use_pub_years` is TRUE).} + +\item{end_year}{Integer, the end year of publication date range (used if `use_pub_years` is TRUE).} + +\item{retmax}{Integer, maximum number of records to retrieve, defaults to 9999.} + +\item{use_pub_years}{Logical, whether to filter search by publication years, defaults to TRUE.} +} +\value{ +Numeric vector of unique PubMed IDs. +} +\description{ +Performs a PubMed search based on a query, optionally filtered by publication years. +Returns a unique set of PubMed IDs matching the query. +} +\examples{ +\donttest{ +if (interactive()) { +ethnob1 <- search_pubmed("ethnobotany", 2010, 2012) +ethnob2 <- search_pubmed("ethnobotany", use_pub_years = FALSE) +} +} + +} diff --git a/pubmedtk.Rproj b/pubmedtk.Rproj new file mode 100644 index 0000000..21a4da0 --- /dev/null +++ b/pubmedtk.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source