diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..ed69db7 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,4 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.github$ +LICENSE.md diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/check-standard.yaml b/.github/workflows/check-standard.yaml new file mode 100644 index 0000000..a3ac618 --- /dev/null +++ b/.github/workflows/check-standard.yaml @@ -0,0 +1,49 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: macos-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + R_KEEP_PKG_SOURCE: yes + + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + http-user-agent: ${{ matrix.config.http-user-agent }} + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + upload-snapshots: true diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..ed7650c --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,48 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + release: + types: [published] + workflow_dispatch: + +name: pkgdown + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. + needs: website + + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + if: github.event_name != 'pull_request' + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + clean: false + branch: gh-pages + folder: docs diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..27d4528 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,50 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: test-coverage
+
+jobs:
+  test-coverage:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::covr
+          needs: coverage
+
+      - name: Test coverage
+        run: |
+          covr::codecov(
+            quiet = FALSE,
+            clean = FALSE,
+            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
+          )
+        shell: Rscript {0}
+
+      - name: Show testthat output
+        if: always()
+        run: |
+          ## --------------------------------------------------------------------
+          find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
+        shell: bash
+
+      - name: Upload test results
+        if: failure()
+        uses: actions/upload-artifact@v3
+        with:
+          name: coverage-test-failures
+          path: ${{ runner.temp }}/package
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..55e750c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+README.Rmd
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..bf88dd2
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: puremoe
+Type: Package
+Title: PubMed Unified REtrieval for Multi-Output Exploration
+Version: 1.0.0
+Author: Jason Timm [aut, cre]
+Maintainer: Jason Timm
+Description: An R package for accessing a variety of PubMed data, including abstracts, bibliometrics, pubtations, and full-text records, through a single, user-friendly interface.
+License: MIT + file LICENSE
+Encoding: UTF-8
+LazyData: false
+Depends:
+    R (>= 3.5)
+Imports:
+    rentrez,
+    textshape,
+    xml2,
+    data.table,
+    httr,
+    pbapply,
+    jsonlite,
+    parallel,
+    rappdirs
+Suggests:
+    knitr,
+    rmarkdown
+RoxygenNote: 7.3.1
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1a747f5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2022
+COPYRIGHT HOLDER: Jason Timm
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..cc0c2a0
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+
+Copyright (c) 2022 Jason Timm
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..9e6b519
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,32 @@
+# Generated by roxygen2: do not edit by hand
+
+export(data_mesh_embeddings)
+export(data_mesh_thesaurus)
+export(data_mesh_trees)
+export(data_pharm_action)
+export(data_pmc_list)
+export(get_records)
+export(search_pubmed)
+import(data.table)
+importFrom(data.table,fread)
+importFrom(data.table,rbindlist)
+importFrom(data.table,setDT)
+importFrom(httr,GET)
+importFrom(httr,content)
+importFrom(jsonlite,stream_in)
+importFrom(parallel,clusterExport)
+importFrom(parallel,detectCores)
+importFrom(parallel,makeCluster)
+importFrom(parallel,stopCluster)
+importFrom(pbapply,pblapply)
+importFrom(rappdirs,user_data_dir)
+importFrom(rentrez,entrez_fetch)
+importFrom(rentrez,entrez_search)
+importFrom(utils,download.file)
+importFrom(utils,read.csv)
+importFrom(utils,untar)
+importFrom(xml2,read_xml)
+importFrom(xml2,xml_children)
+importFrom(xml2,xml_find_all)
+importFrom(xml2,xml_find_first)
+importFrom(xml2,xml_text)
diff --git a/R/data_mesh_embeddings.R b/R/data_mesh_embeddings.R
new file mode 100644
index 0000000..17786d1
--- /dev/null
+++ b/R/data_mesh_embeddings.R
@@ -0,0 +1,69 @@
+#' Download and Process MeSH and SCR Embeddings
+#'
+#' This function downloads MeSH and SCR embeddings data from the specified URLs and processes it for use.
+#' The data is saved locally in RDS format. If the files do not exist, they will be downloaded and processed.
+#'
+#' @return A data frame containing the processed MeSH and SCR embeddings data, or NULL if a download fails.
+#'
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_embeddings()
+#' }
+#' }
+data_mesh_embeddings <- function() {
+
+  # Define the URLs for MeSH and SCR embeddings data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_embeddings.rds?raw=true'
+  sf2 <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_scr_embeddings.rds?raw=true'
+
+  # Define local file paths for storing the processed data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_embeddings.rds')
+  df2 <- file.path(rappdirs::user_data_dir('puremoe'), 'data_scr_embeddings.rds')
+
+  # Check if the directory for data storage exists, and create it if not
+  if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+    dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+  }
+
+  # Download and process MeSH embeddings data if it doesn't exist
+  if (!file.exists(df)) {
+    message('Downloading the MeSH embeddings ...')
+    out <- tryCatch({
+      utils::download.file(sf, df)
+    }, error = function(e) paste("Error"))
+
+    if (out == 'Error') {
+      message('Download not completed ... Try options(timeout = 600)')
+      file.remove(df)
+    }
+  }
+
+  # Download and process SCR embeddings data if it doesn't exist
+  if (!file.exists(df2)) {
+    message('Downloading the SCR embeddings ...')
+    out <- tryCatch({
+      utils::download.file(sf2, df2)
+    }, error = function(e) paste("Error"))
+
+    if (out == 'Error') {
+      message('Download not completed ... Try options(timeout = 600)')
+      file.remove(df2)
+    }
+  }
+
+  # If both files exist, read and combine them
+  if (all(file.exists(df), file.exists(df2))) {
+    a1 <- readRDS(df)
+    a2 <- readRDS(df2)
+
+    result <- rbind(a1, a2)
+    return(result)
+  }
+
+  return(NULL)
+}
diff --git a/R/data_mesh_thesaurus.R b/R/data_mesh_thesaurus.R
new file mode 100644
index 0000000..4c5ba2a
--- /dev/null
+++ b/R/data_mesh_thesaurus.R
@@ -0,0 +1,57 @@
+#' Download and Combine MeSH and Supplemental Thesauruses
+#'
+#' This function downloads and combines the MeSH (Medical Subject Headings) Thesaurus
+#' and a supplemental concept thesaurus for use in biomedical research and analysis.
+#' The data is sourced from specified URLs and stored locally for subsequent use.
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally.
+#' @return A data.table containing the combined MeSH and supplemental thesaurus data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @importFrom data.table rbindlist
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_thesaurus()
+#' }
+#' }
+data_mesh_thesaurus <- function(force_download = FALSE) {
+
+  # URLs for the MeSH thesaurus and supplemental thesaurus data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_thesaurus.rds?raw=true'
+  sf2 <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_scr_thesaurus.rds?raw=true'
+
+  # Local file paths for storing the downloaded data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_thesaurus.rds')
+  df2 <- file.path(rappdirs::user_data_dir('puremoe'), 'data_scr_thesaurus.rds')
+
+  # Check for the existence of the files or force download
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the MeSH thesaurus data
+    message('Downloading the MeSH thesaurus ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Repeat the process for the supplemental concept thesaurus
+  if (!file.exists(df2) | force_download) {
+    message('Downloading the supplemental concept thesaurus ...')
+    utils::download.file(sf2, df2, mode = "wb")
+  }
+
+  # Read the downloaded RDS files
+  a1 <- readRDS(df)
+  a2 <- readRDS(df2)
+
+  # Ensure the column names are consistent between the two data sets
+  colnames(a2) <- colnames(a1)
+
+  # Combine the data using data.table's rbindlist
+  data.table::rbindlist(list(a1, a2))
+}
diff --git a/R/data_mesh_trees.R b/R/data_mesh_trees.R
new file mode 100644
index 0000000..5d41fee
--- /dev/null
+++ b/R/data_mesh_trees.R
@@ -0,0 +1,43 @@
+#' Download and Load MeSH Trees Data
+#'
+#' This function downloads and loads the MeSH (Medical Subject Headings) Trees data
+#' from a specified URL. The data is stored locally for future use. If the data already
+#' exists locally, the download can be skipped unless `force_download` is set to `TRUE`.
+#'
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally.
+#' @return A data frame containing the MeSH Trees data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_mesh_trees()
+#' }
+#' }
+data_mesh_trees <- function(force_download = FALSE) {
+
+  # Define the URL for the MeSH trees data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_mesh_trees.rds?raw=true'
+
+  # Determine the local file path for storing the data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_mesh_trees.rds')
+
+  # Check if the file exists or if forced download is requested
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the MeSH trees data
+    message('Downloading MeSH trees ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Read and return the downloaded RDS file
+  a1 <- readRDS(df)
+  return(a1)
+}
diff --git a/R/data_pharm_action.R b/R/data_pharm_action.R
new file mode 100644
index 0000000..f921065
--- /dev/null
+++ b/R/data_pharm_action.R
@@ -0,0 +1,44 @@
+#' Download and Load Pharmacological Actions Data
+#'
+#' This function downloads and loads pharmacological actions data from a specified URL.
+#' The data is stored locally in the user's data directory. If the data file does not
+#' exist locally or if `force_download` is TRUE, it will be downloaded. The function
+#' returns the data as a data frame.
+#'
+#' @param force_download A logical value indicating whether to force re-downloading
+#'   of the data even if it already exists locally. Default is FALSE.
+#' @return A data frame containing pharmacological actions data.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom utils download.file
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_pharm_action()
+#' }
+#' }
+#' @export
+data_pharm_action <- function(force_download = FALSE) {
+
+  # URL for the pharmacological actions data
+  sf <- 'https://github.com/jaytimm/mesh-builds/blob/main/data/data_pharm_action.rds?raw=true'
+
+  # Local file path for storing the data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'data_pharm_action.rds')
+
+  # Check if the data file exists, and download it if it doesn't or if forced
+  if (!file.exists(df) | force_download) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    # Download the pharmacological actions data
+    message('Downloading pharmacological actions ...')
+    utils::download.file(sf, df, mode = "wb")
+  }
+
+  # Read and return the downloaded RDS file
+  a1 <- readRDS(df)
+  return(a1)
+}
diff --git a/R/data_pmc_list.R b/R/data_pmc_list.R
new file mode 100644
index 0000000..0992628
--- /dev/null
+++ b/R/data_pmc_list.R
@@ -0,0 +1,66 @@
+#' Download and Process PMC Open Access File List
+#'
+#' This function downloads the PubMed Central (PMC) open access file list from the
+#' National Center for Biotechnology Information (NCBI) and processes it for use.
+#' The list is saved locally. If the file does not exist or if `force_install` is TRUE,
+#' it will be downloaded and processed.
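+#'
+#' @details The file list covers every article in the PMC Open Access subset, so
+#'   the initial download and processing can take a few minutes.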
+#'
+#' @param force_install Logical, if TRUE, forces the re-download and processing of
+#'   the file even if it already exists locally. Default is FALSE.
+#' @return A data frame containing the processed PMC open access file list.
+#' @importFrom rappdirs user_data_dir
+#' @importFrom data.table fread
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' # Code that downloads data or performs other interactive-only operations
+#' data <- data_pmc_list()
+#' }
+#' }
+#' @export
+data_pmc_list <- function(force_install = FALSE) {
+
+  # URL for the PMC open access file list
+  sf <- 'https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt'
+
+  # Local file path for storing the processed data
+  df <- file.path(rappdirs::user_data_dir('puremoe'), 'oa_file_list.rds')
+
+  # Define PMID and PMCID variables (silences R CMD check notes on data.table NSE)
+  PMID <- NULL
+  PMCID <- NULL
+
+  # Check if the file exists, and download and process it if it doesn't or if forced
+  if (!file.exists(df) | force_install) {
+    # Create the directory if it doesn't exist
+    if (!dir.exists(rappdirs::user_data_dir('puremoe'))) {
+      dir.create(rappdirs::user_data_dir('puremoe'), recursive = TRUE)
+    }
+
+    message('Downloading "pub/pmc/oa_file_list.txt" ...')
+    suppressWarnings({
+      # Read the file using data.table's fread
+      pmc <- data.table::fread(sf, sep = '\t')
+
+      # Set column names
+      colnames(pmc) <- c('fpath', 'journal', 'PMCID', 'PMID', 'license_type')
+
+      # Process PMCID and PMID columns
+      pmc[, PMID := gsub('^PMID:', '', PMID)]
+      pmc[, PMCID := gsub('^PMC', '', PMCID)]
+
+      # Replace empty strings with NA
+      pmc[pmc == ''] <- NA
+
+      # Save the processed data as an RDS file
+      saveRDS(pmc, df)
+    })
+  }
+
+  # Read and return the processed RDS file
+  pmc <- readRDS(df)
+  return(pmc)
+}
diff --git a/R/get_records.R b/R/get_records.R
new file mode 100644
index 0000000..9199141
--- /dev/null
+++ b/R/get_records.R
@@ -0,0 +1,86 @@
+#' Retrieve Data from NLM/PubMed Databases Based on PMIDs
+#'
+#' This function retrieves different types of data (like PubMed records, affiliations, iCites data, etc.) from PubMed based on provided PMIDs. It supports parallel processing for efficiency.
+#' @param pmids A vector of PMIDs for which data is to be retrieved.
+#' @param endpoint A character vector specifying the type of data to retrieve ('pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext').
+#' @param cores Number of cores to use for parallel processing (default is 3).
+#' @param ncbi_key (Optional) NCBI API key for authenticated access.
+#' @param sleep Duration (in seconds) to pause after each batch.
+#' @return A data.table containing combined results from the specified endpoint.
+#' @importFrom parallel makeCluster stopCluster detectCores clusterExport
+#' @importFrom pbapply pblapply
+#' @importFrom data.table rbindlist
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' pmids <- c("38136652", "31345328", "32496629")
+#' results <- get_records(pmids, endpoint = "pubmed_abstracts", cores = 1)
+#' }
+#' }
+#'
+get_records <- function(pmids,
+                        endpoint = c('pubtations',
+                                     'icites',
+                                     'pubmed_affiliations',
+                                     'pubmed_abstracts',
+                                     'pmc_fulltext'),
+                        cores = 3,
+                        sleep = 1,
+                        ncbi_key = NULL) {
+
+  # Input validation
+  if (!(is.character(pmids) || is.numeric(pmids)) || length(pmids) == 0) {
+    stop("pmids must be a non-empty vector of characters or numbers")
+  }
+
+  if (!is.character(endpoint) || length(endpoint) != 1 ||
+      !endpoint %in% c('pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext')) {
+    stop("Invalid endpoint. Must be one of 'pubtations', 'icites', 'pubmed_affiliations', 'pubmed_abstracts', 'pmc_fulltext'")
+  }
+
+  if (!is.numeric(cores)) {
+    stop("cores must be numeric")
+  }
+
+  # Set the NCBI API key for authenticated access if provided
+  if (!is.null(ncbi_key)) rentrez::set_entrez_key(ncbi_key)
+
+  # Define batch size and the specific task function based on the chosen endpoint
+  batch_size <- if (endpoint == "pmc_fulltext") {5} else if (endpoint == "pubtations") {99} else {199}
+  task_function <- switch(endpoint,
+                          "icites" = .get_icites,
+                          "pubtations" = .get_pubtations,
+                          "pubmed_affiliations" = .get_affiliations,
+                          "pubmed_abstracts" = .get_records,
+                          "pmc_fulltext" = .get_pmc,
+                          stop("Invalid endpoint"))
+
+  # Split the PMIDs into batches for parallel processing
+  batches <- split(pmids, ceiling(seq_along(pmids) / batch_size))
+
+  if (cores > 1) {
+
+    # Parallel processing: Create a cluster and export necessary variables
+    clust <- parallel::makeCluster(cores)
+    parallel::clusterExport(cl = clust,
+                            varlist = c("task_function", "sleep"),
+                            envir = environment())
+
+    # Apply the task function to each batch with the sleep parameter, using parallel processing
+    results <- pbapply::pblapply(X = batches,
+                                 FUN = function(batch) task_function(batch, sleep),
+                                 cl = clust)
+    parallel::stopCluster(clust) # Stop the cluster after processing
+  } else {
+
+    # Sequential processing: Apply the task function to each batch with the sleep parameter
+    results <- lapply(batches, function(batch) task_function(batch, sleep))
+  }
+
+  # Keep only batches that returned a data.frame (failed batches may return NA)
+  df_only_list <- results[sapply(results, is.data.frame)]
+  # Combine results from all batches into a single data.table
+  combined_results <- data.table::rbindlist(df_only_list)
+  return(combined_results)
+}
diff --git a/R/search_pubmed.R b/R/search_pubmed.R
new file mode 100644
index 0000000..0adcb09
--- /dev/null
+++ b/R/search_pubmed.R
@@ -0,0 +1,80 @@
+#' Search PubMed Records
+#'
+#' Performs a PubMed search based on a query, optionally filtered by publication years.
+#' Returns a unique set of PubMed IDs matching the query.
+#'
+#' @param x Character string, the search query.
+#' @param start_year Integer, the start year of publication date range (used if `use_pub_years` is TRUE).
+#' @param end_year Integer, the end year of publication date range (used if `use_pub_years` is TRUE).
+#' @param retmax Integer, maximum number of records to retrieve, defaults to 9999.
+#' @param use_pub_years Logical, whether to filter search by publication years, defaults to TRUE.
+#' @return Character vector of unique PubMed IDs.
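+#' @details When `use_pub_years` is TRUE, the search is issued once per year in
+#'   `start_year:end_year` and the yearly results are pooled and de-duplicated;
+#'   this helps keep each individual request under the `retmax` cap.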
+#' @importFrom rentrez entrez_search
+#' @export
+#' @examples
+#' \donttest{
+#' if (interactive()) {
+#' ethnob1 <- search_pubmed("ethnobotany", 2010, 2012)
+#' ethnob2 <- search_pubmed("ethnobotany", use_pub_years = FALSE)
+#' }
+#' }
+#'
+search_pubmed <- function(x,
+                          start_year = NULL,
+                          end_year = NULL,
+                          retmax = 9999,
+                          use_pub_years = TRUE) {
+
+  if(!is.character(x) || length(x) != 1) {
+    stop("x must be a single character string.")
+  }
+
+  if(use_pub_years) {
+    if(is.null(start_year) || is.null(end_year)) {
+      stop("start_year and end_year must be provided when use_pub_years is TRUE.")
+    }
+    if(!is.numeric(start_year) || !is.numeric(end_year) || length(start_year) != 1 || length(end_year) != 1) {
+      stop("start_year and end_year must be single integers.")
+    }
+    if(start_year > end_year) {
+      stop("start_year must be less than or equal to end_year.")
+    }
+
+    all_ids <- vector("list", length = end_year - start_year + 1)
+    names(all_ids) <- as.character(start_year:end_year)
+
+    for (year in start_year:end_year) {
+      query <- paste0(x, " AND ", year, "[Pub Date]")
+      all_ids[[as.character(year)]] <- .perform_search(query, retmax)
+    }
+  } else {
+    all_ids <- list(all_years = .perform_search(x, retmax))
+  }
+
+  return(unique(unlist(all_ids, use.names = FALSE)))
+}
+
+#' Internal Function for PubMed Search
+#'
+#' Handles querying of the PubMed database and returns search results.
+#' This function is used internally by 'search_pubmed'.
+#'
+#' @param query Character string containing the PubMed search query.
+#' @param retmax Integer specifying the maximum number of records to retrieve.
+#' @noRd
+.perform_search <- function(query, retmax) {
+  result <- tryCatch({
+    res <- rentrez::entrez_search(db = "pubmed", term = query, retmax = retmax, use_history = TRUE)
+    if (res$count > 0) res$ids else NULL
+  }, error = function(e) {
+    warning(sprintf("Failed to retrieve data for query '%s': %s", query, e$message))
+    NULL
+  })
+
+  # Pause briefly between queries to respect NCBI rate limits
+  Sys.sleep(0.5)
+  result
+}
diff --git a/R/source_affiliations.R b/R/source_affiliations.R
new file mode 100644
index 0000000..83dc699
--- /dev/null
+++ b/R/source_affiliations.R
@@ -0,0 +1,69 @@
+#' Internal: Extract Author Affiliations from PubMed Records
+#'
+#' Function queries PubMed to extract author affiliations from the fetched records. It processes XML records to obtain detailed information about authors, including their names and affiliations.
+#' @param x A character vector with search terms or IDs for fetching records from PubMed.
+#' @return A data.table consisting of PubMed IDs, author names, and their affiliations.
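+#'   Each record contributes one row per listed author; missing author names or
+#'   affiliations are returned as NA.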
+#' @importFrom xml2 xml_find_all xml_text +#' @importFrom data.table rbindlist +#' @noRd +#' +#' +.get_affiliations <- function (x, sleep) { + + # Fetch records from PubMed based on the input x + records <- .fetch_records(x, sleep) + + # Process each PubMed record to extract author affiliations + z <- lapply(records, function(g){ + + # Extract the PubMed ID from the record + pm <- xml2::xml_text(xml2::xml_find_all(g, ".//MedlineCitation/PMID")) + + # Find all author elements in the record + auts <- xml2::xml_find_all(g, ".//Author") + + # Process each author element + cache <- lapply(auts, function(k){ + # Extract and concatenate the last name and first name of the author + Author <- paste( + xml2::xml_text(xml2::xml_find_all(k, ".//LastName")), + xml2::xml_text(xml2::xml_find_all(k, ".//ForeName")), + sep = ', ') + + # Handle cases where the author name is missing + if(length(Author) == 0){Author <- NA} + + # Extract the affiliation information of the author + Affiliation <- xml2::xml_text(xml2::xml_find_all(k, ".//Affiliation")) + + # Handle cases where the affiliation is missing + if(length(Affiliation) == 0){Affiliation <- NA} + + # Create a data frame with PubMed ID, Author name, and Affiliation + data.frame(pmid = pm, Author, Affiliation) + }) + + # Combine all author information into a single data.table + data.table::rbindlist(cache) + }) + + # Combine all records into one data.table + x0 <- data.table::rbindlist(z) + + # Return the final data.table with author affiliations + return(x0) +} + + + +# #### clean -- +# .clean_affiliations <- function(x){ +# +# x[, Affiliation := sub('^.*?([A-Z])','\\1', Affiliation)] +# x[, Affiliation := trimws(Affiliation)] +# x[, Affiliation := gsub('(^.*[[:punct:] ])(.*@.*$)', '\\1', Affiliation)] +# x[, Affiliation := gsub('(^.*[[:punct:] ])(.*@.*$)', '\\1', Affiliation)] +# x[, Affiliation := gsub('electronic address.*$|email.*$', '', Affiliation, ignore.case = T)] +# x[, Affiliation := ifelse(nchar(Affiliation) < 10, NA, Affiliation)] +# return(x) +# } diff --git a/R/source_icites.R b/R/source_icites.R new file mode 100644 index 0000000..7a09d20 --- /dev/null +++ b/R/source_icites.R @@ -0,0 +1,103 @@ +#' Internal: Fetch Data from iCite Database +#' +#' This internal function is designed to scrape data from the iCite database, a bibliometric tool provided by the NIH. It constructs a URL to query iCite with specified PubMed IDs and retrieves citation metrics and other related data. +#' @param x A vector of PubMed IDs for which data is to be fetched from the iCite database. +#' @return A data.frame consisting of the data retrieved from iCite, formatted as CSV. +#' @importFrom httr GET content +#' @importFrom utils read.csv +#' @noRd +#' + +.fetch_icites <- function(x, sleep){ + + # Construct the URL for the iCite API call, including the PubMed IDs (x) + url0 <- httr::GET(paste0("https://icite.od.nih.gov/api/pubs?pmids=", + paste(x, collapse = ","), + "&format=csv")) + + # Note: There is no error handling here, which could be a point of improvement. + + # Read the content of the response as a CSV. + csv_ <- utils::read.csv(textConnection( + httr::content(url0, + "text", + encoding = "UTF-8")), + encoding = "UTF-8") + + Sys.sleep(sleep) + # Return the CSV content as a data.frame + return(csv_) +} + + + + +#' Process and Structure Data from iCite +#' +#' Function processes and structures the data obtained via `.fetch_icites`. +#' @param x A vector of PubMed IDs for which data has been fetched from the iCite database. 
+#' @return A data.table enhanced with citation network information and cleaned reference and citation data.
+#' @importFrom data.table setDT
+#' @noRd
+#'
+#'
+.get_icites <- function(x, sleep){
+
+  # Fetch data from iCite using the PubMed IDs provided
+  pmiddf <- .fetch_icites(x, sleep)
+
+  # Extract the PubMed IDs for reference
+  gots <- pmiddf$pmid
+
+  # Convert pmiddf to a data.table for efficient data manipulation
+  data.table::setDT(pmiddf)
+
+  # Preserve the raw reference string before placeholder substitution (NA when absent)
+  ref_count <- NULL
+  pmiddf[, ref_count := ifelse(is.na(references), NA_character_, references)]
+
+  # Process 'references' and 'cited_by' columns, replacing empty or NA values
+  # with the placeholder '99'
+  pmiddf[, references := ifelse(nchar(references) == 0 | is.na(references), '99', references)]
+  pmiddf[, cited_by := ifelse(nchar(cited_by) == 0 | is.na(cited_by), '99', cited_by)]
+
+  # Split the 'cited_by' and 'references' columns into lists
+  cited_by <- strsplit(pmiddf$cited_by, split = " ")
+  references <- strsplit(pmiddf$references, split = " ")
+  rs <- strsplit(pmiddf$ref_count, split = " ")
+
+  # Build a data frame for references
+  doc_id <- NULL
+  from <- NULL
+  refs <- data.table::data.table(doc_id = rep(gots, sapply(references, length)),
+                                 from = rep(gots, sapply(references, length)),
+                                 to = unlist(references))
+  # Replace placeholder '99' with NA
+  refs[refs == 99] <- NA
+
+  # Build a data frame for cited_by data
+  cited <- data.frame(doc_id = rep(gots, sapply(cited_by, length)),
+                      from = unlist(cited_by),
+                      to = rep(gots, sapply(cited_by, length)))
+  # Replace placeholder '99' with NA
+  cited[cited == 99] <- NA
+
+  # Combine references and cited_by data
+  f1 <- rbind(refs, cited)
+  # Aggregate the combined data and format as a list within a data.table
+  f2 <- data.table::setDT(f1)[, list(references = list(.SD)), by = doc_id]
+
+  # Add citation network data to pmiddf
+  citation_net <- NULL
+  pmiddf[, citation_net := f2$references]
+  # Calculate and add reference count (0 when no references were listed)
+  pmiddf[, ref_count := sapply(rs, function(r) sum(!is.na(r)))]
+  # Remove the original 'cited_by' and 'references' columns
+  pmiddf[, c('cited_by', 'references') := NULL]
+
+  # Return the processed data table
+  pmiddf[, c(1, 6:25)]
+}
+
diff --git a/R/source_pmc.R b/R/source_pmc.R
new file mode 100644
index 0000000..bb814b0
--- /dev/null
+++ b/R/source_pmc.R
@@ -0,0 +1,79 @@
+#' Scrape Full Text Entries from PubMed Central (PMC)
+#'
+#' This function retrieves full-text articles from PMC using provided PMC identifiers. It downloads and parses XML files to extract article sections and their corresponding text.
+#' @param x A vector of PMC identifiers for which full-text articles are to be retrieved.
+#' @return A data.table with columns for document ID, PMC identifier, section titles, and text content of each section.
+#' @importFrom xml2 read_xml xml_children xml_find_first xml_text
+#' @importFrom utils untar
+#' @noRd
+#'
+#'
+.get_pmc <- function(x, sleep) {
+
+  # Initialize an empty list to store the scraped data
+  flist <- list()
+
+  # Loop over each PMC identifier
+  for(q in seq_along(x)){
+
+    # Construct the file URL for the given PMC identifier
+    fn <- paste0('https://ftp.ncbi.nlm.nih.gov/pub/pmc/', x[q])
+
+    # Create a temporary file to store the downloaded content
+    tmp <- tempfile()
+
+    # Try to download the file, handling errors gracefully
+    dd <- tryCatch(utils::download.file(fn, destfile = tmp),
+                   error = function(e) 'error')
+
+    # If download is successful, proceed with extraction
+    if(dd != 'error'){
+
+      # Find XML files in the downloaded content
+      xmls <- grep('xml$', utils::untar(tmp, list = TRUE), value = TRUE)
+
+      # Extract the XML files to a temporary directory
+      untar(tmp, files = xmls, exdir = tempdir())
+
+      # Read the first XML file and pull the record's PMID
+      x0 <- xml2::read_xml(paste0(tempdir(), '/', xmls)[1])
+      pmid <- xml2::xml_find_first(x0, ".//article-meta//article-id[@pub-id-type='pmid']") |>
+        xml2::xml_text()
+
+      # Check if there are multiple children nodes in the XML
+      if(length(xml2::xml_children(x0)) > 1){
+
+        # Extract the second child node (assuming it contains the relevant content)
+        x1 <- xml2::xml_child(x0, 2)
+
+        # Extract titles of different sections in the article
+        header_titles <- lapply(xml2::xml_children(x1),
+                                function(x) {
+                                  xml2::xml_text(xml2::xml_find_first(x, ".//title"))}
+        )
+
+        # Extract the text of each section
+        text <- lapply(xml2::xml_children(x1), xml2::xml_text)
+
+        # Unlist the section titles
+        section <- unlist(header_titles)
+
+        # Combine the data into a data frame
+        df <- data.frame(pmid,
+                         section,
+                         text = unlist(text),
+                         row.names = NULL)
+
+        # Format the text for readability
+        df$text <- gsub('([a-z]+)([A-Z])', '\\1\n\\2', df$text)
+
+        # Add the data frame to the list
+        flist[[q]] <- df
+      }
+    }
+    Sys.sleep(sleep)
+  }
+
+  # Combine all data frames into one data.table and return
+  return(flist |> data.table::rbindlist())
+}
diff --git a/R/source_pubmed.R b/R/source_pubmed.R
new file mode 100644
index 0000000..5214f82
--- /dev/null
+++ b/R/source_pubmed.R
@@ -0,0 +1,176 @@
+#' Get PubMed Records
+#'
+#' Processes XML records obtained from PubMed. It extracts basic bibliographic information and annotations for each record.
+#' @param x A character vector with search terms or IDs for fetching records from PubMed.
+#' @return A data.table with columns for PubMed IDs, publication year, journal name, article title, abstract, and annotations.
+#' @noRd +.get_records <- function (x, sleep) { + + # Fetch records using .fetch_records function and parse XML content + records <- .fetch_records(x, sleep) + + # Process each record to extract basic information and annotations + parsed_records <- lapply(records, function(x){ + # Extract basic bibliographic information from the record + basic_info <- .extract_basic(x) + # Extract annotations (like MeSH terms) from the record + annotations <- .extract_annotations(x) + + # Combine basic information and annotations into a list + out1 <- list('basic_info' = basic_info, 'annotations' = annotations) + return(out1) + }) + + # Convert the list of basic information into a tidy format + sum0 <- textshape::tidy_list(x = lapply(parsed_records, '[[', 1), + id.name = 'id', + content.name = 'varx') + + # Reshape the data into a wide format using data.table + id <- NULL + sum1 <- data.table::dcast(data = sum0, + formula = id ~ attribute, + value.var = 'varx') + + sum1 <- sum1[order(as.numeric(id))] + + # Select and reorder columns for the final output + sum1 <- sum1[, c('pmid', 'year', 'journal', 'articletitle', 'abstract')] + + # Add annotations to the data table + annotations <- NULL + sum1[, annotations := list(lapply(parsed_records, '[[', 2))] + + # Ensure proper encoding for compatibility + Encoding(rownames(sum1)) <- 'UTF-8' + + # Clean up NA values and return the final data table + cols <- colnames(sum1) + sum1[, c(cols) := lapply(.SD, .clean_nas), .SDcols = cols] + + return(sum1) +} + + + + + +#' Extract Basic Information from PubMed Records +#' +#' An internal function that parses XML records from PubMed. It extracts essential bibliographic information such as PubMed ID, journal title, article title, publication year, and abstract. +#' @param g An XML node set representing a single PubMed record. +#' @return A named vector with basic bibliographic information from a PubMed record. +#' @noRd + + +.extract_basic <- function(g){ + + # Extract the PubMed ID (PMID) from the XML + pm <- xml2::xml_find_all(g, ".//MedlineCitation/PMID") |> xml2::xml_text() + + # Extract the journal title + a1 <- xml2::xml_find_all(g, ".//Title") |> xml2::xml_text() + a1a <- a1[1] # In case there are multiple titles, use the first one + + # Extract the article title + a2 <- xml2::xml_find_all(g, ".//ArticleTitle") |> xml2::xml_text() + + # Extract publication type + #pub_type <- xml2::xml_find_all(g, ".//PublicationType") |> xml2::xml_text() + + # Extract the publication year. If 'Year' is not available, use 'MedlineDate' as a fallback + year <- xml2::xml_find_all(g, ".//PubDate/Year") |> xml2::xml_text() + if(length(year) == 0){ + year <- xml2::xml_find_all(g, ".//PubDate/MedlineDate") |> xml2::xml_text() + } + # Clean up the year to remove any extra characters or ranges + year <- gsub(" .+", "", year) + year <- gsub("-.+", "", year) + + # Extract the abstract text, combining multiple parts if necessary + abstract <- xml2::xml_find_all(g, ".//Abstract/AbstractText") |> xml2::xml_text() + + if(length(abstract) > 1){ + abstract <- paste(abstract, collapse = ' ')} + if(length(abstract) == 0){abstract <- NA} + + abstract <- .reformat_abstract(abstract) + # Construct the output with the extracted information + out <- c('pmid' = pm, + 'journal' = a1a, + #'pubtype' = pub_type, + 'articletitle' = a2, + 'year' = year, + 'abstract' = abstract) + + return(out) +} + + + +#' Extract Annotations from PubMed Records +#' +#' Parses XML records from PubMed to extract annotations such as MeSH terms, chemical names, and keywords. 
+#' @param g An XML node set representing a single PubMed record. +#' @return A data frame with annotations extracted from a PubMed record. +#' @noRd +.extract_annotations <- function(g){ + + # Extract the PubMed ID (PMID) from the XML record + pm <- xml2::xml_find_all(g, ".//MedlineCitation/PMID") |> xml2::xml_text() + + # Extract MeSH terms (Medical Subject Headings) + meshes <- xml2::xml_find_all(g, ".//DescriptorName") |> xml2::xml_text() + + # Extract chemical substances names + chems <- xml2::xml_find_all(g, ".//NameOfSubstance") |> xml2::xml_text() + + # Extract keywords from the record + keys <- xml2::xml_find_all(g, ".//Keyword") |> xml2::xml_text() + + # Combine the extracted data into a single data frame + # Create separate data frames for MeSH terms, chemical substances, and keywords, and then bind them together + df0 <- rbind( + data.frame(pmid = pm, type = 'MeSH', form = if(length(meshes) > 0){meshes} else{NA}), + data.frame(pmid = pm, type = 'Chemistry', form = if(length(chems) > 0){chems} else{NA}), + data.frame(pmid = pm, type = 'Keyword', form = if(length(keys) > 0){keys} else{NA}) + ) + + # Return the combined annotations data frame + return(df0) +} + + + +#' Reformat Abstract Text +#' +#' Internal function to reformat an abstract by inserting newlines before each section title. +#' It handles abstracts with or without section titles and trims whitespace from each section. +#' Returns NA if the input is NA. +#' +#' @param abstract A character string representing the abstract text. +#' +#' @return A character string of the reformatted abstract with newlines before each section title, or NA if the input is NA. +#' +#' @noRd +.reformat_abstract <- function(abstract) { + if (is.na(abstract)) { + return(NA) + } + + if (!is.character(abstract)) { + stop("Abstract must be a character string.", call. = FALSE) + } + + # Regular expression to match section titles (e.g., "Methodology and Results:") + # This pattern matches 1-3 words, each word starting with an uppercase letter or all words being uppercase + pattern_title <- "(^|\\.\\s+)(([A-Z][a-z]*|[A-Z]+)(\\s([A-Z][a-z]*|[A-Z]+)){0,2}):" + + # Use the pattern to insert a newline before each title and split the abstract into sections + split_abstract <- strsplit(gsub(pattern_title, "\n\\2:", abstract), "\n")[[1]] + + # Combine the sections back into a single string + formatted_abstract <- paste(split_abstract, collapse = "\n") + + return(formatted_abstract) +} \ No newline at end of file diff --git a/R/source_pubtations.R b/R/source_pubtations.R new file mode 100644 index 0000000..0806576 --- /dev/null +++ b/R/source_pubtations.R @@ -0,0 +1,115 @@ +#' Extract Named Entities from PubMed's PubTator3 Tool +#' +#' This function retrieves named entity annotations from PubMed's PubTator3 tool. It fetches data using PubMed IDs and processes the JSON response into a structured format. +#' @param x A vector of PubMed IDs for which annotations are to be retrieved from PubTator. +#' @return A data.table, or NA if no data is available, with columns for PubMed ID, title or abstract location, annotation text, start and end positions of annotations, and annotation types. 
+#' @importFrom jsonlite stream_in
+#' @importFrom data.table rbindlist
+#' @noRd
+#'
+.get_pubtations <- function(x, sleep){
+
+  # PubTator3 API reference:
+  # https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocxml?pmids=29355051&full=true
+
+  # Connect to the PubTator3 API and retrieve annotations as BioC JSON
+  con <- url(paste0("https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids=", paste(x, collapse = ',')))
+
+  # Read the JSON data stream, returning NA on failure
+  mydata <- tryCatch(
+    jsonlite::stream_in(con),
+    error = function(e) NA)
+
+  # Process the data if valid, else return NA
+  if(!is.data.frame(mydata)){jj0 <- NA} else{
+    jj <- list()
+
+    # Iterate over each record to extract and format annotations
+    for(i in 1:nrow(mydata)){
+
+      # Extract annotations for titles and abstracts
+      pb1 <- mydata$passages[[i]]$annotations
+      names(pb1) <- c('title', 'abstract')
+
+      # Process title annotations
+      if(any(nrow(pb1$title) == 0, is.null(nrow(pb1$title)))) {
+        pb1$title <- data.frame(tiab = 'title',
+                                id = NA,
+                                text = NA,
+                                locations = NA,
+                                identifier = NA,
+                                type = NA)
+      } else{
+
+        if (!("identifier" %in% names(pb1[["title"]]$infons))) {
+          pb1[["title"]]$infons$identifier <- NA
+        }
+
+        pb1$title <- cbind(tiab = 'title',
+                           pb1$title[, c('id', 'text', 'locations')],
+                           identifier = pb1$title$infons$identifier,
+                           type = pb1$title$infons$type)
+      }
+
+      # Process abstract annotations
+      if(any(nrow(pb1$abstract) == 0, is.null(nrow(pb1$abstract)))) {
+        pb1$abstract <- data.frame(tiab = 'abstract',
+                                   id = NA,
+                                   text = NA,
+                                   locations = NA,
+                                   identifier = NA,
+                                   type = NA)
+      } else{
+
+        if (!("identifier" %in% names(pb1[["abstract"]]$infons))) {
+          pb1[["abstract"]]$infons$identifier <- NA
+        }
+
+        pb1$abstract <- cbind(tiab = 'abstract',
+                              pb1$abstract[, c('id', 'text', 'locations')],
+                              identifier = pb1$abstract$infons$identifier,
+                              type = pb1$abstract$infons$type)
+      }
+
+      # Combine title and abstract annotations
+      jj[[i]] <- rbind(pb1$title, pb1$abstract)
+    }
+
+    # Require a data.frame for every record; otherwise bail out
+    if (!all(sapply(jj, is.data.frame))) {
+      return(NA)
+    } else {
+      names(jj) <- mydata$id
+      jj0 <- jj |> data.table::rbindlist(idcol = 'pmid')
+    }
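+
+    # PubTator encodes each annotation span as an 'offset,length' string in
+    # 'locations'; the steps below strip non-digits and derive integer start
+    # and end positions, with end = start + length.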
+ } + + + # Clean and format location data + jj0$locations <- jj0$locations |> as.character() + jj0$locations <- gsub("[^[:digit:],]", "", jj0$locations) + + # Extract start and end positions of annotations + start <- NULL + end <- NULL + locations <- NULL + jj0[, c('start', 'length') := data.table::tstrsplit(locations, ",", fixed=TRUE)] + jj0[, start := as.integer(start)] + jj0[, end := start + as.integer(length)] + + # Clean up temporary columns + jj0[, length := NULL] + jj0[, locations := NULL] + } + + Sys.sleep(sleep) + + # Return the processed annotations data + return(jj0) +} diff --git a/R/utils-data-table.R b/R/utils-data-table.R new file mode 100644 index 0000000..d2f2964 --- /dev/null +++ b/R/utils-data-table.R @@ -0,0 +1,12 @@ +# data.table is generally careful to minimize the scope for namespace +# conflicts (i.e., functions with the same name as in other packages); +# a more conservative approach using @importFrom should be careful to +# import any needed data.table special symbols as well, e.g., if you +# run DT[ , .N, by='grp'] in your package, you'll need to add +# @importFrom data.table .N to prevent the NOTE from R CMD check. +# See ?data.table::`special-symbols` for the list of such symbols +# data.table defines; see the 'Importing data.table' vignette for more +# advice (vignette('datatable-importing', 'data.table')). +# +#' @import data.table +NULL diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..7e6e7ae --- /dev/null +++ b/R/utils.R @@ -0,0 +1,54 @@ +#' Fetch Batch of PubMed Records as XML +#' +#' This function attempts to fetch batches of PubMed records in XML format. It retries multiple times in case of failures. +#' @param x A vector of PubMed record identifiers to be fetched. +#' @return A character string with XML content of PubMed records, or an error object in case of failure. +#' @importFrom rentrez entrez_fetch +#' @noRd +#' +#' +.fetch_records <- function(x, sleep) { + # Loop to retry fetching records, with a maximum of 15 attempts + for (i in 1:15) { + # Display the current attempt number + #message(i) + + # Try fetching records using rentrez::entrez_fetch + x1 <- try({ + rentrez::entrez_fetch( + db = "pubmed", + id = x, + rettype = "xml", + parsed = FALSE + ) + }) + + # Wait for 5 seconds before the next attempt + Sys.sleep(sleep) + + # Check if the fetch was successful using inherits(), and if so, break the loop + if (!inherits(x1, "try-error")) { + break + } + } + + # Return the fetched XML content or an error object + doc <- xml2::read_xml(x1) + xml2::xml_find_all(doc, "//PubmedArticle") +} + + + +#' Clean Missing or Invalid Values in Data +#' +#' This function standardizes the representation of missing or invalid values in data by replacing specific character representations of missing data (' ', 'NA', 'n/a', 'n/a.') with R's standard `NA`. +#' @param x A vector that may contain missing or invalid values represented in various formats. +#' @return A vector with standardized missing values represented as `NA`. 
+#' @noRd
+#'
+#'
+.clean_nas <- function(x) {
+
+  # Replace specific character representations of missing data with NA
+  ifelse(x %in% c(' ', 'NA', 'n/a', 'n/a.') | is.na(x), NA, x)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5019560
--- /dev/null
+++ b/README.md
@@ -0,0 +1,107 @@
+[![R build
+status](https://github.com/jaytimm/puremoe/workflows/R-CMD-check/badge.svg)](https://github.com/jaytimm/puremoe/actions)
+
+# puremoe
+
+> **P**ubMed **U**nified **RE**trieval for **M**ulti-**O**utput
+> **E**xploration
+
+An R package that provides a single interface for accessing a range of
+NLM/PubMed databases, including
+[PubMed](https://pubmed.ncbi.nlm.nih.gov/) abstract records,
+[iCite](https://icite.od.nih.gov/) bibliometric data,
+[PubTator3](https://www.ncbi.nlm.nih.gov/research/pubtator3/) named
+entity annotations, and full-text entries from [PubMed
+Central](https://www.ncbi.nlm.nih.gov/pmc/) (PMC). This unified
+interface simplifies the data retrieval process, allowing users to
+interact with multiple PubMed services/APIs/output formats through a
+single R function.
+
+The package also includes MeSH thesaurus resources as simple data
+frames, including Descriptor Terms, Descriptor Tree Structures,
+Supplementary Concept Terms, and Pharmacological Actions, as well as
+descriptor-level word embeddings [(Noh & Kavuluru
+2021)](https://www.sciencedirect.com/science/article/pii/S1532046421001969),
+all built via the
+[mesh-resources](https://github.com/jaytimm/mesh-resources) library.
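+
+These resources load via simple accessor functions; a quick sketch (each
+call downloads its data once and caches it locally via `rappdirs`):
+
+``` r
+th <- puremoe::data_mesh_thesaurus()   ## MeSH + supplemental concept terms
+tr <- puremoe::data_mesh_trees()       ## descriptor tree structures
+pa <- puremoe::data_pharm_action()     ## pharmacological actions
+em <- puremoe::data_mesh_embeddings()  ## descriptor-level word embeddings
+```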
+
+## Installation
+
+You can download the development version from GitHub with:
+
+``` r
+devtools::install_github("jaytimm/puremoe")
+```
+
+## Usage
+
+### PubMed search
+
+The package has two basic functions: `search_pubmed` and `get_records`.
+The former fetches PMIDs from the PubMed API based on a user search; the
+latter retrieves record-level data for those PMIDs from a user-specified
+PubMed endpoint – `pubmed_abstracts`, `pubmed_affiliations`, `pubtations`,
+`icites`, or `pmc_fulltext`.
+
+Search syntax is the same as that implemented in standard [PubMed
+search](https://pubmed.ncbi.nlm.nih.gov/advanced/).
+
+``` r
+pmids <- puremoe::search_pubmed('("political ideology"[TiAb])',
+                                use_pub_years = FALSE)
+
+# pmids <- puremoe::search_pubmed('immunity',
+#                                 use_pub_years = TRUE,
+#                                 start_year = 2022,
+#                                 end_year = 2024)
+```
+
+### Get record-level data
+
+``` r
+pubmed <- pmids |>
+  puremoe::get_records(endpoint = 'pubmed_abstracts',
+                       cores = 3,
+                       sleep = 1)
+
+affiliations <- pmids |>
+  puremoe::get_records(endpoint = 'pubmed_affiliations',
+                       cores = 1,
+                       sleep = 0.5)
+
+icites <- pmids |>
+  puremoe::get_records(endpoint = 'icites',
+                       cores = 3,
+                       sleep = 0.25)
+
+pubtations <- pmids |>
+  puremoe::get_records(endpoint = 'pubtations',
+                       cores = 2)
+```
+
+> When the endpoint is `pmc_fulltext`, the `get_records()` function takes a
+> vector of file paths (from the PMC Open Access list) instead of PMIDs.
+
+``` r
+pmclist <- puremoe::data_pmc_list(force_install = FALSE)
+pmc_pmids <- pmclist[PMID %in% pmids]
+
+pmc_fulltext <- pmc_pmids$fpath[1:5] |>
+  puremoe::get_records(endpoint = 'pmc_fulltext', cores = 2)
+```
+
+## Summary
+
+In short: `search_pubmed()` finds PMIDs, `get_records()` retrieves
+record-level data for them, and a handful of `data_*` functions provide
+locally cached MeSH and PMC reference tables.
diff --git a/man/data_mesh_embeddings.Rd b/man/data_mesh_embeddings.Rd
new file mode 100644
index 0000000..c742a42
--- /dev/null
+++ b/man/data_mesh_embeddings.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_embeddings.R
+\name{data_mesh_embeddings}
+\alias{data_mesh_embeddings}
+\title{Download and Process MeSH and SCR Embeddings}
+\usage{
+data_mesh_embeddings()
+}
+\value{
+A data frame containing the processed MeSH and SCR embeddings data, or NULL if a download fails.
+}
+\description{
+This function downloads MeSH and SCR embeddings data from the specified URLs and processes it for use.
+The data is saved locally in RDS format. If the files do not exist, they will be downloaded and processed.
+}
+\examples{
+\donttest{
+if (interactive()) {
+# Code that downloads data or performs other interactive-only operations
+data <- data_mesh_embeddings()
+}
+}
+
+}
diff --git a/man/data_mesh_thesaurus.Rd b/man/data_mesh_thesaurus.Rd
new file mode 100644
index 0000000..dabdfd0
--- /dev/null
+++ b/man/data_mesh_thesaurus.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_thesaurus.R
+\name{data_mesh_thesaurus}
+\alias{data_mesh_thesaurus}
+\title{Download and Combine MeSH and Supplemental Thesauruses}
+\usage{
+data_mesh_thesaurus(force_download = FALSE)
+}
+\arguments{
+\item{force_download}{A logical value indicating whether to force re-downloading
+of the data even if it already exists locally.}
+}
+\value{
+A data.table containing the combined MeSH and supplemental thesaurus data.
+}
+\description{
+This function downloads and combines the MeSH (Medical Subject Headings) Thesaurus
+and a supplemental concept thesaurus for use in biomedical research and analysis.
+The data is sourced from specified URLs and stored locally for subsequent use.
+}
+\examples{
+\donttest{
+if (interactive()) {
+# Code that downloads data or performs other interactive-only operations
+data <- data_mesh_thesaurus()
+}
+}
+}
diff --git a/man/data_mesh_trees.Rd b/man/data_mesh_trees.Rd
new file mode 100644
index 0000000..74431b1
--- /dev/null
+++ b/man/data_mesh_trees.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_mesh_trees.R
+\name{data_mesh_trees}
+\alias{data_mesh_trees}
+\title{Download and Load MeSH Trees Data}
+\usage{
+data_mesh_trees(force_download = FALSE)
+}
+\arguments{
+\item{force_download}{A logical value indicating whether to force re-downloading
+of the data even if it already exists locally.}
+}
+\value{
+A data frame containing the MeSH Trees data.
+}
+\description{
+This function downloads and loads the MeSH (Medical Subject Headings) Trees data
+from a specified URL. The data is stored locally for future use. If the data already
+exists locally, the download can be skipped unless `force_download` is set to `TRUE`.
+} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_mesh_trees() +} +} +} diff --git a/man/data_pharm_action.Rd b/man/data_pharm_action.Rd new file mode 100644 index 0000000..621ead0 --- /dev/null +++ b/man/data_pharm_action.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_pharm_action.R +\name{data_pharm_action} +\alias{data_pharm_action} +\title{Download and Load Pharmacological Actions Data} +\usage{ +data_pharm_action(force_download = FALSE) +} +\arguments{ +\item{force_download}{A logical value indicating whether to force re-downloading +of the data even if it already exists locally. Default is FALSE.} +} +\value{ +A data frame containing pharmacological actions data. +} +\description{ +This function downloads and loads pharmacological actions data from a specified URL. +The data is stored locally in the user's data directory. If the data file does not +exist locally or if `force_download` is TRUE, it will be downloaded. The function +returns the data as a data frame. +} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_mesh_embeddings() +} +} +} diff --git a/man/data_pmc_list.Rd b/man/data_pmc_list.Rd new file mode 100644 index 0000000..fe4401b --- /dev/null +++ b/man/data_pmc_list.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_pmc_list.R +\name{data_pmc_list} +\alias{data_pmc_list} +\title{Download and Process PMC Open Access File List} +\usage{ +data_pmc_list(force_install = FALSE) +} +\arguments{ +\item{force_install}{Logical, if TRUE, forces the re-download and processing of +the file even if it already exists locally. Default is FALSE.} +} +\value{ +A data frame containing the processed PMC open access file list. +} +\description{ +This function downloads the PubMed Central (PMC) open access file list from the +National Center for Biotechnology Information (NCBI) and processes it for use. +The list is saved locally. If the file does not exist or if `force_install` is TRUE, +it will be downloaded and processed. +} +\examples{ +\donttest{ +if (interactive()) { + # Code that downloads data or performs other interactive-only operations + data <- data_pmc_list() +} +} +} diff --git a/man/dot-extract_basic.Rd b/man/dot-extract_basic.Rd new file mode 100644 index 0000000..c369faa --- /dev/null +++ b/man/dot-extract_basic.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/source_pubmed.R +\name{.extract_basic} +\alias{.extract_basic} +\title{Extract Basic Information from PubMed Records} +\usage{ +.extract_basic(g) +} +\arguments{ +\item{g}{An XML node set representing a single PubMed record.} +} +\value{ +A named vector with basic bibliographic information from a PubMed record. +} +\description{ +An internal function that parses XML records from PubMed. It extracts essential bibliographic information such as PubMed ID, journal title, article title, publication year, and abstract. 
+} diff --git a/man/get_records.Rd b/man/get_records.Rd new file mode 100644 index 0000000..22943f4 --- /dev/null +++ b/man/get_records.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_records.R +\name{get_records} +\alias{get_records} +\title{Retrieve Data from NLM/PubMed databases Based on PMIDs} +\usage{ +get_records( + pmids, + endpoint = c("pubtations", "icites", "pubmed_affiliations", "pubmed_abstracts", "pmc"), + cores = 3, + sleep = 1, + ncbi_key = NULL +) +} +\arguments{ +\item{pmids}{A vector of PMIDs for which data is to be retrieved.} + +\item{endpoint}{A character vector specifying the type of data to retrieve ('pubtations', 'icites', 'affiliations', 'pubmed', 'pmc').} + +\item{cores}{Number of cores to use for parallel processing (default is 3).} + +\item{sleep}{Duration (in seconds) to pause after each batch} + +\item{ncbi_key}{(Optional) NCBI API key for authenticated access.} +} +\value{ +A data.table containing combined results from the specified endpoint. +} +\description{ +This function retrieves different types of data (like PubMed records, affiliations, iCites data, etc.) from PubMed based on provided PMIDs. It supports parallel processing for efficiency. +} +\examples{ +\donttest{ +if (interactive()) { +pmids <- c("38136652", "31345328", "32496629") +results <- get_records(pmids, endpoint = "pubmed_abstracts", cores = 1) +} +} + +} diff --git a/man/search_pubmed.Rd b/man/search_pubmed.Rd new file mode 100644 index 0000000..d8e8882 --- /dev/null +++ b/man/search_pubmed.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_pubmed.R +\name{search_pubmed} +\alias{search_pubmed} +\title{Search PubMed Records} +\usage{ +search_pubmed( + x, + start_year = NULL, + end_year = NULL, + retmax = 9999, + use_pub_years = TRUE +) +} +\arguments{ +\item{x}{Character string, the search query.} + +\item{start_year}{Integer, the start year of publication date range (used if `use_pub_years` is TRUE).} + +\item{end_year}{Integer, the end year of publication date range (used if `use_pub_years` is TRUE).} + +\item{retmax}{Integer, maximum number of records to retrieve, defaults to 9999.} + +\item{use_pub_years}{Logical, whether to filter search by publication years, defaults to TRUE.} +} +\value{ +Numeric vector of unique PubMed IDs. +} +\description{ +Performs a PubMed search based on a query, optionally filtered by publication years. +Returns a unique set of PubMed IDs matching the query. +} +\examples{ +\donttest{ +if (interactive()) { +ethnob1 <- search_pubmed("ethnobotany", 2010, 2012) +ethnob2 <- search_pubmed("ethnobotany", use_pub_years = FALSE) +} +} + +} diff --git a/pubmedtk.Rproj b/pubmedtk.Rproj new file mode 100644 index 0000000..21a4da0 --- /dev/null +++ b/pubmedtk.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source