readme edit

jaytimm · Apr 10, 2024 · 9788d08 · 9788d08
1 parent 98fe585
commit 9788d08
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 15 deletions.
diff --git a/R/source_pubtations.R b/R/source_pubtations.R
@@ -1,6 +1,6 @@
-#' Extract Records from PubMed's PubTator Tool
+#' Extract Named Entities from PubMed's PubTator3 Tool
 #'
-#' This function retrieves annotated bibliographic data from PubMed's PubTator tool. It fetches data using PubMed IDs and processes the JSON response into a structured format.
+#' This function retrieves named entity annotations from PubMed's PubTator3 tool. It fetches data using PubMed IDs and processes the JSON response into a structured format.
 #' @param x A vector of PubMed IDs for which annotations are to be retrieved from PubTator.
 #' @return A data.table, or NA if no data is available, with columns for PubMed ID, title or abstract location, annotation text, start and end positions of annotations, and annotation types.
 #' @importFrom jsonlite stream_in
@@ -9,13 +9,23 @@
 #' 
 .get_pubtations <- function(x, sleep){
 
+  # x <- batches[[3]]
+  # https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocxml?pmids=29355051&full=true
+
   # Connect to PubTator API and retrieve data
-  con <- url(paste0("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=", paste(x, collapse = ',')))
+  # con <- url(paste0("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocjson?pmids=", paste(x, collapse = ',')))
+
+  con <- url(paste0("https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids=", paste(x, collapse = ',')))
 
   # Read JSON data stream, handling errors with NA
-  mydata <- tryCatch(jsonlite::stream_in(gzcon(con)), error = function(e) NA)  
+  # mydata <- tryCatch(jsonlite::stream_in(gzcon(con)), error = function(e) NA)  
+  mydata <- tryCatch(
+    jsonlite::stream_in(con), 
+    error = function(e) NA)  
+
 
   # Process the data if valid, else return NA
+  # i = 1
   if(length(mydata) == 1){jj0 <- NA} else{
     jj <- list()
 
@@ -35,6 +45,11 @@
                                 identifier = NA, 
                                 type = NA)
       } else{
+
+        if (!("identifier" %in% names(pb1[["title"]]$infons))) {
+          pb1[["title"]]$infons$identifier <- NA
+        }
+
         pb1$title <- cbind(tiab = 'title', 
                            pb1$title[, c('id', 'text', 'locations')], 
                            identifier = pb1$title$infons$identifier, 
@@ -50,6 +65,11 @@
                                    identifier = NA, 
                                    type = NA)
       } else{
+
+        if (!("identifier" %in% names(pb1[["abstract"]]$infons))) {
+          pb1[["abstract"]]$infons$identifier <- NA
+        }
+
         pb1$abstract <- cbind(tiab = 'abstract', 
                               pb1$abstract[, c('id', 'text', 'locations')], 
                               identifier = pb1$abstract$infons$identifier, 

diff --git a/R/utils.R b/R/utils.R
@@ -11,7 +11,7 @@
   # Loop to retry fetching records, with a maximum of 15 attempts
   for (i in 1:15) {
     # Display the current attempt number
-    message(i)
+    #message(i)
 
     # Try fetching records using rentrez::entrez_fetch
     x1 <- try({

diff --git a/README.md b/README.md
@@ -10,17 +10,17 @@ An R package that provides a single interface for accessing a range of
 NLM/PubMed databases, including
 [PubMed](https://pubmed.ncbi.nlm.nih.gov/) abstract records,
 [iCite](https://icite.od.nih.gov/) bibliometric data,
-[PubTator](https://www.ncbi.nlm.nih.gov/research/pubtator/) named entity
-annotations, and full-text entries from [PubMed
+[PubTator3](https://www.ncbi.nlm.nih.gov/research/pubtator3/) named
+entity annotations, and full-text entries from [PubMed
 Central](https://www.ncbi.nlm.nih.gov/pmc/) (PMC). This unified
 interface simplifies the data retrieval process, allowing users to
 interact with multiple PubMed services/APIs/output formats through a
 single R function.
 
-The package also includes MeSH ontology resources as simple data frames,
-including Descriptor Terms, Descriptor Tree Structures, Supplementary
-Concept Terms, and Pharmacological Actions; it also includes
-descriptor-level word embeddings [(Noh & Kavuluru
+The package also includes MeSH thesaurus resources as simple data
+frames, including Descriptor Terms, Descriptor Tree Structures,
+Supplementary Concept Terms, and Pharmacological Actions; it also
+includes descriptor-level word embeddings [(Noh & Kavuluru
 2021)](https://www.sciencedirect.com/science/article/pii/S1532046421001969).
 These resources are provided via the
 [mesh-resources](https://github.com/jaytimm/mesh-resources) library.
@@ -61,21 +61,22 @@ pmids <- puremoe::search_pubmed('("political ideology"[TiAb])',
 ``` r
 pubmed <- pmids |> 
   puremoe::get_records(endpoint = 'pubmed_abstracts', 
-                       cores = 1, 
+                       cores = 3, 
                        sleep = 1) 
 
 affiliations <- pmids |> 
   puremoe::get_records(endpoint = 'pubmed_affiliations', 
-                       cores = 3, 
+                       cores = 1, 
                        sleep = 0.5)
 
 icites <- pmids |>
   puremoe::get_records(endpoint = 'icites',
-                       cores = 4,
+                       cores = 3,
                        sleep = 0.25)
 
 pubtations <- pmids |> 
-  puremoe::get_records(endpoint = 'pubtations')
+  puremoe::get_records(endpoint = 'pubtations',
+                       cores = 2)
 ```
 
 > When the endpoint is PMC, the `get_records()` function takes a vector
@@ -88,3 +89,5 @@ pmc_pmids <- pmclist[PMID %in% pmids]
 pmc_fulltext <- pmc_pmids$fpath[1:5] |> 
   puremoe::get_records(endpoint = 'pmc_fulltext', cores = 2)
 ```
+
+## Summary