Update read_omim() to work for key-req downloads

Only phenotypicSeries.txt has a new class (omim_PS_complete) so far.
DiseaseOntology · Feb 12, 2024 · 259d5c5 · 259d5c5
1 parent cf20a4a
commit 259d5c5
Show file tree

Hide file tree

Showing 7 changed files with 107 additions and 21 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DO.utils
 Type: Package
 Title: Public Resource Utilities Designed by/for the Disease Ontology
-Version: 0.3.1
+Version: 0.3.1.9000
 Author: J. Allen Baron
 Maintainer: J. Allen Baron <allenbaron@som.umaryland.edu>
 Description: Generally useful tools to assess use of public resources in

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,18 @@
+# DO.utils (development version)
+
+## DO Management & Analysis
+
+### Updated
+* `read_omim()` now additionally parses official phenotypicSeries.txt
+downloads that require an API key, and may be able to handle other downloads
+that require an API key.
+
+### New
+* `download_omim()` downloads official files that require an API key directly
+from OMIM (e.g. mim2gene.txt, phenotypicSeries.txt, etc.).
+
+
+
 # DO.utils 0.3.1
 
 ## Dependency Update

diff --git a/R/read.R b/R/read.R
@@ -116,19 +116,22 @@ read_ga <- function(ga_file, read_all = FALSE, tidy = TRUE, keep_total = FALSE,
 }
 
 
-#' Read Data from omim.org
+#' Read OMIM Data
 #'
-#' Reads and formats OMIM data copied or downloaded from https://omim.org/
-#' and appends columns to speed up subsequent curation activities.
+#' Reads and formats OMIM data copied or manually downloaded from
+#' https://omim.org/, or downloaded with [download_omim()] (permission
+#' required), and appends columns to speed up subsequent curation activities.
 #'
-#' @section Input Requirements:
+#' @section Manual Input Requirements:
 #' The `file` with OMIM data copied or downloaded must include headers at the
-#' top. These data can be left _as pasted_ even if they are not formatted
-#' correctly, as `read_omim()` will process and correct headers, which includes
-#' fixing multi-line or misarranged column headers, and will trim whitespace.
+#' top. These data can be left _as copied & pasted from omim.org_ even if they
+#' are not formatted correctly, as `read_omim()` will process and correct
+#' headers, which includes fixing multi-line or misarranged column headers,
+#' and will trim whitespace.
 #'
-#' @param file The path to a .tsv or .csv file (possibly compressed) with data
-#'     from https://omim.org/. See "Input Requirements" for details.
+#' @param file The path to a file (possibly compressed) with data copy/pasted or
+#' manually downloaded from https://omim.org/ (see "Manual Input Requirements"
+#' for details), or downloaded with [download_omim()].
 #' @param keep_mim \[**OMIM search data only**\] The MIM symbols representing
 #' the data types to keep, as a character vector, or `NULL` to retain all
 #' (default: `"#"` and `"%"`).

diff --git a/R/read_helpers.R b/R/read_helpers.R
@@ -15,9 +15,48 @@ preprocess_omim_dl <- function(file, ...) {
             stringr::regex("copyright.*omim", ignore_case = TRUE)
         )
     )
+    # "generated" files are those downloaded programmatically
+    was_generated <- any(
+        stringr::str_detect(
+            utils::head(.lines, 10),
+            stringr::regex("generated", ignore_case = TRUE)
+        )
+    )
+    if (is_official && was_generated) {
+        # get header (last commented out line)
+        header_n <- which(stringr::str_detect(.lines, "^[^#]"))[1] - 1
+        header <- .lines[header_n] %>%
+            stringr::str_remove("^# *") %>%
+            stringr::str_split_1("\t")
 
-    if (is_official) {
-        # determine official download type: search, PS, or PS_titles
+        # determine official download type: PS_complete, etc.
+        ps_complete_col_nm <- c("Phenotypic Series Number", "MIM Number", "Phenotype")
+        if (isTRUE(all(header %in% ps_complete_col_nm))) {
+            dl_type <- "PS_complete"
+
+            # fix lines with PS labels - add tab to push label to Phenotype col
+            pos_replace <- dplyr::if_else(
+                stringr::str_detect(.lines, "^PS[0-9]+\t[0-9]{6}"),
+                "\\1\t",
+                "\\1\t\t"
+            )
+            .lines <- stringr::str_replace(
+                .lines,
+                "^(PS[0-9]+)\t",
+                pos_replace
+            )
+        } else {
+            dl_type <- NA
+        }
+
+        df <- readr::read_tsv(
+            file = I(.lines),
+            comment = "#",
+            col_names = header,
+            show_col_types = FALSE
+        )
+    } else if (is_official) {
+        # determine official, manual download type: search, PS, or PS_titles
         dl_type <- stringr::str_extract(
             .lines[1],
             stringr::regex(

diff --git a/man/read_omim.Rd b/man/read_omim.Rd
diff --git a/tests/testthat/data/omim/omim-ps_complete.txt b/tests/testthat/data/omim/omim-ps_complete.txt
@@ -0,0 +1,7 @@
+# Copyright (c) 1966-2024 Johns Hopkins University. Use of this file adheres to the terms specified at https://omim.org/help/agreement
+# Generated: 2024-02-09
+# Phenotypic Series Number	MIM Number	Phenotype
+PS100070	Aortic aneurysm, familial abdominal
+PS100070	100070	Aortic aneurysm, familial abdominal 1
+PS100070	609782	Aortic aneurysm, familial abdominal 2
+PS100070	611891	{Aneurysm, familial abdominal 3}
diff --git a/tests/testthat/test-read.R b/tests/testthat/test-read.R
@@ -146,3 +146,22 @@ test_that("read_omim() works for COPIED data (PS or with entry info)", {
     expect_equal(read_omim("data/omim/omim-ps_cp-entry_page.csv"), ps_df_cp)
     # expect_snapshot(read_omim("data/omim/omim-ps_cp-entry_page_w_ps.csv")) # not supported
 })
+
+test_that("read_omim() works for key-required phenotypicSeries.txt downloads", {
+    expected <- structure(  # hand-built tibble mirroring data/omim/omim-ps_complete.txt
+        list(
+            phenotypic_series_number = c("PS100070", "PS100070", "PS100070",
+                                         "PS100070"),
+            mim_number = c(NA, 100070, 609782, 611891),  # series-title row has no MIM number
+            phenotype = c("Aortic aneurysm, familial abdominal",  # series title (first fixture row)
+                          "Aortic aneurysm, familial abdominal 1",
+                          "Aortic aneurysm, familial abdominal 2",
+                          "{Aneurysm, familial abdominal 3}")
+        ),
+        row.names = c(NA, -4L),  # four data rows in the fixture
+        class = c("omim_PS_complete", "omim_tbl", "spec_tbl_df", "tbl_df",  # download-type class first
+                  "tbl", "data.frame")
+    )
+
+    expect_equal(read_omim("data/omim/omim-ps_complete.txt"), expected)
+})