Create extract_ordo_mappings()

DiseaseOntology · Feb 13, 2024 · b041d85 · b041d85
1 parent 2ac4cb3
commit b041d85
Show file tree

Hide file tree

Showing 6 changed files with 216 additions and 1 deletion.
diff --git a/NAMESPACE b/NAMESPACE
@@ -59,6 +59,7 @@ export(elucidate)
 export(extract_as_tidygraph)
 export(extract_class_axiom)
 export(extract_eq_axiom)
+export(extract_ordo_mappings)
 export(extract_pm_date)
 export(extract_pmid)
 export(extract_subclass_axiom)

diff --git a/NEWS.md b/NEWS.md
@@ -10,7 +10,7 @@ requiring downloads.
 ### New
 * `download_omim()` downloads official API-key requiring files directly from
 OMIM (e.g. mim2gene.txt, phenotypicSeries.txt, etc.).
-
+* `extract_ordo_mappings()` extracts mappings from the Orphanet Rare Disease Ontology (ORDO), either in native format as `oboInOwl:hasDbXref` with Orphanet's text-based predicate modifiers, or as SKOS (supplemented with filler `doid:` predicates where SKOS predicates don't exist).
 
 
 # DO.utils 0.3.1

diff --git a/R/extract.R b/R/extract.R
@@ -441,3 +441,74 @@ extract_as_tidygraph <- function(x, query = NULL, collapse_method = "first",
 
     tg
 }
+
+
+#' Extract mappings from ORDO
+#'
+#' Extract mappings from the Orphanet Rare Disease Ontology (ORDO). ORDO uses
+#' `oboInOwl:hasDbXref` for mapping with annotations to indicate
+#' exact/broad/narrow-ness. Utilizes [robot()].
+#'
+#' @param ordo_path The path to the ORDO file, as a string.
+#' @param as_skos Whether to convert ORDO's annotated `oboInOwl:hasDbXref`
+#' mappings to their
+#' [Simple Knowledge Organization System (SKOS)](https://www.w3.org/TR/2009/REC-skos-reference-20090818/)
+#' equivalents, as a boolean (default: `TRUE`).
+#'
+#' The ORDO-skos equivalent predicates are as follows:
+#'
+#' * `"BTNT"` - `skos:narrowMatch`
+#' * `"NTBT"` - `skos:broadMatch`
+#' * `"E"` - `skos:exactMatch`
+#' * `"ND"` - `doid:undefinedMatch` (supplements SKOS)
+#' * `"W"` - `doid:notMatch` (supplements SKOS)
+#'
+#' @param output The path where output will be written, as a string. If `NULL`
+#' (default), the data will be read into R and not saved to a file. Arguments
+#' in `...` are only used when `output` is `NULL`.
+#' @inheritDotParams tidy_sparql -query_res
+#' @returns
+#' If `output` is specified, the path to the output file with the data,
+#' otherwise, a `tibble` with the data.
+#'
+#' ORDO mappings data will be formatted according to the
+#' [SSSOM](https://github.com/mapping-commons/sssom) specification,
+#' with an additional `status` column indicating the status (active, deprecated,
+#' etc.) of each ORPHA term.
+#'
+#' If `as_skos = FALSE`, ORDO's text-based `oboInOwl:hasDbXref` annotations
+#' denoting the type of relationship the Xref represents (simple text code only)
+#' will be included in the `predicate_modifier` column.
+#'
+#' @export
+extract_ordo_mappings <- function(ordo_path, as_skos = TRUE, output = NULL,
+                                  ...) {
+    # choose the packaged SPARQL query matching the requested predicate style
+    q_nm <- if (isTRUE(as_skos)) "mapping-ordo-skos.rq" else "mapping-ordo.rq"
+    q_file <- system.file("sparql", q_nm, package = "DO.utils", mustWork = TRUE)
+
+    # without `output`, query into a temp file that is removed on exit
+    to_stdout <- is.null(output)
+    if (to_stdout) {
+        output <- tmp_out <- tempfile(fileext = ".tsv")
+        # add = TRUE: register cleanup without replacing other exit handlers
+        on.exit(unlink(tmp_out), add = TRUE)
+    }
+
+    # run the packaged query against the ORDO file via robot_query()
+    robot_query(input = ordo_path, query = q_file, output)
+
+    if (to_stdout) {
+        # read every column as character, then tidy the SPARQL result
+        out <- readr::read_tsv(
+            output,
+            col_types = readr::cols(.default = readr::col_character())
+        )
+        out <- tidy_sparql(out, ...)
+    } else {
+        # results stay on disk; hand back the path (`...` is unused here)
+        out <- output
+    }
+
+    out
+}
diff --git a/inst/sparql/mapping-ordo-skos.rq b/inst/sparql/mapping-ordo-skos.rq
@@ -0,0 +1,48 @@
+# SPARQL query: Extract all ORDO mappings to SSSOM (with SKOS-conversion)
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+PREFIX ORDO: <http://www.orpha.net/ORDO/Orphanet_>
+PREFIX doid: <http://purl.obolibrary.org/obo/doid#>
+
+SELECT ?subject_id ?subject_label ?predicate_id ?object_id ?status
+WHERE {
+    ?class skos:notation ?subject_id ;
+        rdfs:label ?subject_label ;
+        oboInOwl:hasDbXref ?object_id .
+    FILTER(STRSTARTS(?subject_id, "ORPHA:"))
+
+    OPTIONAL { # translate the xref's annotation code; ?predicate_id stays unbound if none matches
+        ?xref_annot owl:annotatedSource ?class ;
+            owl:annotatedProperty oboInOwl:hasDbXref ;
+            owl:annotatedTarget ?object_id ;
+            obo:ECO_0000218 ?xref_eco .
+
+        BIND(
+            REPLACE(
+                str(?xref_eco), # convert to string to avoid possible language tag mismatches
+                ".*(E|NTBT|BTNT|W|ND)(.|\\n)*", # capture the mapping code, match the rest of the text
+                "$1"
+            ) AS ?xref_type
+        )
+
+        VALUES (?xref_type ?predicate_id) { # annotation code -> mapping predicate
+            ("E" skos:exactMatch)
+            ("BTNT" skos:narrowMatch)
+            ("NTBT" skos:broadMatch)
+            ("ND" doid:undefinedMatch)
+            ("W" doid:notMatch)
+        }
+    }
+
+    OPTIONAL { # ?inactive: ?class itself or an ancestor that is a direct subclass of ORDO:C041
+        ?class rdfs:subClassOf* ?inactive .
+        ?inactive rdfs:subClassOf ORDO:C041 ; # presumably the root of inactive entities -- confirm
+            rdfs:label ?inactive_label .
+    }
+
+    # status: label of the matched ORDO:C041 subclass, or "active" when none matched
+    BIND( if( BOUND( ?inactive_label ), ?inactive_label, "active") AS ?status)
+}
diff --git a/inst/sparql/mapping-ordo.rq b/inst/sparql/mapping-ordo.rq
@@ -0,0 +1,40 @@
+# SPARQL query: Extract all ORDO mappings to SSSOM
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX obo: <http://purl.obolibrary.org/obo/>
+PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
+PREFIX ORPHA: <http://www.orpha.net/ORDO/Orphanet_>
+
+SELECT ?subject_id ?subject_label ?predicate_id ?predicate_modifier ?object_id ?status
+WHERE {
+    VALUES ?predicate_id { oboInOwl:hasDbXref } # constant predicate, also emitted as a column
+    ?class skos:notation ?subject_id ;
+        rdfs:label ?subject_label ;
+        ?predicate_id ?object_id .
+    FILTER(STRSTARTS(?subject_id, "ORPHA:"))
+
+    OPTIONAL { # attach the xref annotation's mapping code when the xref is annotated
+        ?xref_annot owl:annotatedSource ?class ;
+            owl:annotatedProperty ?predicate_id ;
+            owl:annotatedTarget ?object_id ;
+            obo:ECO_0000218 ?xref_eco .
+
+        BIND(
+            REPLACE(
+                str(?xref_eco), # convert to string to avoid possible language tag mismatches
+                ".*(E|NTBT|BTNT|W|ND)(.|\\n)*", # capture the mapping code, match the rest of the text
+                "$1"
+            ) AS ?predicate_modifier
+        )
+    }
+
+    OPTIONAL { # ?inactive: ?class itself or an ancestor that is a direct subclass of ORPHA:C041
+        ?class rdfs:subClassOf* ?inactive .
+        ?inactive rdfs:subClassOf ORPHA:C041 ; # presumably the root of inactive entities -- confirm
+            rdfs:label ?inactive_label .
+    }
+
+    # status: label of the matched ORPHA:C041 subclass, or "active" when none matched
+    BIND( if( BOUND( ?inactive_label ), ?inactive_label, "active") AS ?status)
+}
diff --git a/man/extract_ordo_mappings.Rd b/man/extract_ordo_mappings.Rd