Add example lexica

biopragmatics · Jan 30, 2024 · f537255 · f537255
1 parent 024de00
commit f537255
Show file tree

Hide file tree

Showing 13 changed files with 160 additions and 103 deletions.
diff --git a/lexica/cell/README.md b/lexica/cell/README.md
@@ -0,0 +1,2 @@
+# Cell and Cell Line Lexicon
+
diff --git a/lexica/cell/generate.py b/lexica/cell/generate.py
@@ -0,0 +1,74 @@
+import semra
+import biolexica
+from pathlib import Path
+
+HERE = Path(__file__).parent.resolve()
+TERMS_PATH = HERE.joinpath("terms.tsv.gz")
+
+PRIORITY = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
+BIOLEXICA_CONFIG = [
+    biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells
+    biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
+    biolexica.Input(source="cellosaurus", processor="pyobo"),
+    # biolexica.Input(source="ccle", processor="pyobo"),
+    biolexica.Input(source="bto", processor="pyobo"),
+    biolexica.Input(source="cl", processor="pyobo"),
+    biolexica.Input(source="clo", processor="pyobo"),
+]
+
+SEMRA_CONFIG = semra.Configuration(
+    name="Cell and Cell Line Mappings",
+    description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario "
+    "posed in the Biomappings paper, this configuration imports several different cell and "
+    "cell line resources and identifies mappings between them.",
+    inputs=[
+        semra.Input(source="biomappings"),
+        semra.Input(source="gilda"),
+        semra.Input(prefix="cellosaurus", source="pyobo", confidence=0.99),
+        semra.Input(prefix="bto", source="bioontologies", confidence=0.99),
+        semra.Input(prefix="cl", source="bioontologies", confidence=0.99),
+        semra.Input(prefix="clo", source="custom", confidence=0.99),
+        semra.Input(prefix="efo", source="pyobo", confidence=0.99),
+        semra.Input(
+            prefix="depmap",
+            source="pyobo",
+            confidence=0.99,
+            extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"},
+        ),
+        semra.Input(
+            prefix="ccle",
+            source="pyobo",
+            confidence=0.99,
+            extras={"version": "2019"},
+        ),
+    ],
+    add_labels=False,
+    priority=PRIORITY,
+    keep_prefixes=PRIORITY,
+    remove_imprecise=False,
+    mutations=[
+        semra.Mutation(source="efo", confidence=0.7),
+        semra.Mutation(source="bto", confidence=0.7),
+        semra.Mutation(source="cl", confidence=0.7),
+        semra.Mutation(source="clo", confidence=0.7),
+        semra.Mutation(source="depmap", confidence=0.7),
+        semra.Mutation(source="ccle", confidence=0.7),
+        semra.Mutation(source="cellosaurus", confidence=0.7),
+    ],
+    raw_pickle_path=HERE.joinpath("mappings_raw.pkl.gz"),
+    processed_pickle_path=HERE.joinpath("mappings_processed.pkl.gz"),
+    priority_pickle_path=HERE.joinpath("mappings_prioritized.pkl"),
+)
+
+
+def _main() -> None:
+    """Get the cell/cell line mappings, assemble the lexicon, and write it to TERMS_PATH."""
+    biolexica.assemble_terms(
+        inputs=BIOLEXICA_CONFIG,
+        mappings=SEMRA_CONFIG.get_mappings(),
+        output_path=TERMS_PATH,
+    )
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/lexica/cell/mappings_prioritized.pkl b/lexica/cell/mappings_prioritized.pkl
diff --git a/lexica/cell/mappings_processed.pkl.gz b/lexica/cell/mappings_processed.pkl.gz
diff --git a/lexica/cell/mappings_raw.pkl.gz b/lexica/cell/mappings_raw.pkl.gz
diff --git a/lexica/cell/terms.tsv.gz b/lexica/cell/terms.tsv.gz
diff --git a/lexica/phenotype/generate.py b/lexica/phenotype/generate.py
@@ -0,0 +1,70 @@
+import semra
+import biolexica
+from pathlib import Path
+
+HERE = Path(__file__).parent.resolve()
+TERMS_PATH = HERE.joinpath("terms.tsv.gz")
+
+PRIORITY = [
+    "doid",
+    "mondo",
+    "hp",
+    "symp",
+    "mesh",
+    "efo",
+]
+BIOLEXICA_CONFIG = [
+    biolexica.Input(source="doid", processor="pyobo"),
+    biolexica.Input(source="mondo", processor="pyobo"),
+    biolexica.Input(source="hp", processor="pyobo"),
+    biolexica.Input(source="symp", processor="pyobo"),
+    # TODO get subsets of MeSH (C for diseases, F for Psychiatry/Psychology,
+    #  and maybe others. See https://meshb.nlm.nih.gov/treeView)
+    biolexica.Input(source="mesh", processor="pyobo"),
+    biolexica.Input(source="efo", processor="pyobo"),  # TODO find subset of EFO
+    # biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
+    # biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
+]
+
+SEMRA_CONFIG = semra.Configuration(
+    name="Disease and Phenotype Mappings",
+    description="This configuration imports several different disease, phenotype, and "
+    "symptom resources (DOID, MONDO, HP, SYMP, MeSH, and EFO) and identifies mappings "
+    "between them.",
+    inputs=[
+        semra.Input(source="biomappings"),
+        semra.Input(source="gilda"),
+        semra.Input(prefix="doid", source="pyobo", confidence=0.99),
+        semra.Input(prefix="mondo", source="pyobo", confidence=0.99),
+        semra.Input(prefix="hp", source="pyobo", confidence=0.99),
+        semra.Input(prefix="symp", source="pyobo", confidence=0.99),
+        semra.Input(prefix="mesh", source="pyobo", confidence=0.99),
+        semra.Input(prefix="efo", source="pyobo", confidence=0.99),
+    ],
+    add_labels=False,
+    priority=PRIORITY,
+    keep_prefixes=PRIORITY,
+    remove_imprecise=False,
+    mutations=[
+        semra.Mutation(source="doid", confidence=0.7),
+        semra.Mutation(source="mondo", confidence=0.7),
+        semra.Mutation(source="hp", confidence=0.7),
+        semra.Mutation(source="symp", confidence=0.7),
+    ],
+    raw_pickle_path=HERE.joinpath("mappings_raw.pkl.gz"),
+    processed_pickle_path=HERE.joinpath("mappings_processed.pkl.gz"),
+    priority_pickle_path=HERE.joinpath("mappings_prioritized.pkl"),
+)
+
+
+def _main() -> None:
+    """Get the configured mappings, assemble the phenotype lexicon, and write it to TERMS_PATH."""
+    biolexica.assemble_terms(
+        inputs=BIOLEXICA_CONFIG,
+        mappings=SEMRA_CONFIG.get_mappings(),
+        output_path=TERMS_PATH,
+    )
+
+
+if __name__ == "__main__":
+    _main()
diff --git a/lexica/phenotype/mappings_prioritized.pkl b/lexica/phenotype/mappings_prioritized.pkl
diff --git a/lexica/phenotype/mappings_processed.pkl.gz b/lexica/phenotype/mappings_processed.pkl.gz
diff --git a/lexica/phenotype/mappings_raw.pkl.gz b/lexica/phenotype/mappings_raw.pkl.gz
diff --git a/lexica/phenotype/terms.tsv.gz b/lexica/phenotype/terms.tsv.gz
diff --git a/setup.cfg b/setup.cfg
@@ -51,14 +51,11 @@ keywords =
 
 [options]
 install_requires =
-    # Missing itertools from the standard library you didn't know you needed
-    more_itertools
-    # Use progress bars excessively
-    tqdm
-    # Command line tools
-    click
-    more_click
-    # TODO your requirements go here
+    semra
+    gilda
+    bioregistry
+    pyobo
+    bioontologies
 
 # Random options
 zip_safe = false

diff --git a/src/biolexica/api.py b/src/biolexica/api.py
@@ -50,20 +50,17 @@ def assemble_terms(
     mappings: Optional[List["semra.Mapping"]] = None,
     *,
     include_biosynonyms: bool = True,
+    output_path: Optional[Path] = None,
 ) -> list[gilda.Term]:
     """Assemble terms from multiple resources."""
     terms = []
     for inp in inputs:
         if inp.processor in {"pyobo", "bioontologies"}:
             terms.extend(
-                iter_terms_by_prefix(
-                    inp.source, ancestors=inp.ancestors, processor=inp.processor
-                )
+                iter_terms_by_prefix(inp.source, ancestors=inp.ancestors, processor=inp.processor)
             )
         elif inp.processor == "biosynonyms":
-            terms.extend(
-                s.as_gilda_term() for s in biosynonyms.parse_synonyms(inp.source)
-            )
+            terms.extend(s.as_gilda_term() for s in biosynonyms.parse_synonyms(inp.source))
         else:
             raise ValueError(f"Unknown processor {inp.processor}")
 
@@ -75,6 +72,9 @@ def assemble_terms(
 
         terms = update_terms(terms, mappings)
 
+    if output_path is not None:
+        gilda.term.dump_terms(terms, output_path)
+
     return terms
 
 
@@ -101,9 +101,7 @@ def iter_terms_by_prefix(
         raise ValueError(f"Unknown processor: {processor}")
 
 
-def _get_pyobo_subset_terms(
-    source: str, ancestors: str | list[str]
-) -> Iterable[gilda.Term]:
+def _get_pyobo_subset_terms(source: str, ancestors: str | list[str]) -> Iterable[gilda.Term]:
     from pyobo.gilda_utils import get_gilda_terms
 
     subset = {
@@ -141,17 +139,11 @@ def _get_bioontologies_subset_terms(
             graph.add_edge(edge.subject.curie, edge.object.curie)
 
     descendant_curies = {
-        descendant
-        for c in _ensure_list(parent_curie)
-        for descendant in nx.ancestors(graph, c)
+        descendant for c in _ensure_list(parent_curie) for descendant in nx.ancestors(graph, c)
     }
 
     for node in tqdm(obograph.nodes, leave=False):
-        if (
-            not node.name
-            or node.reference is None
-            or node.reference.curie not in descendant_curies
-        ):
+        if not node.name or node.reference is None or node.reference.curie not in descendant_curies:
             continue
         yield gilda.Term(
             norm_text=normalize(node.name),
@@ -172,81 +164,3 @@ def _get_bioontologies_subset_terms(
                 status="synonym",
                 source=source,
             )
-
-
-def _main() -> None:
-    import pystow
-    import semra
-
-    PRIORITY = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
-    MODULE = pystow.module("semra", "case-studies", "cancer-cell-lines")
-    PRIORITY_SSSOM_PATH = MODULE.join(name="priority.sssom.tsv")
-
-    biolexica_input = [
-        Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells
-        Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
-        Input(source="cellosaurus", processor="pyobo"),
-        # Input(source="ccle", processor="pyobo"),
-        Input(source="bto", processor="pyobo"),
-        Input(source="cl", processor="pyobo"),
-        Input(source="clo", processor="pyobo"),
-    ]
-
-    semra_config = semra.Configuration(
-        name="Cell and Cell Line Mappings",
-        description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario "
-        "posed in the Biomappings paper, this configuration imports several different cell and "
-        "cell line resources and identifies mappings between them.",
-        inputs=[
-            semra.Input(source="biomappings"),
-            semra.Input(source="gilda"),
-            semra.Input(prefix="cellosaurus", source="pyobo", confidence=0.99),
-            semra.Input(prefix="bto", source="bioontologies", confidence=0.99),
-            semra.Input(prefix="cl", source="bioontologies", confidence=0.99),
-            semra.Input(prefix="clo", source="custom", confidence=0.99),
-            semra.Input(prefix="efo", source="pyobo", confidence=0.99),
-            semra.Input(
-                prefix="depmap",
-                source="pyobo",
-                confidence=0.99,
-                extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"},
-            ),
-            semra.Input(
-                prefix="ccle",
-                source="pyobo",
-                confidence=0.99,
-                extras={"version": "2019"},
-            ),
-        ],
-        add_labels=False,
-        priority=PRIORITY,
-        keep_prefixes=PRIORITY,
-        remove_imprecise=False,
-        mutations=[
-            semra.Mutation(source="efo", confidence=0.7),
-            semra.Mutation(source="bto", confidence=0.7),
-            semra.Mutation(source="cl", confidence=0.7),
-            semra.Mutation(source="clo", confidence=0.7),
-            semra.Mutation(source="depmap", confidence=0.7),
-            semra.Mutation(source="ccle", confidence=0.7),
-            semra.Mutation(source="cellosaurus", confidence=0.7),
-        ],
-        raw_pickle_path=MODULE.join(name="raw.pkl"),
-        raw_sssom_path=MODULE.join(name="raw.sssom.tsv"),
-        raw_neo4j_path=MODULE.join("neo4j_raw"),
-        processed_pickle_path=MODULE.join(name="processed.pkl"),
-        processed_sssom_path=MODULE.join(name="processed.sssom.tsv"),
-        processed_neo4j_path=MODULE.join("neo4j"),
-        processed_neo4j_name="semra-cell",
-        priority_pickle_path=MODULE.join(name="priority.pkl"),
-        priority_sssom_path=PRIORITY_SSSOM_PATH,
-    )
-
-    mappings = semra_config.get_mappings()
-
-    terms = assemble_terms(inputs=biolexica_input, mappings=mappings)
-    gilda.term.dump_terms(terms, HERE.joinpath("terms.tsv.gz"))
-
-
-if __name__ == "__main__":
-    _main()