Skip to content

Commit

Permalink
Add example lexica
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Jan 30, 2024
1 parent 024de00 commit f537255
Show file tree
Hide file tree
Showing 13 changed files with 160 additions and 103 deletions.
2 changes: 2 additions & 0 deletions lexica/cell/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Cell and Cell Line Lexicon

74 changes: 74 additions & 0 deletions lexica/cell/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import semra
import biolexica
from pathlib import Path

# Directory containing this script; every path below is built relative to it.
HERE = Path(__file__).parent.resolve()
# Passed as ``output_path`` to ``biolexica.assemble_terms`` in ``_main``.
TERMS_PATH = HERE.joinpath("terms.tsv.gz")

# Prefix list reused as both ``priority`` and ``keep_prefixes`` in SEMRA_CONFIG.
PRIORITY = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
# Term sources passed as ``inputs`` to ``biolexica.assemble_terms`` in ``_main``.
# Only the mesh and efo entries restrict themselves via ``ancestors``.
BIOLEXICA_CONFIG = [
biolexica.Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]), # cells
biolexica.Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
biolexica.Input(source="cellosaurus", processor="pyobo"),
# NOTE(review): the ccle input is disabled here although "ccle" is still in
# PRIORITY and SEMRA_CONFIG; the reason is not recorded - confirm.
# biolexica.Input(source="ccle", processor="pyobo"),
biolexica.Input(source="bto", processor="pyobo"),
biolexica.Input(source="cl", processor="pyobo"),
biolexica.Input(source="clo", processor="pyobo"),
]

# SeMRA configuration for the cell and cell line lexicon. ``_main`` calls
# ``SEMRA_CONFIG.get_mappings()`` and hands the result to
# ``biolexica.assemble_terms`` as ``mappings``.
SEMRA_CONFIG = semra.Configuration(
name="Cell and Cell Line Mappings",
description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario "
"posed in the Biomappings paper, this configuration imports several different cell and "
"cell line resources and identifies mappings between them.",
inputs=[
# The first two inputs name only a source; the rest pair a prefix with the
# source used to load it and a confidence of 0.99.
semra.Input(source="biomappings"),
semra.Input(source="gilda"),
semra.Input(prefix="cellosaurus", source="pyobo", confidence=0.99),
semra.Input(prefix="bto", source="bioontologies", confidence=0.99),
semra.Input(prefix="cl", source="bioontologies", confidence=0.99),
semra.Input(prefix="clo", source="custom", confidence=0.99),
semra.Input(prefix="efo", source="pyobo", confidence=0.99),
# depmap and ccle are the only inputs that pass ``extras`` (including a
# pinned ``version``).
semra.Input(
prefix="depmap",
source="pyobo",
confidence=0.99,
extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"},
),
semra.Input(
prefix="ccle",
source="pyobo",
confidence=0.99,
extras={"version": "2019"},
),
],
add_labels=False,
# The same PRIORITY list is used for prioritization and for prefix filtering.
priority=PRIORITY,
keep_prefixes=PRIORITY,
remove_imprecise=False,
# One Mutation per prefix-specific input above, all at confidence 0.7.
# NOTE(review): "mesh" is in PRIORITY but has no Input or Mutation here -
# confirm that is intentional.
mutations=[
semra.Mutation(source="efo", confidence=0.7),
semra.Mutation(source="bto", confidence=0.7),
semra.Mutation(source="cl", confidence=0.7),
semra.Mutation(source="clo", confidence=0.7),
semra.Mutation(source="depmap", confidence=0.7),
semra.Mutation(source="ccle", confidence=0.7),
semra.Mutation(source="cellosaurus", confidence=0.7),
],
# Pickle locations live in this script's directory (HERE). Only the
# prioritized pickle lacks the ``.gz`` suffix.
raw_pickle_path=HERE.joinpath("mappings_raw.pkl.gz"),
processed_pickle_path=HERE.joinpath("mappings_processed.pkl.gz"),
priority_pickle_path=HERE.joinpath("mappings_prioritized.pkl"),
)


def _main() -> None:
    """Generate the cell and cell line lexicon.

    Fetches the mappings described by ``SEMRA_CONFIG`` and passes them,
    together with the term sources in ``BIOLEXICA_CONFIG``, to
    ``biolexica.assemble_terms`` with ``TERMS_PATH`` as the output path.
    """
    # Resolve the mappings first so they are available before term assembly.
    cell_mappings = SEMRA_CONFIG.get_mappings()
    biolexica.assemble_terms(
        inputs=BIOLEXICA_CONFIG,
        mappings=cell_mappings,
        output_path=TERMS_PATH,
    )


# Run the generation only when executed as a script, not on import.
if __name__ == "__main__":
    _main()
Binary file added lexica/cell/mappings_prioritized.pkl
Binary file not shown.
Binary file added lexica/cell/mappings_processed.pkl.gz
Binary file not shown.
Binary file added lexica/cell/mappings_raw.pkl.gz
Binary file not shown.
Binary file added lexica/cell/terms.tsv.gz
Binary file not shown.
70 changes: 70 additions & 0 deletions lexica/phenotype/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import semra
import biolexica
from pathlib import Path

# Directory containing this script; every path below is built relative to it.
HERE = Path(__file__).parent.resolve()
# Passed as ``output_path`` to ``biolexica.assemble_terms`` in ``_main``.
TERMS_PATH = HERE.joinpath("terms.tsv.gz")

# Prefix list reused as both ``priority`` and ``keep_prefixes`` in SEMRA_CONFIG.
PRIORITY = [
"doid",
"mondo",
"hp",
"symp",
"mesh",
"efo",
]
# Term sources passed as ``inputs`` to ``biolexica.assemble_terms`` in ``_main``.
# None of the entries sets ``ancestors``, so no subset restriction is requested
# here (see the TODOs below for the planned subsets).
BIOLEXICA_CONFIG = [
biolexica.Input(source="doid", processor="pyobo"),
biolexica.Input(source="mondo", processor="pyobo"),
biolexica.Input(source="hp", processor="pyobo"),
biolexica.Input(source="symp", processor="pyobo"),
# TODO get subsets of MeSH (C for diseases, F for Psychiatry/Psychology,
# and maybe others. See https://meshb.nlm.nih.gov/treeView)
biolexica.Input(source="mesh", processor="pyobo"),
biolexica.Input(source="efo", processor="pyobo"), # TODO find subset of EFO
# biolexica.Input(source="umls", processor="pyobo"), # TODO find subset of UMLS
# biolexica.Input(source="ncit", processor="pyobo"), # TODO find subset of NCIT
]

# SeMRA configuration for the phenotype lexicon. ``_main`` calls
# ``SEMRA_CONFIG.get_mappings()`` and hands the result to
# ``biolexica.assemble_terms`` as ``mappings``.
SEMRA_CONFIG = semra.Configuration(
    # The name and description previously duplicated the cell / cell line
    # configuration; they now describe the resources configured below.
    name="Disease and Phenotype Mappings",
    description="This configuration imports several different disease, phenotype, "
    "and symptom resources (DOID, MONDO, HP, SYMP, MeSH, and EFO) and identifies "
    "mappings between them.",
    inputs=[
        # The first two inputs name only a source; the rest pair a prefix with
        # the source used to load it and a confidence of 0.99.
        semra.Input(source="biomappings"),
        semra.Input(source="gilda"),
        semra.Input(prefix="doid", source="pyobo", confidence=0.99),
        semra.Input(prefix="mondo", source="pyobo", confidence=0.99),
        semra.Input(prefix="hp", source="pyobo", confidence=0.99),
        semra.Input(prefix="symp", source="pyobo", confidence=0.99),
        semra.Input(prefix="mesh", source="pyobo", confidence=0.99),
        semra.Input(prefix="efo", source="pyobo", confidence=0.99),
    ],
    add_labels=False,
    # The same PRIORITY list is used for prioritization and for prefix filtering.
    priority=PRIORITY,
    keep_prefixes=PRIORITY,
    remove_imprecise=False,
    # NOTE(review): mesh and efo have inputs above but no Mutation here -
    # confirm that is intentional.
    mutations=[
        semra.Mutation(source="doid", confidence=0.7),
        semra.Mutation(source="mondo", confidence=0.7),
        semra.Mutation(source="hp", confidence=0.7),
        semra.Mutation(source="symp", confidence=0.7),
    ],
    # Pickle locations live in this script's directory (HERE). Only the
    # prioritized pickle lacks the ``.gz`` suffix.
    raw_pickle_path=HERE.joinpath("mappings_raw.pkl.gz"),
    processed_pickle_path=HERE.joinpath("mappings_processed.pkl.gz"),
    priority_pickle_path=HERE.joinpath("mappings_prioritized.pkl"),
)


def _main() -> None:
    """Generate the phenotype lexicon.

    Fetches the mappings described by ``SEMRA_CONFIG`` and passes them,
    together with the term sources in ``BIOLEXICA_CONFIG``, to
    ``biolexica.assemble_terms`` with ``TERMS_PATH`` as the output path.
    """
    # Resolve the mappings first so they are available before term assembly.
    phenotype_mappings = SEMRA_CONFIG.get_mappings()
    biolexica.assemble_terms(
        inputs=BIOLEXICA_CONFIG,
        mappings=phenotype_mappings,
        output_path=TERMS_PATH,
    )


# Run the generation only when executed as a script, not on import.
if __name__ == "__main__":
    _main()
Binary file added lexica/phenotype/mappings_prioritized.pkl
Binary file not shown.
Binary file added lexica/phenotype/mappings_processed.pkl.gz
Binary file not shown.
Binary file added lexica/phenotype/mappings_raw.pkl.gz
Binary file not shown.
Binary file added lexica/phenotype/terms.tsv.gz
Binary file not shown.
13 changes: 5 additions & 8 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,11 @@ keywords =

[options]
install_requires =
# Missing itertools from the standard library you didn't know you needed
more_itertools
# Use progress bars excessively
tqdm
# Command line tools
click
more_click
# TODO your requirements go here
semra
gilda
bioregistry
pyobo
bioontologies

# Random options
zip_safe = false
Expand Down
104 changes: 9 additions & 95 deletions src/biolexica/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,17 @@ def assemble_terms(
mappings: Optional[List["semra.Mapping"]] = None,
*,
include_biosynonyms: bool = True,
output_path: Optional[Path] = None,
) -> list[gilda.Term]:
"""Assemble terms from multiple resources."""
terms = []
for inp in inputs:
if inp.processor in {"pyobo", "bioontologies"}:
terms.extend(
iter_terms_by_prefix(
inp.source, ancestors=inp.ancestors, processor=inp.processor
)
iter_terms_by_prefix(inp.source, ancestors=inp.ancestors, processor=inp.processor)
)
elif inp.processor == "biosynonyms":
terms.extend(
s.as_gilda_term() for s in biosynonyms.parse_synonyms(inp.source)
)
terms.extend(s.as_gilda_term() for s in biosynonyms.parse_synonyms(inp.source))
else:
raise ValueError(f"Unknown processor {inp.processor}")

Expand All @@ -75,6 +72,9 @@ def assemble_terms(

terms = update_terms(terms, mappings)

if output_path is not None:
gilda.term.dump_terms(terms, output_path)

return terms


Expand All @@ -101,9 +101,7 @@ def iter_terms_by_prefix(
raise ValueError(f"Unknown processor: {processor}")


def _get_pyobo_subset_terms(
source: str, ancestors: str | list[str]
) -> Iterable[gilda.Term]:
def _get_pyobo_subset_terms(source: str, ancestors: str | list[str]) -> Iterable[gilda.Term]:
from pyobo.gilda_utils import get_gilda_terms

subset = {
Expand Down Expand Up @@ -141,17 +139,11 @@ def _get_bioontologies_subset_terms(
graph.add_edge(edge.subject.curie, edge.object.curie)

descendant_curies = {
descendant
for c in _ensure_list(parent_curie)
for descendant in nx.ancestors(graph, c)
descendant for c in _ensure_list(parent_curie) for descendant in nx.ancestors(graph, c)
}

for node in tqdm(obograph.nodes, leave=False):
if (
not node.name
or node.reference is None
or node.reference.curie not in descendant_curies
):
if not node.name or node.reference is None or node.reference.curie not in descendant_curies:
continue
yield gilda.Term(
norm_text=normalize(node.name),
Expand All @@ -172,81 +164,3 @@ def _get_bioontologies_subset_terms(
status="synonym",
source=source,
)


def _main() -> None:
    """Assemble the cell and cell line terms and dump them to ``terms.tsv.gz``.

    Builds the biolexica inputs and the SeMRA configuration inline, fetches
    the mappings, assembles the terms, and writes them with
    ``gilda.term.dump_terms`` to ``HERE.joinpath("terms.tsv.gz")``.
    """
    # Imported locally so the module itself does not need them at import time.
    import pystow
    import semra

    prefix_priority = ["mesh", "efo", "cellosaurus", "ccle", "depmap", "bto", "cl", "clo"]
    storage = pystow.module("semra", "case-studies", "cancer-cell-lines")
    priority_sssom_path = storage.join(name="priority.sssom.tsv")

    term_inputs = [
        Input(source="mesh", processor="pyobo", ancestors=["mesh:D002477"]),  # cells
        Input(source="efo", processor="pyobo", ancestors=["efo:0000324"]),
        Input(source="cellosaurus", processor="pyobo"),
        # Input(source="ccle", processor="pyobo"),
        Input(source="bto", processor="pyobo"),
        Input(source="cl", processor="pyobo"),
        Input(source="clo", processor="pyobo"),
    ]

    # Prefixes that each receive a Mutation with confidence 0.7, in this order.
    mutated_prefixes = ("efo", "bto", "cl", "clo", "depmap", "ccle", "cellosaurus")

    mapping_config = semra.Configuration(
        name="Cell and Cell Line Mappings",
        description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario "
        "posed in the Biomappings paper, this configuration imports several different cell and "
        "cell line resources and identifies mappings between them.",
        inputs=[
            semra.Input(source="biomappings"),
            semra.Input(source="gilda"),
            semra.Input(prefix="cellosaurus", source="pyobo", confidence=0.99),
            semra.Input(prefix="bto", source="bioontologies", confidence=0.99),
            semra.Input(prefix="cl", source="bioontologies", confidence=0.99),
            semra.Input(prefix="clo", source="custom", confidence=0.99),
            semra.Input(prefix="efo", source="pyobo", confidence=0.99),
            semra.Input(
                prefix="depmap",
                source="pyobo",
                confidence=0.99,
                extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"},
            ),
            semra.Input(
                prefix="ccle",
                source="pyobo",
                confidence=0.99,
                extras={"version": "2019"},
            ),
        ],
        add_labels=False,
        priority=prefix_priority,
        keep_prefixes=prefix_priority,
        remove_imprecise=False,
        mutations=[
            semra.Mutation(source=prefix, confidence=0.7) for prefix in mutated_prefixes
        ],
        raw_pickle_path=storage.join(name="raw.pkl"),
        raw_sssom_path=storage.join(name="raw.sssom.tsv"),
        raw_neo4j_path=storage.join("neo4j_raw"),
        processed_pickle_path=storage.join(name="processed.pkl"),
        processed_sssom_path=storage.join(name="processed.sssom.tsv"),
        processed_neo4j_path=storage.join("neo4j"),
        processed_neo4j_name="semra-cell",
        priority_pickle_path=storage.join(name="priority.pkl"),
        priority_sssom_path=priority_sssom_path,
    )

    cell_mappings = mapping_config.get_mappings()

    assembled_terms = assemble_terms(inputs=term_inputs, mappings=cell_mappings)
    gilda.term.dump_terms(assembled_terms, HERE.joinpath("terms.tsv.gz"))


# Run the generation only when executed as a script, not on import.
if __name__ == "__main__":
    _main()

0 comments on commit f537255

Please sign in to comment.