Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EFO: add xref details to node data #21

Merged
merged 4 commits into from
Nov 9, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 63 additions & 8 deletions nxontology_data/efo/efo.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,68 @@ def update_term(old_term: str) -> str:
)
return {k: sorted(v) for k, v in current_to_old.items()}

def get_xref_details(self) -> dict[str, list[dict[str, str | list[str] | None]]]:
    """
    Build per-term cross-reference details.

    Combines three tables on the (efo_id, xref_id) pair using outer joins:
    the xrefs table, the mapping properties table, and the xref sources table.

    Returns a dictionary keyed by efo_id. Each value is a list of records
    with the keys:

    - xref_id: identifier of the cross-referenced term.
    - relation: "skos:exactMatch" or "skos:closeMatch" when a matching
      mapping property exists for the pair, otherwise missing.
    - sources: list of axiom_source values for the pair, or missing when
      the pair does not occur in the xref sources table.

    Values that are missing after the outer joins are left as pandas NaN.
    """
    xrefs = (
        self.get_xrefs_df()[["efo_id", "xref_bioregistry"]]
        .rename(columns={"xref_bioregistry": "xref_id"})
        # A row without a normalized xref has no identifier to report.
        .dropna(subset=["xref_id"])
        # The same (efo_id, xref_id) pair on several rows would otherwise
        # be emitted as repeated identical records for a term.
        .drop_duplicates()
    )

    xref_sources = (
        self.get_xref_sources_df()
        .assign(
            # Split "prefix:accession" and normalize it so it can be joined
            # against the xref_id column of the other tables.
            # NOTE(review): only the first two ":"-separated parts are used;
            # confirm that accessions never contain a colon themselves.
            xref_id=lambda df: df["xref"]
            .str.split(":", expand=True)
            .apply(
                lambda row: normalize_parsed_curie(
                    xref_prefix=row[0],
                    xref_accession=row[1],
                    collapse_orphanet=True,
                ),
                axis="columns",
            )
        )
        .groupby(["efo_id", "xref_id"])["axiom_source"]
        .apply(list)
        .reset_index()
        .rename(columns={"axiom_source": "sources"})
    )

    def get_relation(x: list[str]) -> str | None:
        """Collapse mapping property ids into one relation, exact match first."""
        if "skos:exactMatch" in x or "mondo:exactMatch" in x:
            return "skos:exactMatch"
        if "skos:closeMatch" in x or "mondo:closeMatch" in x:
            return "skos:closeMatch"
        return None

    # NOTE(review): xref_id values from the mapping properties table are
    # joined as-is; if they are not normalized like xref_bioregistry, the
    # same xref can show up twice with different prefix casing -- confirm.
    mapping_properties = (
        self.get_mapping_properties_df()
        .groupby(["efo_id", "xref_id"])["mapping_property_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"mapping_property_id": "mapping_properties"})
        .assign(
            relation=lambda x: x["mapping_properties"].apply(get_relation),
        )
    )

    xref_details = (
        xrefs.merge(
            mapping_properties,
            how="outer",
            on=["efo_id", "xref_id"],
        )
        .merge(
            xref_sources,
            how="outer",
            on=["efo_id", "xref_id"],
        )
        # A term is not a cross-reference of itself.
        .query("efo_id != xref_id")
    )

    return {
        k: v[["xref_id", "relation", "sources"]].to_dict(orient="records")
        for k, v in xref_details.groupby("efo_id")
    }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you paste a couple examples of xref_details values here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

EFO:0000479
"xref_details": [
        {
          "xref_id": "cohd:438383",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": null
        },
        {
          "xref_id": "DOID:2224",
          "sources": [
            "EFO:0000479",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "gard:0006594",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": null
        },
        {
          "xref_id": "icd10:D47.3",
          "sources": [
            "DOID:2224",
            "ORDO:3318/e",
            "Orphanet:3318"
          ],
          "relation": null
        },
        {
          "xref_id": "icd9:238.71",
          "sources": [
            "DOID:2224",
            "EFO:0000479",
            "MONDO:equivalentTo",
            "MONDO:i2s",
            "i2s"
          ],
          "relation": null
        },
        {
          "xref_id": "icdo:9962/3",
          "sources": [
            "NCIT:C3407"
          ],
          "relation": null
        },
        {
          "xref_id": "mesh:D013920",
          "sources": [
            "DOID:2224",
            "EFO:0000479",
            "MONDO:equivalentTo",
            "ORDO:3318/e",
            "Orphanet:3318",
            "Orphanet:3318/e"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "MONDO:0005029",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "meddra:10015493",
          "sources": [
            "ORDO:3318/e",
            "Orphanet:3318",
            "Orphanet:3318/e"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "NCIT:C3407",
          "sources": [
            "DOID:2224",
            "EFO:0000479",
            "MONDO:equivalentTo",
            "MONDO:exact-label-match",
            "exact-label-match"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "omim:187950",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "omim:601977",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "omim:614521",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "oncotree:ET",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": null
        },
        {
          "xref_id": "orphanet:3318",
          "sources": [
            "DOID:2224",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "orphanet:71493",
          "sources": [
            "DOID:2224",
            "MONDO:relatedTo"
          ],
          "relation": null
        },
        {
          "xref_id": "snomedct:109994006",
          "sources": [
            "DOID:2224",
            "EFO:0000479",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:C0040028",
          "sources": [
            "DOID:2224",
            "MONDO:equivalentTo",
            "NCIT:C3407",
            "ORDO:3318/e",
            "Orphanet:3318",
            "Orphanet:3318/e"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "snomedct:128844009",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:189508006",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:189513005",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:189514004",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:191333009",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:234499005",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:307652003",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "snomedct:65471002",
          "sources": null,
          "relation": "skos:closeMatch"
        }
      ],
EFO:0000489
"xref_details": [
        {
          "xref_id": "DOID:0050936",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "MONDO:0000550",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "NCIT:C48576",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "NCIT:C48576",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:C1257877",
          "sources": [
            "MONDO:equivalentTo",
            "NCIT:C48576"
          ],
          "relation": "skos:exactMatch"
        }
      ],
EFO:0000640
"xref_details": [
        {
          "xref_id": "DOID:4465",
          "sources": [
            "EFO:0000640",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "gard:0009572",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": null
        },
        {
          "xref_id": "gard:0009575",
          "sources": [
            "MONDO:equivalentTo",
            "shared-umls-xref"
          ],
          "relation": null
        },
        {
          "xref_id": "icd10:C64",
          "sources": [
            "ORDO:319298/ntbt",
            "ORDO:47044/attributed",
            "ORDO:47044/ntbt",
            "Orphanet:319298",
            "Orphanet:47044"
          ],
          "relation": null
        },
        {
          "xref_id": "MONDO:0017884",
          "sources": null,
          "relation": null
        },
        {
          "xref_id": "mesh:C538614",
          "sources": null,
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "NCIT:C6975",
          "sources": [
            "DOID:4465",
            "EFO:0000640",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "omim:605074",
          "sources": [
            "DOID:4465",
            "EFO:0000640",
            "MONDO:equivalentTo",
            "ORDO:47044/e",
            "Orphanet:47044"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "oncotree:PRCC",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": null
        },
        {
          "xref_id": "orphanet:319298",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "orphanet:47044",
          "sources": [
            "MONDO:equivalentTo",
            "OMIM:605074"
          ],
          "relation": null
        },
        {
          "xref_id": "snomedct:733608000",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:C1306837",
          "sources": [
            "DOID:4465",
            "MONDO:equivalentTo",
            "NCIT:C6975",
            "ORDO:319298/e",
            "Orphanet:319298",
            "Orphanet:319298/e"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:C1336078",
          "sources": [
            "DOID:4465",
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:C2931899",
          "sources": null,
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "umls:CN205129",
          "sources": [
            "MONDO:equivalentTo"
          ],
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "NCIT:C27890",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "Orphanet:47044",
          "sources": null,
          "relation": "skos:exactMatch"
        },
        {
          "xref_id": "snomedct:4797003",
          "sources": null,
          "relation": "skos:closeMatch"
        },
        {
          "xref_id": "umls:C1336839",
          "sources": null,
          "relation": "skos:closeMatch"
        }
      ],

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks great.

One note for future investigation is that it'd be nice to understand what the MONDO:equivalentTo source refers to (i.e. where it originates and how) and whether it implies (or should imply) a skos:exactMatch relation.


def get_nodes(self) -> list[dict[str, Any]]:
logger.info("Generating nodes")
node_df = self.get_terms_df()
Expand All @@ -271,6 +333,7 @@ def get_nodes(self) -> list[dict[str, Any]]:
.apply(lambda df: sorted(set(df.xref_bioregistry.dropna())))
)
node_df["subsets"] = node_df.efo_id.map(self.get_subsets())
node_df["xref_details"] = node_df.efo_id.map(self.get_xref_details())
# Use .to_json and not .to_dict to convert NaN to None
return json.loads(node_df.to_json(orient="records")) # type: ignore [no-any-return]

Expand Down Expand Up @@ -309,14 +372,6 @@ def write_outputs(self) -> None:
write_dataframe(
self.get_obsolete_df(), output_dir.joinpath(f"{self.name}_obsolete.json.gz")
)
write_dataframe(
self.get_mapping_properties_df(),
output_dir.joinpath(f"{self.name}_mapping_properties.json.gz"),
)
write_dataframe(
self.get_xref_sources_df(),
output_dir.joinpath(f"{self.name}_xref_sources.json.gz"),
)
if nxo.name == "efo_otar_profile":
nxo_slim = self.create_slim_nxo(nxo)
write_ontology(nxo_slim, output_dir, compression_threshold_mb=30.0)
Expand Down