From e0c1e0a22f8e62e4081e9dcbf2bd4b5ed9218bf9 Mon Sep 17 00:00:00 2001 From: Bartek Foltyn Date: Thu, 9 Nov 2023 14:29:05 +0100 Subject: [PATCH 1/4] Add xref details to node data --- nxontology_data/efo/efo.py | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/nxontology_data/efo/efo.py b/nxontology_data/efo/efo.py index fb048c0..e1e4c56 100644 --- a/nxontology_data/efo/efo.py +++ b/nxontology_data/efo/efo.py @@ -259,6 +259,68 @@ def update_term(old_term: str) -> str: ) return {k: sorted(v) for k, v in current_to_old.items()} + def get_xref_details(self) -> dict[str, dict[str, list[str]]]: + xrefs = self.get_xrefs_df()[["efo_id", "xref_bioregistry"]].rename( + columns={"xref_bioregistry": "xref_id"} + ) + + xref_sources = ( + self.get_xref_sources_df() + .assign( + xref_id=lambda df: df["xref"] + .str.split(":", expand=True) + .apply( + lambda row: normalize_parsed_curie( + xref_prefix=row[0], + xref_accession=row[1], + collapse_orphanet=True, + ), + axis="columns", + ) + ) + .groupby(["efo_id", "xref_id"])["axiom_source"] + .apply(list) + .reset_index() + .rename(columns={"axiom_source": "sources"}) + ) + + def get_relation(x: list[str]) -> str | None: + if "skos:exactMatch" in x or "mondo:exactMatch" in x: + return "skos:exactMatch" + if "skos:closeMatch" in x or "mondo:closeMatch" in x: + return "skos:closeMatch" + return None + + mapping_properties = ( + self.get_mapping_properties_df() + .groupby(["efo_id", "xref_id"])["mapping_property_id"] + .apply(list) + .reset_index() + .rename(columns={"mapping_property_id": "mapping_properties"}) + .assign( + relation=lambda x: x["mapping_properties"].apply(get_relation), + ) + ) + + xref_details = ( + xrefs.merge( + mapping_properties, + how="outer", + on=["efo_id", "xref_id"], + ) + .merge( + xref_sources, + how="outer", + on=["efo_id", "xref_id"], + ) + .query("efo_id != xref_id") + ) + + return { + k: v[["xref_id", "sources", "relation"]].to_dict(orient="records") + for k, v in xref_details.groupby("efo_id") + } + def get_nodes(self) -> list[dict[str, Any]]: logger.info("Generating nodes") node_df = self.get_terms_df() @@ -271,6 +333,7 @@ def get_nodes(self) -> list[dict[str, Any]]: .apply(lambda df: sorted(set(df.xref_bioregistry.dropna()))) ) node_df["subsets"] = node_df.efo_id.map(self.get_subsets()) + node_df["xref_details"] = node_df.efo_id.map(self.get_xref_details()) # Use .to_json and not .to_dict to convert NaN to None return json.loads(node_df.to_json(orient="records")) # type: ignore [no-any-return] From 8e45e1d5014d1b9bf4e588bc7f9193d568473ead Mon Sep 17 00:00:00 2001 From: Bartek Foltyn Date: Thu, 9 Nov 2023 14:30:26 +0100 Subject: [PATCH 2/4] change get_xref_details return type --- nxontology_data/efo/efo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nxontology_data/efo/efo.py b/nxontology_data/efo/efo.py index e1e4c56..34ec46d 100644 --- a/nxontology_data/efo/efo.py +++ b/nxontology_data/efo/efo.py @@ -259,7 +259,7 @@ def update_term(old_term: str) -> str: ) return {k: sorted(v) for k, v in current_to_old.items()} - def get_xref_details(self) -> dict[str, dict[str, list[str]]]: + def get_xref_details(self) -> dict[str, dict[str, str | list[str] | None]]: xrefs = self.get_xrefs_df()[["efo_id", "xref_bioregistry"]].rename( columns={"xref_bioregistry": "xref_id"} ) From 160de7688a6c684b8d79924f01315111fe4f63d5 Mon Sep 17 00:00:00 2001 From: Bartek Foltyn Date: Thu, 9 Nov 2023 16:21:08 +0100 Subject: [PATCH 3/4] move relation before sources --- nxontology_data/efo/efo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nxontology_data/efo/efo.py b/nxontology_data/efo/efo.py index 34ec46d..07bcdb4 100644 --- a/nxontology_data/efo/efo.py +++ b/nxontology_data/efo/efo.py @@ -317,7 +317,7 @@ def get_relation(x: list[str]) -> str | None: ) return { - k: v[["xref_id", "sources", "relation"]].to_dict(orient="records") + k: v[["xref_id", "relation", "sources"]].to_dict(orient="records") for k, v in xref_details.groupby("efo_id") } From dc6b6a7243d0754e7d870d73ced323a800953467 Mon Sep 17 00:00:00 2001 From: Bartek Foltyn Date: Thu, 9 Nov 2023 16:26:08 +0100 Subject: [PATCH 4/4] Remove writing of xref sources and mapping properties to output --- nxontology_data/efo/efo.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/nxontology_data/efo/efo.py b/nxontology_data/efo/efo.py index 07bcdb4..18c7a52 100644 --- a/nxontology_data/efo/efo.py +++ b/nxontology_data/efo/efo.py @@ -372,14 +372,6 @@ def write_outputs(self) -> None: write_dataframe( self.get_obsolete_df(), output_dir.joinpath(f"{self.name}_obsolete.json.gz") ) - write_dataframe( - self.get_mapping_properties_df(), - output_dir.joinpath(f"{self.name}_mapping_properties.json.gz"), - ) - write_dataframe( - self.get_xref_sources_df(), - output_dir.joinpath(f"{self.name}_xref_sources.json.gz"), - ) if nxo.name == "efo_otar_profile": nxo_slim = self.create_slim_nxo(nxo) write_ontology(nxo_slim, output_dir, compression_threshold_mb=30.0)