From 983a282157cd3f6b77a38417d9ae2d842e73cab1 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Thu, 14 Dec 2023 17:29:34 +0100 Subject: [PATCH] Medcat refresh (#623) * extracted and modified methods, no big cleanup * updated move to obs, some checks failing * trim down medcat usage * ruff-ed * enhanced docstring * updated doc and test * clean up add to obs * clean up add to obs * tiny test data for medcat * use booleans instead of yes/no --- docs/usage/usage.md | 7 +- ehrapy/plot/_scanpy_pl_api.py | 306 +-------- ehrapy/tools/__init__.py | 7 +- ehrapy/tools/nlp/_medcat.py | 599 +++++++----------- tests/tools/nlp/test_data_nlp/dataset1.csv | 5 + .../nlp/test_data_nlp/medcat_annotations1.csv | 39 ++ tests/tools/nlp/test_medcat.py | 25 +- 7 files changed, 317 insertions(+), 671 deletions(-) create mode 100644 tests/tools/nlp/test_data_nlp/dataset1.csv create mode 100644 tests/tools/nlp/test_data_nlp/medcat_annotations1.csv diff --git a/docs/usage/usage.md b/docs/usage/usage.md index 898e3cb0..534f4c5c 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -224,10 +224,9 @@ In contrast to a preprocessing function, a tool usually adds an easily interpret :nosignatures: tools.Translator - tools.MedCAT - tools.mc.run_unsupervised_training - tools.mc.annotate_text - tools.mc.get_annotation_overview + tools.annotate_text + tools.get_medcat_annotation_overview + tools.add_medcat_annotation_to_obs ``` ### Survival Analysis diff --git a/ehrapy/plot/_scanpy_pl_api.py b/ehrapy/plot/_scanpy_pl_api.py index d54d7c06..4f61330e 100644 --- a/ehrapy/plot/_scanpy_pl_api.py +++ b/ehrapy/plot/_scanpy_pl_api.py @@ -10,7 +10,6 @@ from scanpy.plotting import DotPlot, MatrixPlot, StackedViolin from scanpy.plotting._tools.scatterplots import _wraps_plot_scatter -from ehrapy.tools.nlp._medcat import EhrapyMedcat, MedCAT from ehrapy.util._doc_util import ( _doc_params, doc_adata_color_etc, @@ -137,34 +136,8 @@ def scatter( save=save, ax=ax, ) - if isinstance(adata, MedCAT): - if color: - if isinstance(color, str): - color = [color] # type: ignore - additional_columns: list[str] = [] - for colored_column in color: - if ( - colored_column not in set(adata.anndata.var_names) - and colored_column not in set(adata.anndata.obs.columns) - and not colored_column.startswith("#") - ): # hex codes are not treated as extracted entities - EhrapyMedcat.add_binary_column_to_obs( - adata, - adata.anndata, - colored_column, - color, # type: ignore - additional_columns, - ) - - scatter = scatter_partial(adata=adata.anndata, color=color[0]) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return scatter - - else: - return scatter_partial(adata=adata.anndata) - - else: - return scatter_partial(adata=adata, color=color) + + return scatter_partial(adata=adata, color=color) @_doc_params( @@ -279,18 +252,8 @@ def heatmap( norm=norm, **kwds, ) - if isinstance(adata, MedCAT): - if isinstance(groupby, str): - groupby = [groupby] - additional_columns: list[str] = [] - for grp_column in groupby: - if grp_column not in set(adata.anndata.var_names) and grp_column not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, grp_column, groupby, additional_columns) # type: ignore - heatmap = heatmap_partial(adata=adata.anndata, groupby=groupby) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return heatmap - else: - return heatmap_partial(adata=adata, groupby=groupby) + + return heatmap_partial(adata=adata, groupby=groupby) @_doc_params( @@ -438,19 +401,8 @@ def dotplot( norm=norm, **kwds, ) - if isinstance(adata, MedCAT): - # keep loop and lists in case dotplot will accept sequences - if isinstance(groupby, str): - groupby = [groupby] # type: ignore - additional_columns: list[str] = [] - for grp_column in groupby: - if grp_column not in set(adata.anndata.var_names) and grp_column not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, grp_column, groupby, additional_columns) # type: ignore - dotplot = dotplot_partial(adata=adata.anndata, groupby=groupby[0]) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return dotplot - else: - return dotplot_partial(adata=adata, groupby=groupby) + + return dotplot_partial(adata=adata, groupby=groupby) @_doc_params(show_save_ax=doc_show_save_ax, common_plot_args=doc_common_plot_args) @@ -529,18 +481,8 @@ def tracksplot( figsize=figsize, **kwds, ) - if isinstance(adata, MedCAT): - # keep the list and loop in case of groupby could be a sequence in future - groupby = [groupby] # type: ignore - additional_columns: list[str] = [] - for grp_col in groupby: - if grp_col not in set(adata.anndata.var_names) and grp_col not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, grp_col, groupby, additional_columns) # type: ignore - tracksplot = tracksplot_partial(adata=adata.anndata, groupby=groupby[0]) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return tracksplot - else: - return tracksplot_partial(adata=adata, groupby=groupby) + + return tracksplot_partial(adata=adata, groupby=groupby) def violin( @@ -571,7 +513,7 @@ def violin( Args: adata: :class:`~anndata.AnnData` object object containing all observations. keys: Keys for accessing variables of `.var_names` or fields of `.obs`. - groupby: The key of the observation grouping to consider. Could also be an entity extracted by ehrapy's medcat tool. + groupby: The key of the observation grouping to consider. log: Plot on logarithmic axis. use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present. stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`. @@ -633,23 +575,8 @@ def violin( ax=ax, **kwds, ) - if isinstance(adata, MedCAT): - if groupby: - if isinstance(groupby, str): - groupby = [groupby] # type: ignore - additional_columns: list[str] = [] - for grp_column in groupby: - if grp_column not in set(adata.anndata.var_names) and grp_column not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, grp_column, groupby, additional_columns) # type: ignore - violin = violin_partial(adata=adata.anndata, groupby=groupby[0]) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return violin - else: - return violin_partial(adata=adata.anndata, groupby=None) - - else: - return violin_partial(adata=adata, groupby=groupby) + return violin_partial(adata=adata, groupby=groupby) @_doc_params( @@ -794,23 +721,8 @@ def stacked_violin( norm=norm, **kwds, ) - if isinstance(adata, MedCAT): - if isinstance(groupby, str): - groupby = [groupby] # type: ignore - additional_columns: list[str] = [] - for colored_column in groupby: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, colored_column, groupby, additional_columns) # type: ignore - stacked_violin = stacked_vio_partial( - adata=adata.anndata, groupby=groupby if isinstance(groupby, Sequence) else groupby[0] - ) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return stacked_violin - else: - return stacked_vio_partial(adata=adata, groupby=groupby) + + return stacked_vio_partial(adata=adata, groupby=groupby) @_doc_params( @@ -928,21 +840,8 @@ def matrixplot( norm=norm, **kwds, ) - if isinstance(adata, MedCAT): - if isinstance(groupby, str): - groupby = [groupby] # type: ignore - additional_columns: list[str] = [] - for colored_column in groupby: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, colored_column, groupby, additional_columns) # type: ignore - matrixplot = matrix_partial(adata=adata.anndata, groupby=groupby) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return matrixplot - else: - return matrix_partial(adata=adata, groupby=groupby) + + return matrix_partial(adata=adata, groupby=groupby) @_doc_params(show_save_ax=doc_show_save_ax) @@ -985,23 +884,8 @@ def clustermap( .. image:: /_static/docstring_previews/clustermap.png """ clustermap_partial = partial(sc.pl.clustermap, use_raw=use_raw, show=show, save=save, **kwds) - if isinstance(adata, MedCAT): - if obs_keys: - grp_flag = False - obs_keys = [obs_keys] # type: ignore - if obs_keys[0] not in set(adata.anndata.var_names) and obs_keys[0] not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, obs_keys[0], obs_keys, None) # type: ignore - grp_flag = True - clustermap = clustermap_partial(adata=adata.anndata, obs_keys=obs_keys[0]) - if grp_flag: - adata.anndata.obs.drop(obs_keys[0], inplace=True, axis=1) - return clustermap - else: - return clustermap_partial(adata=adata.anndata, obs_keys=None) - - else: - return clustermap_partial(adata=adata, obs_keys=obs_keys) + return clustermap_partial(adata=adata, obs_keys=obs_keys) def ranking( @@ -1116,19 +1000,8 @@ def dendrogram( save=save, ax=ax, ) - if isinstance(adata, MedCAT): - grp_flag = False - groupby = [groupby] # type: ignore - if groupby[0] not in set(adata.anndata.var_names) and groupby[0] not in set(adata.anndata.obs.columns): - EhrapyMedcat.add_binary_column_to_obs(adata, adata.anndata, groupby[0], groupby, None) # type: ignore - grp_flag = True - dendrogram = dendrogram_partial(adata=adata.anndata, groupby=groupby[0]) - if grp_flag: - adata.anndata.obs.drop(groupby[0], inplace=True, axis=1) - return dendrogram - else: - return dendrogram_partial(adata=adata, groupby=groupby) + return dendrogram_partial(adata=adata, groupby=groupby) # @_wraps_plot_scatter @@ -1174,28 +1047,8 @@ def pca( pca_partial = partial( sc.pl.pca, annotate_var_explained=annotate_var_explained, show=show, return_fig=return_fig, save=save ) - if isinstance(adata, MedCAT): - if kwargs.get("color"): - if isinstance(kwargs["color"], str): - kwargs["color"] = [kwargs["color"]] - additional_columns: list[str] = [] - for colored_column in kwargs["color"]: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, adata.anndata, colored_column, kwargs["color"], additional_columns - ) - pca = pca_partial(adata=adata.anndata, **kwargs) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return pca - - else: - return pca_partial(adata=adata.anndata, **kwargs) - - else: - return pca_partial(adata=adata, **kwargs) + + return pca_partial(adata=adata, **kwargs) def pca_loadings( @@ -1343,27 +1196,8 @@ def tsne(adata, **kwargs) -> Axes | list[Axes] | None: # pragma: no cover .. image:: /_static/docstring_previews/tsne_3.png """ - if isinstance(adata, MedCAT): - if kwargs.get("color"): - if isinstance(kwargs["color"], str): - kwargs["color"] = [kwargs["color"]] - additional_columns: list[str] = [] - for colored_column in kwargs["color"]: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, adata.anndata, colored_column, kwargs["color"], additional_columns - ) - tsne = sc.pl.tsne(adata=adata.anndata, **kwargs) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return tsne - - else: - return sc.pl.tsne(adata=adata.anndata, **kwargs) - else: - return sc.pl.tsne(adata=adata, **kwargs) + + return sc.pl.tsne(adata=adata, **kwargs) # @_wraps_plot_scatter @@ -1373,7 +1207,7 @@ def tsne(adata, **kwargs) -> Axes | list[Axes] | None: # pragma: no cover scatter_bulk=doc_scatter_embedding, show_save_ax=doc_show_save_ax, ) -def umap(adata: AnnData | MedCAT, **kwargs) -> Axes | list[Axes] | None: # pragma: no cover +def umap(adata: AnnData, **kwargs) -> Axes | list[Axes] | None: # pragma: no cover """Scatter plot in UMAP basis. Args: @@ -1405,27 +1239,8 @@ def umap(adata: AnnData | MedCAT, **kwargs) -> Axes | list[Axes] | None: # prag .. image:: /_static/docstring_previews/umap_3.png """ - if isinstance(adata, MedCAT): - if kwargs.get("color"): - if isinstance(kwargs["color"], str): - kwargs["color"] = [kwargs["color"]] - additional_columns: list[str] = [] - for colored_column in kwargs["color"]: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, adata.anndata, colored_column, kwargs["color"], additional_columns - ) - umap = sc.pl.umap(adata=adata.anndata, **kwargs) - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return umap - - else: - return sc.pl.umap(adata=adata.anndata, **kwargs) - - else: - return sc.pl.umap(adata=adata, **kwargs) + + return sc.pl.umap(adata=adata, **kwargs) # @_wraps_plot_scatter @@ -1457,28 +1272,8 @@ def diffmap(adata, **kwargs) -> Axes | list[Axes] | None: # pragma: no cover Preview: .. image:: /_static/docstring_previews/diffmap.png """ - if isinstance(adata, MedCAT): - if kwargs.get("color"): - if isinstance(kwargs["color"], str): - kwargs["color"] = [kwargs["color"]] - additional_columns: list[str] = [] - for colored_column in kwargs["color"]: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, adata.anndata, colored_column, kwargs["color"], additional_columns - ) - diffmap = sc.pl.diffmap(adata=adata.anndata, **kwargs) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return diffmap - - else: - return sc.pl.diffmap(adata=adata.anndata, **kwargs) - - else: - return sc.pl.diffmap(adata=adata, **kwargs) + + return sc.pl.diffmap(adata=adata, **kwargs) # @_wraps_plot_scatter @@ -1526,28 +1321,8 @@ def draw_graph( .. image:: /_static/docstring_previews/draw_graph_2.png """ draw_graph_part = partial(sc.pl.draw_graph, layout=layout) - if isinstance(adata, MedCAT): - if kwargs.get("color"): - if isinstance(kwargs["color"], str): - kwargs["color"] = [kwargs["color"]] - additional_columns: list[str] = [] - for colored_column in kwargs["color"]: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, adata.anndata, colored_column, kwargs["color"], additional_columns - ) - graph = draw_graph_part(adata=adata.anndata, **kwargs) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return graph - - else: - return draw_graph_part(adata=adata.anndata, **kwargs) - - else: - return draw_graph_part(adata=adata, **kwargs) + + return draw_graph_part(adata=adata, **kwargs) class Empty(Enum): @@ -1680,32 +1455,7 @@ def embedding( **kwargs, ) - if isinstance(adata, MedCAT): - if color: - if isinstance(color, str): - color = [color] # type: ignore - additional_columns: list[str] = [] - for colored_column in color: - if colored_column not in set(adata.anndata.var_names) and colored_column not in set( - adata.anndata.obs.columns - ): - EhrapyMedcat.add_binary_column_to_obs( - adata, - adata.anndata, - colored_column, - color, # type: ignore - additional_columns, - ) - _embedding = embedding_partial(adata=adata.anndata, color=color) - if additional_columns: - adata.anndata.obs.drop(additional_columns, inplace=True, axis=1) - return _embedding - - else: - return embedding_partial(adata=adata.anndata, color=None) - - else: - return embedding_partial(adata=adata, color=color) + return embedding_partial(adata=adata, color=color) @_doc_params(vminmax=doc_vbound_percentile, panels=doc_panels, show_save_ax=doc_show_save_ax) diff --git a/ehrapy/tools/__init__.py b/ehrapy/tools/__init__.py index 9fc5b9dc..20f0d699 100644 --- a/ehrapy/tools/__init__.py +++ b/ehrapy/tools/__init__.py @@ -4,8 +4,11 @@ from ehrapy.tools.feature_ranking._rank_features_groups import rank_features_groups try: # pragma: no cover - from ehrapy.tools.nlp._medcat import EhrapyMedcat as mc - from ehrapy.tools.nlp._medcat import MedCAT + from ehrapy.tools.nlp._medcat import ( + add_medcat_annotation_to_obs, + annotate_text, + get_medcat_annotation_overview, + ) except ImportError: pass from ehrapy.tools.nlp._translators import Translator diff --git a/ehrapy/tools/nlp/_medcat.py b/ehrapy/tools/nlp/_medcat.py index e7f23b40..41249ff5 100644 --- a/ehrapy/tools/nlp/_medcat.py +++ b/ehrapy/tools/nlp/_medcat.py @@ -1,394 +1,251 @@ from __future__ import annotations +from typing import TYPE_CHECKING + import pandas as pd from thefuzz import process -from ehrapy.core._tool_available import _check_module_importable +if TYPE_CHECKING: + from collections.abc import Iterable -try: - from medcat.cat import CAT - from medcat.cdb import CDB - from medcat.cdb_maker import CDBMaker - from medcat.config import Config - from medcat.vocab import Vocab -except ModuleNotFoundError: - pass -from typing import TYPE_CHECKING + from anndata import AnnData -from rich import box, print -from rich.console import Console -from rich.table import Table + try: + from medcat.cat import CAT -if TYPE_CHECKING: - from anndata import AnnData + except ModuleNotFoundError: + pass -class MedCAT: - """Wrapper class for Medcat. This class will hold references to the current AnnData object, which holds the data, the current model (with vocab and concept database) and should be - passed to all functions exposed to the ehrapy nlp API when required. +def _format_df_column(df: pd.DataFrame, column_name: str) -> list[tuple[int, str]]: + """Format the df to match: formatted_data = [(row_id, row_text), (row_id, row_text), ...] + as this is required by MedCAT's multiprocessing annotation step + + """ + formatted_data = [] + for id, row in df.iterrows(): + text = row[column_name] + formatted_data.append((id, text)) + return formatted_data + + +def _flatten_annotated_results(annotation_results: dict) -> dict: + """Flattens the nested set (usually 5 level nested) of annotation results. + + annotation_results is just a simple flattened dict with infos on all entities found """ + flattened_annotated_dict = {} + entry_nr = 0 + + # row numbers where the text column is located in the original data + for row_id in annotation_results.keys(): + # all entities extracted from a given row + entities = annotation_results[row_id]["entities"] + for entity_id in entities.keys(): + # tokens are currently ignored, as they will not appear with the current basic model used by ehrapy from MedCAT + if entity_id != "tokens": + single_entity = {"row_nr": row_id} + entity = entities[entity_id] + # iterate over all info attributes of a single entity found in a specific row + for entity_key in entity.keys(): + if entity_key in ["pretty_name", "cui", "type_ids", "types"]: + single_entity[entity_key] = entities[entity_id][entity_key] + elif entity_key == "meta_anns": + single_entity[entity_key] = entities[entity_id][entity_key]["Status"]["value"] + flattened_annotated_dict[entry_nr] = single_entity + entry_nr += 1 + return flattened_annotated_dict + + +def annotate_text( + adata: AnnData, + cat: CAT, + text_column: str, + key_added: str = "medcat_annotations", + n_proc: int = 2, + batch_size_chars: int = 500000, + copy: bool = False, +) -> AnnData | None: + """Annotate the original free text data. Note this will only annotate non null rows. + + The result is a DataFrame. + This DataFrame serves as the base for all further analyses, for example coloring UMAPs by specific diseases. + + Args: + adata: AnnData object that holds the data to annotate. + cat: MedCAT object. + text_column: Name of the column that should be annotated. + key_added: Key to add to adata.uns for the annotated results. + n_proc: Number of processors to use. + batch_size_chars: batch size to use for CAT's multiprocessing method. + copy: Whether to copy adata or not. + + Returns: + Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields; + + `adata.uns[key_added]` : :class:`pandas.DataFrame` + DataFrame with the annotated results. - def __init__(self, anndata: AnnData, vocabulary: Vocab = None, concept_db: CDB = None, model_pack_path=None): - if not _check_module_importable("medcat"): - raise RuntimeError("Package medcat is not importable. Please install via pip install medcat") - self.anndata = anndata - self.vocabulary = vocabulary - self.concept_db = concept_db - if self.vocabulary is not None and self.concept_db is not None: - self.cat = CAT(cdb=concept_db, config=concept_db.config, vocab=vocabulary) - elif model_pack_path is not None: - self.cat = CAT.load_model_pack(model_pack_path) - # will be initialized as None, but will get updated when running annotate_text - self.annotated_results = None - - def update_cat(self, vocabulary: Vocab = None, concept_db: CDB = None): - """Updates the current MedCAT instance with new Vocabularies and Concept Databases. - - Args: - vocabulary: Vocabulary to update to. - concept_db: Concept Database to update to. - """ - self.cat = CAT(cdb=concept_db, config=concept_db.config, vocab=vocabulary) - - def update_cat_config(self, concept_db_config: Config) -> None: - """Updates the MedCAT configuration. - - Args: - concept_db_config: Concept to update to. - """ - self.concept_db.config = concept_db_config - - def set_filter_by_tui(self, tuis: list[str]) -> None: - """Restrict results of annotation step to certain tui's (type unique identifiers). - - Note that this will change the MedCat object by updating the concept database config. In every annotation - process that will be run afterwards, entities are shown, only if they fall into the tui's type. - A full list of tui's can be found at: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt - - As an example: - Setting tuis=["T047", "T048"] will only annotate concepts (identified by a CUI (concept unique identifier)) in UMLS that are either diseases or - syndroms (T047) or mental/behavioural dysfunctions (T048). - - Args: - tuis: list of TUI's (default is - - """ - # the filtered cui's that fall into the type of the filter tui's - cui_filters = set() - for type_id in tuis: - cui_filters.update(self.cat.cdb.addl_info["type_id2cuis"][type_id]) - self.cat.cdb.config.linking["filters"]["cuis"] = cui_filters - - @staticmethod - def create_vocabulary(vocabulary_data: str, replace: bool = True) -> Vocab: - """Creates a MedCAT Vocab and sets it for the MedCAT object. - - Args: - vocabulary_data: Path to the vocabulary data. - It is a tsv file and must look like: - - \t\t - house 34444 0.3232 0.123213 1.231231 - replace: Whether to replace existing words in the vocabulary. - - Returns: - Instance of a MedCAT Vocab - """ - vocabulary = Vocab() - vocabulary.add_words(vocabulary_data, replace=replace) - - return vocabulary - - @staticmethod - def create_concept_db(csv_path: list[str], config: Config = None) -> CDB: - """Creates a MedCAT concept database and sets it for the MedCAT object. - - Args: - csv_path: List of paths to one or more csv files containing all concepts. - The concept csvs must look like: - - cui,name - 1,kidney failure - 7,coronavirus - config: Optional MedCAT concept database configuration. - If not provided a default configuration with config.general['spacy_model'] = 'en_core_sci_md' is created. - Returns: - Instance of a MedCAT CDB concept database - """ - if config is None: - config = Config() - config.general["spacy_model"] = "en_core_sci_md" - maker = CDBMaker(config) - concept_db = maker.prepare_csvs(csv_path, full_build=True) - - return concept_db - - @staticmethod - def save_vocabulary(vocab: Vocab, output_path: str) -> None: - """Saves a vocabulary. - - Args: - vocab: The vocabulary object - output_path: Path to write the vocabulary to. - """ - vocab.save(output_path) - - @staticmethod - def load_vocabulary(vocabulary_path) -> Vocab: - """Loads a vocabulary. - - Args: - vocabulary_path: Path to load the vocabulary from. - """ - - return Vocab.load(vocabulary_path) - - @staticmethod - def save_concept_db(cdb, output_path: str) -> None: - """Saves a concept database. - - Args: - cdb: the concept database object - output_path: Path to save the concept database to. - """ - cdb.save(output_path) - - @staticmethod - def load_concept_db(concept_db_path) -> CDB: - """Loads the concept database. - - Args: - concept_db_path: Path to load the concept database from. - """ - return CDB.load(concept_db_path) - - def save_model_pack(self, model_pack_dir: str = ".", name: str = "ehrapy_medcat_model_pack") -> None: - """Saves a MedCAT model pack. - - Args: - model_pack_dir: Path to save the model to (defaults to current working directory). - name: Name of the new model pack - """ - _ = self.cat.create_model_pack(name) - - -class EhrapyMedcat: - """Wrapper class to perform feature extraction from free text data using MedCAT with ehrapy. This can be simply called by `ep.tl.mc`. - This class is not supposed to be instantiated at any time, it just serves as a wrapper for import. """ + if copy: + adata = adata.copy() + non_null_text = pd.DataFrame(adata.obs[text_column][~adata.obs[text_column].isnull()]) + formatted_text_column = _format_df_column(non_null_text, text_column) + results = cat.multiprocessing(formatted_text_column, batch_size_chars=batch_size_chars, nproc=n_proc) + flattened_res = _flatten_annotated_results(results) + + # flatten annotated results into a Pandas DataFrame and remove duplicate entries; for example when a single entity like a disease is mentioned multiple times without any meaningful context changes + # Example: The patient suffers from Diabetes. Cause of the Diabetes, he receives drug X. + flattened_res_df = pd.DataFrame.from_dict(flattened_res, orient="index").drop_duplicates( + subset=["cui", "row_nr", "meta_anns"] + ) + + # sort for row number in ascending order and reset index to keep index updated + adata.uns[key_added] = flattened_res_df.sort_values(by=["row_nr"]).reset_index(drop=True) + + return adata if copy else None + + +def _filter_df_by_status(df: pd.DataFrame, status: str) -> pd.DataFrame: + """Util function to filter passed dataframe by status.""" + df_res = df + if status != "Both": + if status not in {"Affirmed", "Other"}: + raise StatusNotSupportedError(f"{status} is not available. Please use either Affirmed, Other or Both!") + mask = df["meta_anns"].values == status + df_res = df[mask] + return df_res + + +def get_medcat_annotation_overview( + adata: AnnData, status: str = "Affirmed", use_key: str = "medcat_annotations" +) -> pd.DataFrame: + """Provide an overview for the annotation results. An overview will look like the following: + + cui (the CUI), nsubjects (from how many rows this one got extracted), type_ids (TUIs), name(name of the entitiy), perc_subjects (how many rows relative + to absolute number of rows) + + Args: + medcat_obj: The current MedCAT object which holds all infos on NLP analysis with MedCAT and ehrapy. + n: Basically the parameter for head() of pandas Dataframe. How many of the most common entities should be shown? + status: One of "Affirmed" (default), "Other" or "Both". Displays stats for either only affirmed entities, negated ones or both. + use_key: Key to use for the annotated results. + + Returns: + A Pandas DataFrame with the overview stats. + """ + df = _filter_df_by_status(adata.uns[use_key], status) + # group by CUI as this is a unique identifier per entity + grouped = df.groupby("cui") + # get absolute number of rows with this entity + # note for overview, only one TUI and type is shown (there shouldn't be much situations were multiple are even possible or useful) + res = grouped.agg( + { + "pretty_name": (lambda x: next(iter(set(x)))), + "type_ids": (lambda x: next(iter(x))[0]), + "types": (lambda x: next(iter(x))[0]), + "row_nr": "nunique", + } + ) + res = res.rename(columns={"row_nr": "n_patient_visit"}) + # relative amount of patient visits with the specific entity to all patient visits (or rows in the original data) + res["n_patient_visit_percent"] = (res["n_patient_visit"] / df["row_nr"].nunique()) * 100 + res.round({"n_patient_visit_percent": 1}) + + return res + + +def _check_valid_name(df: pd.DataFrame, name: Iterable[str]) -> None: + """Check whether the name is in the extracted entities to inform about possible typos. + Currently, only the pretty_name column is supported. + """ + invalid_names = [] + suggested_names = [] + + for nm in name: + pretty_names = df["pretty_name"].unique() - @staticmethod - def run_unsupervised_training( - medcat_obj: MedCAT, text: pd.Series, progress_print: int = 100, print_statistics: bool = False - ) -> None: - """Performs MedCAT unsupervised training on a provided text column. - - Args: - medcat_obj: ehrapy's custom MedCAT object, that keeps track of the vocab, concept database and the (annotated) results - text: Pandas Series of (free) text to annotate. - progress_print: print progress after that many training documents - print_statistics: Whether to print training statistics after training. - """ - print(f"[bold blue]Running unsupervised training using {len(text)} documents.") - medcat_obj.cat.train(text.values, progress_print=progress_print) - - if print_statistics: - medcat_obj.cat.cdb.print_stats() - - @staticmethod - def annotate_text(medcat_obj: MedCAT, text_column: str, n_proc: int = 2, batch_size_chars: int = 500000) -> None: - """Annotate the original free text data. Note this will only annotate non null rows. - The result will be a DataFrame. It will be set as the annotated_results attribute for the passed MedCat object. - This dataframe will be the base for all further analyses, for example coloring umaps by specific diseases. - - Args: - medcat_obj: Ehrapy's custom MedCAT object. The annotated_results attribute will be set here. - text_column: Name of the column that should be annotated - n_proc: Number of processors to use - batch_size_chars: batch size to control for the variability between document sizes - """ - non_null_text = EhrapyMedcat._filter_null_values(medcat_obj.anndata.obs, text_column) - formatted_text_column = EhrapyMedcat._format_df_column(non_null_text, text_column) - results = medcat_obj.cat.multiprocessing(formatted_text_column, batch_size_chars=batch_size_chars, nproc=n_proc) - flattened_res = EhrapyMedcat._flatten_annotated_results(results) - # sort for row number in ascending order and reset index to keep index updated - medcat_obj.annotated_results = ( - EhrapyMedcat._annotated_results_to_df(flattened_res).sort_values(by=["row_nr"]).reset_index(drop=True) + if nm not in pretty_names: + invalid_names.append(nm) + try: + new_name, _ = process.extractOne(query=nm, choices=pretty_names, score_cutoff=50) + suggested_names.append(new_name) + except EntitiyNotFoundError: + pass + + if invalid_names: + suggested_str = f" Do you mean {suggested_names}?" if suggested_names else "" + msg = f"Did not find {invalid_names} in MedCAT's extracted entities and added them not to .obs.{suggested_str}" + raise EntitiyNotFoundError(msg) + + +def add_medcat_annotation_to_obs( + adata: AnnData, + name: Iterable[str] | str, + use_key: str = "medcat_annotations", + added_colname: Iterable[str] | str | None = None, + copy: bool = False, +) -> AnnData | None: + """Add info extracted from free text as a binary column to obs. + + Indicates whether the specific entity to color by has been found in that row or not. + + + Args: + adata: AnnData object that holds the data to annotate. + name: Name of the entity to add as a column to obs. + use_key: Key to use for the annotated results. + added_colname: Name of the column to add to obs. If None, name will be used. + copy: Whether to copy adata or not. + + Returns: + Returns `None` if `copy=False`, else returns an `AnnData` object. Sets the following fields; + + `adata.obs[name | added_coname]` : :class:`pandas.DataFrame` + Added column(s) `to adata.obs`, indicating whether the specific entity to color by has been found in that row or not. + + """ + if use_key not in adata.uns.keys(): + raise ValueError(f"Key {use_key} not found in adata.uns. Please run ep.tl.annotate_text first.") + + if copy: + adata = adata.copy() + + if isinstance(name, str): + annotation_names = [name] + else: + annotation_names = list(name) + + if added_colname is None: + added_colname = annotation_names + elif isinstance(added_colname, str): + added_colname = [added_colname] + + added_colnames = list(added_colname) + if len(added_colnames) != len(annotation_names): + raise ValueError( + f"Length of added_colname ({len(added_colnames)}) does not match length of name ({len(annotation_names)})." ) - @staticmethod - def get_annotation_overview( - medcat_obj: MedCAT, n: int = 10, status: str = "Affirmed", save_to_csv: bool = False, save_path: str = "." - ) -> None: - """Provide an overview for the annotation results. An overview will look like the following: - - cui (the CUI), nsubjects (from how many rows this one got extracted), type_ids (TUIs), name(name of the entitiy), perc_subjects (how many rows relative - to absolute number of rows) - - Args: - medcat_obj: The current MedCAT object which holds all infos on NLP analysis with MedCAT and ehrapy. - n: Basically the parameter for head() of pandas Dataframe. How many of the most common entities should be shown? - status: One of "Affirmed" (default), "Other" or "Both". Displays stats for either only affirmed entities, negated ones or both. - save_to_csv: Whether to save the overview dataframe to a local .csv file in the current working directory or not. - save_path: Path to save the overview as .csv file. Defaults to current working directory. - - Returns: - A Pandas DataFrame with the overview stats. - """ - df = EhrapyMedcat._filter_df_by_status(medcat_obj.annotated_results, status) - # group by CUI as this is a unique identifier per entity - grouped = df.groupby("cui") - # get absolute number of rows with this entity - # note for overview, only one TUI and type is shown (there shouldn't be much situations were multiple are even possible or useful) - res = grouped.agg( + _check_valid_name(adata.uns[use_key], annotation_names) + + # only extract affirmed entities + df = _filter_df_by_status(adata.uns[use_key], "Affirmed") + + # check whether the name is in the extracted entities to inform about possible typos + # currently, only the pretty_name column is supported + for i, annotation_name in enumerate(annotation_names): + adata.obs[added_colnames[i]] = df.groupby("row_nr").agg( { - "pretty_name": (lambda x: next(iter(set(x)))), - "type_ids": (lambda x: next(iter(x))[0]), - "types": (lambda x: next(iter(x))[0]), - "row_nr": "nunique", + "pretty_name": ( + lambda row_pretty_names, annotation_name=annotation_name: any( + row_pretty_names.isin([annotation_name]) + ) + ) } ) - res = res.rename(columns={"row_nr": "n_patient_visit"}) - # relative amount of patient visits with the specific entity to all patient visits (or rows in the original data) - res["n_patient_visit_percent"] = (res["n_patient_visit"] / df["row_nr"].nunique()) * 100 - res.round({"n_patient_visit_percent": 1}) - # save to csv if desired - if save_to_csv: - res.to_csv(save_path) - - overview_table = EhrapyMedcat._df_to_rich_table(res.nlargest(n, "n_patient_visit")) - console = Console() - console.print(overview_table) - - @staticmethod - def add_binary_column_to_obs( - medcat_obj: MedCAT, adata: AnnData, name: str, all_names: list[str], add_cols: list[str] | None - ) -> None: - """Adds a binary column to obs (temporarily) for plotting infos extracted from freetext. - - Indicates whether the specific entity to color by has been found in that row or not. - """ - # only extract affirmed entities - df = EhrapyMedcat._filter_df_by_status(medcat_obj.annotated_results, "Affirmed") - # check whether the name is in the extracted entities to handle possible typos to a certain extend - # currently, only the pretty_name column is supported - # _list_replace(color, colored_column, colored_column_tmp) - if name not in df["pretty_name"].values: - new_name, _ = process.extractOne(query=name, choices=df["pretty_name"].unique(), score_cutoff=50) - if new_name: - print( - f"[bold yellow]Did not find [blue]{name} [yellow]in MedCAT's extracted entities. " - f"Will use best match {new_name}!" - ) - def _list_replace(lst, old: str, new: str): - """replace list elements (inplace)""" - i = -1 - try: - while True: - i = lst.index(old, i + 1) - lst[i] = new - except ValueError: - pass - - _list_replace(all_names, name, new_name) - name = new_name - else: - raise EntitiyNotFoundError( - f"Did not find {name} in MedCAT's extracted entities and could not determine a best matching equivalent." - ) - # add column to additional to remove it later on - if add_cols is not None: - add_cols.append(name) - adata.obs[name] = ( - df.groupby("row_nr").agg({"pretty_name": (lambda x: int(any(x.isin([name]))))}).astype("category") - ) - adata.obs = adata.obs.replace({name: {1.0: "yes", 0.0: "no"}}) - adata.obs[name] = adata.obs[name].fillna("no").astype("category") - - @staticmethod - def _annotated_results_to_df(flattened_results: dict) -> pd.DataFrame: - """Turn the flattened annotated results into a pandas DataFrame and remove duplicates.""" - df = pd.DataFrame.from_dict(flattened_results, orient="index") - # remove duplicate entries; for example when a single entity like a disease is mentioned multiple times without any meaningful context changes - # Example: The patient suffers from Diabetes. Cause of the Diabetes, he receives drug X. - df.drop_duplicates(subset=["cui", "row_nr", "meta_anns"]) - return df - - @staticmethod - def _flatten_annotated_results(annotation_results: dict) -> dict: - """Flattens the nested set (usually 5 level nested) of annotation results. - - annotation_results is just a simple flattened dict with infos on all entities found - """ - flattened_annotated_dict = {} - entry_nr = 0 - - # row numbers where the text column is located in the original data - for row_id in annotation_results.keys(): - # all entities extracted from a given row - entities = annotation_results[row_id]["entities"] - for entity_id in entities.keys(): - # tokens are currently ignored, as they will not appear with the current basic model used by ehrapy from MedCAT - if entity_id != "tokens": - single_entity = {"row_nr": row_id} - entity = entities[entity_id] - # iterate over all info attributes of a single entity found in a specific row - for entity_key in entity.keys(): - if entity_key in ["pretty_name", "cui", "type_ids", "types"]: - single_entity[entity_key] = entities[entity_id][entity_key] - elif entity_key == "meta_anns": - single_entity[entity_key] = entities[entity_id][entity_key]["Status"]["value"] - flattened_annotated_dict[entry_nr] = single_entity - entry_nr += 1 - return flattened_annotated_dict - - @staticmethod - def _format_df_column(df: pd.DataFrame, column_name: str) -> list[tuple[int, str]]: - """Format the df to match: formatted_data = [(row_id, row_text), (row_id, row_text), ...] - as this is required by MedCAT's multiprocessing annotation step - - """ - formatted_data = [] - for id, row in df.iterrows(): - text = row[column_name] - formatted_data.append((id, text)) - return formatted_data - - @staticmethod - def _filter_null_values(df: pd.DataFrame, column: str) -> pd.DataFrame: - """Filter null values of a given column and return that column without the null values""" - return pd.DataFrame(df[column][~df[column].isnull()]) - - @staticmethod - def _filter_df_by_status(df: pd.DataFrame, status: str) -> pd.DataFrame: - """Util function to filter passed dataframe by status.""" - df_res = df - if status != "Both": - if status not in {"Affirmed", "Other"}: - raise StatusNotSupportedError(f"{status} is not available. Please use either Affirmed, Other or Both!") - mask = df["meta_anns"].values == status - df_res = df[mask] - return df_res - - @staticmethod - def _df_to_rich_table(df: pd.DataFrame) -> Table: - """Convert a pandas dataframe to a rich Table""" - table = Table(show_header=True, header_style="bold magenta") - - for column in df.columns: - table.add_column(str(column)) - - for _, value_list in enumerate(df.values.tolist()): - row = [] - row += [str(x) for x in value_list] - table.add_row(*row) - - # Update the style of the table - table.row_styles = ["none", "dim"] - table.box = box.SIMPLE_HEAD - - return table + return adata if copy else None class StatusNotSupportedError(Exception): diff --git a/tests/tools/nlp/test_data_nlp/dataset1.csv b/tests/tools/nlp/test_data_nlp/dataset1.csv new file mode 100644 index 00000000..63ce1384 --- /dev/null +++ b/tests/tools/nlp/test_data_nlp/dataset1.csv @@ -0,0 +1,5 @@ +idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station,text +1,138,78,80,77,A,ICU,"HISTORY OF PRESENT ILLNESS:, The patient is a 71-year-old Caucasian female with a history of diabetes, osteoarthritis, atrial fibrillation, hypertension, asthma, obstructive sleep apnea on CPAP, diabetic foot ulcer, anemia and left lower extremity cellulitis. She was brought in by the EMS service to Erlanger emergency department with pulseless electrical activity. Her husband states that he was at home with his wife, when she presented to him complaining of fever and chills. She became acutely unresponsive. She was noted to have worsening of her breathing. She took several of her MDIs and then was placed on her CPAP. He went to notify EMS and when he returned, she was found to not be breathing. He stated that she was noted to have no breathing in excess of 10 minutes. He states that the EMS system arrived at the home and she was found not breathing. The patient was intubated at the scene and upon arrival to Erlanger Medical Center, she was found to have pupils fixed and dilated. She was seen by me in the emergency department and was on Neo-Synephrine, dopamine with a blood pressure of 97/22 with a rapid heart rate and again, in an unresponsive state.,REVIEW OF SYSTEMS:, Review of systems was not obtainable.,PAST MEDICAL HISTORY:, Diabetes, osteoarthritis, hypertension, asthma, atrial fibrillation, diabetic foot ulcer and anemia.,PAST SURGICAL HISTORY:, Noncontributory to above.,FAMILY HISTORY:, Mother with history of coronary artery disease.,SOCIAL HISTORY:, The patient is married. She uses no ethanol, no tobacco and no illicits. She has a very support family unit.,MEDICATIONS:, Augmentin; Detrol LA; lisinopril.,IMMUNIZATIONS:, Immunizations were up to date for influenza, negative for Pneumovax.,ALLERGIES:, PENICILLIN.,LABORATORY AT PRESENTATION:, White blood cell count 11, hemoglobin 10.5, hematocrit 32.2, platelets 175,000. Sodium 148, potassium 5.2, BUN 30, creatinine 2.2 and glucose 216. PT was 22.4.,RADIOLOGIC DATA:, Chest x-ray revealed a diffuse pulmonary edema.,PHYSICAL EXAMINATION:,VITAL SIGNS: Blood pressure 97/52, pulse of 79, respirations 16, O2 sat 100%.,HEENT: The patient's pupils were again, fixed and dilated and intubated on the monitor.,CHEST: Poor air movement bilateral with bilateral rales.,CARDIOVASCULAR: Regular rate and rhythm.,ABDOMEN: The abdomen was obese, nondistended and nontender.,EXTREMITIES: Left diabetic foot had oozing pus drainage from the foot.,GU: Foley catheter was in place.,IMPRESSION AND PLAN:,1. Acute cardiac arrest with pulseless electrical activity with hypotensive shock and respiratory failure: Will continue ventilator support. Will rule out pulmonary embolus, rule out myocardial infarction. Continue pressors. The patient is currently on dopamine, Neo-Synephrine and Levophed.,2. Acute respiratory distress syndrome: Will continue ventilatory support.,3. Questionable sepsis: Will obtain blood cultures, intravenous vancomycin and Rocephin given.,4. Hypotensive shock: Will continue pressors. Will check random cortisol. Hydrocortisone was added.,Further inpatient management for this patient will be provided by Dr. R. The patient's status was discussed with her daughter and her husband. The husband states that his wife has been very ill in the past with multiple admissions, but he had never seen her as severely ill as with this event. He states that she completely was not breathing at all and he is aware of the severity of her illness and the gravity of her current prognosis. Will obtain the assistance with cardiology with this admission and will continue pressors and supportive therapy. The family will make an assessment and final decision concerning her long-term management after a 24 hour period." +2,139,79,90,76,A,ICU,"HISTORY OF PRESENT ILLNESS: , A 71-year-old female who I am seeing for the first time. She has a history of rheumatoid arthritis for the last 6 years. She was followed by another rheumatologist. She says she has been off and on, on prednisone and Arava. The rheumatologist, as per the patient, would not want her to be on a long-term medicine, so he would give her prednisone and then switch to Arava and then switch her back to prednisone. She says she had been on prednisone for the last 6 to 9 months. She is on 5 mg a day. She recently had a left BKA and there was a question of infection, so it had to be debrided. I was consulted to see if her prednisone is to be continued. The patient denies any joint pains at the present time. She says when this started she had significant joint pains and was unable to walk. She had pain in the hands and feet. Currently, she has no pain in any of her joints.,REVIEW OF SYSTEMS: , Denies photosensitivity, oral or nasal ulcer, seizure, psychosis, and skin rashes.,PAST MEDICAL HISTORY: , Significant for hypertension, peripheral vascular disease, and left BKA.,FAMILY HISTORY: ,Noncontributory.,SOCIAL HISTORY: , Denies tobacco, alcohol or illicit drugs.,PHYSICAL EXAMINATION:,VITAL SIGNS: BP 130/70, heart rate 80, and respiratory rate 14.,HEENT: EOMI. PERRLA.,NECK: Supple. No JVD. No lymphadenopathy.,CHEST: Clear to auscultation.,HEART: S1 and S2. No S3, no murmurs.,ABDOMEN: Soft and nontender. No organomegaly.,EXTREMITIES: No edema.,NEUROLOGIC: Deferred.,ARTICULAR: She has swelling of bilateral wrists, but no significant tenderness.,LABORATORY DATA:, Labs in chart was reviewed.,ASSESSMENT AND PLAN:, A 71-year-old female with a history of rheumatoid arthritis, on longstanding prednisone. She is not on DMARD, but as she recently had a surgery followed by a probable infection, I will hold off on that. As she has no pain, I have decreased the prednisone to 2.5 mg a day starting tomorrow if she is to go back to her nursing home tomorrow. If in a couple of weeks her symptoms stay the same, then I would discontinue the prednisone. I would defer that to Dr. X. If she flares up at that point, prednisone may have to be restarted with a DMARD, so that eventually she could stay off the prednisone. I discussed this at length with the patient and she is in full agreement with the plan. I explained to her that if she is to be discharged, if she wishes, she could follow up with me in clinic or if she goes back to Victoria, then see her rheumatologist over there." +3,140,80,120,60,A,MICU,"HISTORY OF PRESENT ILLNESS:, The patient is a 71-year-old Caucasian female with a history of diabetes, osteoarthritis, atrial fibrillation, hypertension, asthma, obstructive sleep apnea on CPAP, diabetic foot ulcer, anemia and left lower extremity cellulitis. She was brought in by the EMS service to Erlanger emergency department with pulseless electrical activity. Her husband states that he was at home with his wife, when she presented to him complaining of fever and chills. She became acutely unresponsive. She was noted to have worsening of her breathing. She took several of her MDIs and then was placed on her CPAP. He went to notify EMS and when he returned, she was found to not be breathing. He stated that she was noted to have no breathing in excess of 10 minutes. He states that the EMS system arrived at the home and she was found not breathing. The patient was intubated at the scene and upon arrival to Erlanger Medical Center, she was found to have pupils fixed and dilated. She was seen by me in the emergency department and was on Neo-Synephrine, dopamine with a blood pressure of 97/22 with a rapid heart rate and again, in an unresponsive state.,REVIEW OF SYSTEMS:, Review of systems was not obtainable.,PAST MEDICAL HISTORY:, Diabetes, osteoarthritis, hypertension, asthma, atrial fibrillation, diabetic foot ulcer and anemia.,PAST SURGICAL HISTORY:, Noncontributory to above.,FAMILY HISTORY:, Mother with history of coronary artery disease.,SOCIAL HISTORY:, The patient is married. She uses no ethanol, no tobacco and no illicits. She has a very support family unit.,MEDICATIONS:, Augmentin; Detrol LA; lisinopril.,IMMUNIZATIONS:, Immunizations were up to date for influenza, negative for Pneumovax.,ALLERGIES:, PENICILLIN.,LABORATORY AT PRESENTATION:, White blood cell count 11, hemoglobin 10.5, hematocrit 32.2, platelets 175,000. Sodium 148, potassium 5.2, BUN 30, creatinine 2.2 and glucose 216. PT was 22.4.,RADIOLOGIC DATA:, Chest x-ray revealed a diffuse pulmonary edema.,PHYSICAL EXAMINATION:,VITAL SIGNS: Blood pressure 97/52, pulse of 79, respirations 16, O2 sat 100%.,HEENT: The patient's pupils were again, fixed and dilated and intubated on the monitor.,CHEST: Poor air movement bilateral with bilateral rales.,CARDIOVASCULAR: Regular rate and rhythm.,ABDOMEN: The abdomen was obese, nondistended and nontender.,EXTREMITIES: Left diabetic foot had oozing pus drainage from the foot.,GU: Foley catheter was in place.,IMPRESSION AND PLAN:,1. Acute cardiac arrest with pulseless electrical activity with hypotensive shock and respiratory failure: Will continue ventilator support. Will rule out pulmonary embolus, rule out myocardial infarction. Continue pressors. The patient is currently on dopamine, Neo-Synephrine and Levophed.,2. Acute respiratory distress syndrome: Will continue ventilatory support.,3. Questionable sepsis: Will obtain blood cultures, intravenous vancomycin and Rocephin given.,4. Hypotensive shock: Will continue pressors. Will check random cortisol. Hydrocortisone was added.,Further inpatient management for this patient will be provided by Dr. R. The patient's status was discussed with her daughter and her husband. The husband states that his wife has been very ill in the past with multiple admissions, but he had never seen her as severely ill as with this event. He states that she completely was not breathing at all and he is aware of the severity of her illness and the gravity of her current prognosis. Will obtain the assistance with cardiology with this admission and will continue pressors and supportive therapy. The family will make an assessment and final decision concerning her long-term management after a 24 hour period." +4,141,81,130,90,A,MICU,"CHIEF COMPLAINT:,1. Infection.,2. Pelvic pain.,3. Mood swings.,4. Painful sex.,HISTORY OF PRESENT ILLNESS:, The patient is a 29-year-old female who is here today with the above-noted complaints. She states that she has been having a lot of swelling and infection in her inner thigh area with the folliculitis she has had in the past. She is requesting antibiotics. She has been squeezing them and some of them are very bruised and irritated. She also states that she is having significant pelvic pain and would like to go back and see Dr. XYZ again. She also states that she took herself off of lithium, but she has been having significant mood swings, anger outbursts and not dealing with the situation well at all. She also has had some psychiatric evaluation, but she states that she did not feel like herself on the medication, so she took herself off. She states she does not wish to be on any medication at the current time. She otherwise states that sex is so painful that she is unable to have sex with her husband, even though she ""wants to."",PAST MEDICAL HISTORY:, Significant for cleft palate.,ALLERGIES:, She is allergic to Lortab.,CURRENT MEDICATIONS:, None.,REVIEW OF SYSTEMS:, Please see history of present illness.,Psychiatric: She has had some suicidal thoughts, but no plans. She denies being suicidal at the current time.,Cardiopulmonary: She has not had any chest pain or shortness of breath.,GI: Denies any nausea or vomiting.,Neurological: No numbness, weakness or tingling.,PHYSICAL EXAMINATION:,General: The patient is a well-developed, well-nourished, 29-year-old female who is in no acute distress.,Vital signs: Weight: 160 pounds. Blood pressure: 100/60. Pulse: 62.,Psychiatric: I did spend over 25 minutes face-to-face with the patient talking about the situation she was in and the medication and her discontinuing use of that.,Extremities: Her inner thighs are covered with multiple areas of folliculitis and mild abscesses. They are bruised from her squeezing them. We talked about that in detail.,ASSESSMENT:,1. Folliculitis.,2. Pelvic pain.,3. Mood swings.,4. Dyspareunia.,PLAN:,1. I would like her to go to the lab and get a CBC, chem-12, TSH and UA.,2. We will put her on cephalexin 500 mg three times a day.,3. We will send her back to see Dr. XYZ regarding the pelvic pain per her request.,4. We will get her an appointment with a psychiatrist for evaluation and treatment.,5. She is to call if she has any further problems or concerns. Otherwise I will see her back for her routine care or sooner if there are any further issues." diff --git a/tests/tools/nlp/test_data_nlp/medcat_annotations1.csv b/tests/tools/nlp/test_data_nlp/medcat_annotations1.csv new file mode 100644 index 00000000..3e06d8e9 --- /dev/null +++ b/tests/tools/nlp/test_data_nlp/medcat_annotations1.csv @@ -0,0 +1,39 @@ +,row_nr,pretty_name,cui,type_ids,types,meta_anns +0,0,Diabetes,C0011847,['T047'],['Disease or Syndrome'],Affirmed +1,0,Sepsis,C0243026,['T047'],['Disease or Syndrome'],Other +2,0,"Respiratory Distress Syndrome, Adult",C0035222,['T047'],['Disease or Syndrome'],Affirmed +3,0,Myocardial Infarction,C0027051,['T047'],['Disease or Syndrome'],Other +4,0,Pulmonary Embolism,C0034065,['T047'],['Disease or Syndrome'],Other +5,0,Respiratory Failure,C1145670,['T047'],['Disease or Syndrome'],Affirmed +6,0,Cardiac Arrest,C0018790,['T047'],['Disease or Syndrome'],Affirmed +7,0,Coronary Arteriosclerosis,C0010054,['T047'],['Disease or Syndrome'],Affirmed +8,0,Diabetic foot ulcer,C1456868,['T047'],['Disease or Syndrome'],Affirmed +9,0,"Sleep Apnea, Obstructive",C0520679,['T047'],['Disease or Syndrome'],Affirmed +10,0,Asthma,C0004096,['T047'],['Disease or Syndrome'],Affirmed +11,0,Hypertensive disease,C0020538,['T047'],['Disease or Syndrome'],Affirmed +12,0,Atrial Fibrillation,C0004238,['T047'],['Disease or Syndrome'],Affirmed +13,0,Degenerative polyarthritis,C0029408,['T047'],['Disease or Syndrome'],Affirmed +14,0,Anemia,C0002871,['T047'],['Disease or Syndrome'],Affirmed +15,1,Peripheral Vascular Diseases,C0085096,['T047'],['Disease or Syndrome'],Affirmed +16,1,Psychotic Disorders,C0033975,['T048'],['Mental or Behavioral Dysfunction'],Affirmed +17,1,Hypertensive disease,C0020538,['T047'],['Disease or Syndrome'],Affirmed +18,1,Rheumatoid Arthritis,C0003873,['T047'],['Disease or Syndrome'],Affirmed +19,1,Ulcer,C0041582,['T047'],['Disease or Syndrome'],Affirmed +20,2,Anemia,C0002871,['T047'],['Disease or Syndrome'],Affirmed +21,2,Myocardial Infarction,C0027051,['T047'],['Disease or Syndrome'],Other +22,2,Pulmonary Embolism,C0034065,['T047'],['Disease or Syndrome'],Other +23,2,Respiratory Failure,C1145670,['T047'],['Disease or Syndrome'],Affirmed +24,2,Cardiac Arrest,C0018790,['T047'],['Disease or Syndrome'],Affirmed +25,2,Coronary Arteriosclerosis,C0010054,['T047'],['Disease or Syndrome'],Affirmed +26,2,Diabetic foot ulcer,C1456868,['T047'],['Disease or Syndrome'],Affirmed +27,2,"Sleep Apnea, Obstructive",C0520679,['T047'],['Disease or Syndrome'],Affirmed +28,2,Asthma,C0004096,['T047'],['Disease or Syndrome'],Affirmed +29,2,Hypertensive disease,C0020538,['T047'],['Disease or Syndrome'],Affirmed +30,2,Atrial Fibrillation,C0004238,['T047'],['Disease or Syndrome'],Affirmed +31,2,Degenerative polyarthritis,C0029408,['T047'],['Disease or Syndrome'],Affirmed +32,2,Diabetes,C0011847,['T047'],['Disease or Syndrome'],Affirmed +33,2,"Respiratory Distress Syndrome, Adult",C0035222,['T047'],['Disease or Syndrome'],Affirmed +34,2,Sepsis,C0243026,['T047'],['Disease or Syndrome'],Other +35,3,Mood swings,C0085633,['T048'],['Mental or Behavioral Dysfunction'],Other +36,3,Mood swings,C0085633,['T048'],['Mental or Behavioral Dysfunction'],Affirmed +37,3,Dyspareunia (female),C0013394,['T047'],['Disease or Syndrome'],Affirmed diff --git a/tests/tools/nlp/test_medcat.py b/tests/tools/nlp/test_medcat.py index 11eace26..4c244103 100644 --- a/tests/tools/nlp/test_medcat.py +++ b/tests/tools/nlp/test_medcat.py @@ -4,24 +4,17 @@ import pandas as pd from anndata import AnnData +import ehrapy as ep + CURRENT_DIR = Path(__file__).parent _TEST_PATH = f"{CURRENT_DIR}/test_data_nlp" class TestMedCAT: - def setup_method(self): - obs_data = { - "Krankheit": ["Krebs", "Tumor"], - "Land": ["Deutschland", "Schweiz"], - "Geschlecht": ["männlich", "weiblich"], - } - var_data = { - "Krankheit": ["Krebs", "Tumor", "Krebs"], - "Land": ["Deutschland", "Schweiz", "Österreich"], - "Geschlecht": ["männlich", "weiblich", "männlich"], - } - self.test_adata = AnnData( - X=np.array([["Deutschland", "Zöliakie", "Tumor"], ["Frankreich", "Allergie", "Krebs"]], np.dtype(object)), - obs=pd.DataFrame(data=obs_data), - var=pd.DataFrame(data=var_data, index=["Land", "Prädisposition", "Krankheit"]), - ) + def test_add_medcat_annotation_to_obs(self): + # created manually a small dataset with annotations to use here + adata = ep.io.read_csv(f"{_TEST_PATH}/dataset1.csv") + adata.uns["medcat_annotations"] = pd.read_csv(f"{_TEST_PATH}/medcat_annotations1.csv") + + ep.tl.add_medcat_annotation_to_obs(adata, name="Diabetes") + assert "Diabetes" in adata.obs.columns