From 02f56c0bda9b0ecc58e05c71a08d2f42197c00b8 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 26 Jun 2024 17:10:01 +0100 Subject: [PATCH 01/19] rename expandCoding --- fhirflat/fhir2flat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fhirflat/fhir2flat.py b/fhirflat/fhir2flat.py index 8b83d89..83aa1d4 100644 --- a/fhirflat/fhir2flat.py +++ b/fhirflat/fhir2flat.py @@ -115,14 +115,14 @@ def single_or_list(x): return df.groupby(df.index).agg(single_or_list) -def expandCoding(df: pd.DataFrame, column_name: str) -> pd.DataFrame: +def condenseCoding(df: pd.DataFrame, column_name: str) -> pd.DataFrame: """ Turns a column containing a list of dictionaries with coding information into 2 columns containing a list of strings with the coding information, and the text. [ {"system": "http://loinc.org", "code": "1234", "display": "Test"} ] becomes - [ "http://loinc.org/1234" ], ["Test"] + [ "http://loinc.org|1234" ], ["Test"] If a "text" field has already been provided, this overrides the display. """ @@ -291,7 +291,7 @@ def fhir2flat(resource: FHIRFlatBase, lists: list | None = None) -> pd.DataFrame # expand all instances of the "coding" list for coding in df.columns[df.columns.str.endswith("coding")]: - df = expandCoding(df, coding) + df = condenseCoding(df, coding) # condense all references for reference in df.columns[df.columns.str.endswith("reference")]: From 746209eb9dca8a581555d332feb4a0301285ce19 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Wed, 26 Jun 2024 17:13:47 +0100 Subject: [PATCH 02/19] Split out methods in comments --- fhirflat/ingest.py | 10 +++ fhirflat/resources/base.py | 141 ++++++++++++++++++++++++++++++++++++- fhirflat/util.py | 61 ++++++++++++++++ 3 files changed, 211 insertions(+), 1 deletion(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 66bf683..470697d 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -518,6 +518,16 @@ def convert_data_to_flat( os.path.join(folder_name, resource.__name__.lower()), ) + # flat_nonvalidated = resource.ingest_to_flat( + # df, + # # os.path.join(folder_name, resource.__name__.lower()), + # ) + + # valid_flat, errors = resource.validate_flat(flat_nonvalidated) + # valid_flat.to_parquet( + # f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" + # ) + end_time = timeit.default_timer() total_time = end_time - start_time print( diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index b1d3f83..ab08048 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -1,4 +1,3 @@ -# from pydantic import BaseModel from __future__ import annotations import datetime @@ -86,6 +85,56 @@ def create_fhir_resource( except ValidationError as e: return e + # @classmethod + # def validate_fhirflat( + # cls, flat_df: pd.DataFrame + # ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: + # """ + # Takes a FHIRflat dataframe and converts the data into a list of populated + # FHIR resources. Returns a pandas Series of valid resources and a dataframe of + # the FHIRflat data that produced validation errors, with a `validation_error` + # column describing the error. + # If a single resource is found, it is returned as a single FHIR resource or + # raises a ValidationError. + + # Parameters + # ---------- + # flat_df: pd.DataFrame + # Pandas dataframe containing the FHIRflat data + + # Returns + # ------- + # resources: FHIRFlatBase or list[FHIRFlatBase] + # A list of populated FHIR resources + # errors: pd.Series or None + # A dataframe containing the flat_dict and validation errors. + # """ + # flat_df["fhir"] = flat_df.apply( + # lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 + # ).apply(lambda x: cls.create_fhir_resource(x)) + + # if len(flat_df) == 1: + # resource = flat_df["fhir"].iloc[0] + # if isinstance(resource, ValidationError): + # raise resource + # else: + # return resource, None + # else: + # resources = list(flat_df["fhir"]) + # errors = None + # if any(isinstance(r, ValidationError) for r in resources): + # validation_error_mask = flat_df["fhir"].apply( + # lambda x: isinstance(x, ValidationError) + # ) + + # errors = flat_df[validation_error_mask].copy() + # errors.rename(columns={"fhir": "validation_error"}, inplace=True) + + # valid_fhir = flat_df[~validation_error_mask] + # resources = valid_fhir["fhir"] + + # return resources, errors + @classmethod def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: """ @@ -254,6 +303,96 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str) -> pd.DataFrame | Non data_errors.rename(columns={"fhir": "validation_error"}, inplace=True) return data_errors if not data_errors.empty else None + # @classmethod + # def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: + # """ + # Takes a pandas dataframe containg the populated mapping file and a dictionary + # representing the FHIRflat resource and creates the FHIRflat parquet file. + # Performs data formatting on the date and coding columns to account for + # simplifications parquet makes when saving. + + # Parameters + # ---------- + # data: pd.DataFrame + # Pandas dataframe containing the data + # filename: str + # Name of the parquet file to be generated. + + # Returns + # ------- + # pd.DataFrame or None + # A dataframe containing the FHIRflat data. + # """ + + # data.loc[:, "flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) + + # flat_df = pd.json_normalize(data["flat_dict"]) + + # if not flat_df.empty: + # # apply the coding column formatting in here + # system_columns = flat_df.columns[flat_df.columns.str.endswith(".system")] + # for coding_col in system_columns: + # col = coding_col.removesuffix(".system") + # flat_df = flat_df.apply(lambda x: condense_codes(x, col), axis=1) + # flat_df.drop(columns=system_columns, inplace=True) + + # # find and create dense columns - not working + # list_columns = flat_df.map(lambda x: isinstance(x, list)) + # list_lengths = [len(flat_df[x][0]) for x in list_cols] + # long_list_cols = [ + # x for x, y in zip(list_cols, list_lengths, strict=True) if y > 1 + # ] + + # if long_list_cols: + # flat_df.rename( + # columns={x: x + "_dense" for x in long_list_cols}, inplace=True + # ) + + # # format dates and columns + # flat_df = format_flat(flat_df) + + # # flat_df.to_parquet(f"{filename}.parquet") + # return flat_df + # return None + + # @classmethod + # def validate_flat(cls, flat_df: pd.DataFrame) + # -> tuple[pd.DataFrame, pd.DataFrame]: + # """ + # Takes a FHIRflat dataframe and validates the data against the FHIR + # schema. Returns a dataframe of valid resources and a dataframe of the + # FHIRflat data that produced validation errors, with a `validation_error` + # column describing the error. + + # Parameters + # ---------- + # flat_df: pd.DataFrame + # Pandas dataframe containing the FHIRflat data + + # Returns + # ------- + # valid_resources: pd.DataFrame + # A dataframe containing the valid FHIR resources + # errors: pd.DataFrame + # A dataframe containing the flat_dict and validation errors. + # """ + + # flat_df["fhir"] = flat_df.apply(lambda row: row.to_json(), axis=1).apply( + # lambda x: cls.create_fhir_resource(x) + # ) + + # validation_error_mask = flat_df["fhir"].apply( + # lambda x: isinstance(x, ValidationError) + # ) + + # errors = flat_df[validation_error_mask].copy() + # errors.rename(columns={"fhir": "validation_error"}, inplace=True) + + # valid_fhir = flat_df[~validation_error_mask] + # valid_fhir = valid_fhir.drop(columns=["fhir"]) + + # return valid_fhir, errors + @classmethod def fhir_bulk_import(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: """ diff --git a/fhirflat/util.py b/fhirflat/util.py index 32a5423..cdc6763 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -1,10 +1,12 @@ # Utility functions for FHIRflat +import datetime import importlib import re from collections.abc import KeysView from itertools import groupby import fhir.resources +import numpy as np import fhirflat from fhirflat.resources import extensions @@ -70,3 +72,62 @@ def get_local_extension_type(t: str): def get_local_resource(t: str): return getattr(fhirflat, t) + + +def format_flat(flat_df): + """ + Performs formatting ondates/lists in FHIRflat resources. + """ + + for date_cols in [ + x + for x in flat_df.columns + if ("date" in x.lower() or "period" in x.lower() or "time" in x.lower()) + ]: + # replace nan with None + flat_df[date_cols] = flat_df[date_cols].replace(np.nan, None) + + # convert datetime objects to ISO strings + # (stops unwanted parquet conversions) + # but skips over extensions that have floats/strings rather than dates + flat_df[date_cols] = flat_df[date_cols].apply( + lambda x: ( + x.isoformat() + if isinstance(x, datetime.datetime) or isinstance(x, datetime.date) + else x + ) + ) + + for coding_column in [ + x + for x in flat_df.columns + if (x.lower().endswith(".code") or x.lower().endswith(".text")) + and "Quantity" not in x + ]: + flat_df[coding_column] = flat_df[coding_column].apply( + lambda x: [x] if isinstance(x, str) else x + ) + + return flat_df + + +def condense_codes(row, code_col): + raw_codes = row[(code_col + ".code")] + if isinstance(raw_codes, (str, float)) and raw_codes == raw_codes: + formatted_code = ( + raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) + ) + codes = row[code_col + ".system"] + "|" + formatted_code + elif np.isnan(raw_codes) or raw_codes is None: + codes = None + else: + formatted_codes = [ + c if (isinstance(c, str) or c is None) else str(int(c)) for c in raw_codes + ] + codes = [ + s + "|" + c + for s, c in zip(row[code_col + ".system"], formatted_codes, strict=True) + ] + + row[code_col + ".code"] = codes + return row From 49ae33612f8e5ac5d050adae5974fae75d2d4751 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 30 Jul 2024 11:27:34 +0100 Subject: [PATCH 03/19] finish renaming condenseCoding --- fhirflat/fhir2flat.py | 4 ++-- fhirflat/flat2fhir.py | 1 + tests/test_fhir2flat_units.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fhirflat/fhir2flat.py b/fhirflat/fhir2flat.py index 83aa1d4..b8eb5f1 100644 --- a/fhirflat/fhir2flat.py +++ b/fhirflat/fhir2flat.py @@ -127,7 +127,7 @@ def condenseCoding(df: pd.DataFrame, column_name: str) -> pd.DataFrame: If a "text" field has already been provided, this overrides the display. """ - def expand( + def condense( row: pd.Series, column_name: str, text_present: bool = False ) -> pd.Series: codes = row[column_name] @@ -148,7 +148,7 @@ def expand( if column_name.removesuffix(".coding") + ".text" in df.columns: text_present = True - df = df.apply(lambda x: expand(x, column_name, text_present), axis=1) + df = df.apply(lambda x: condense(x, column_name, text_present), axis=1) if not text_present: df.insert( diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 1eb85e0..6930dd5 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -216,6 +216,7 @@ def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) -> Combines columns containing flattened FHIR concepts back into JSON-like structures. """ + groups = group_keys(data.keys()) group_classes = {} diff --git a/tests/test_fhir2flat_units.py b/tests/test_fhir2flat_units.py index 2ca5e38..61ff6b4 100644 --- a/tests/test_fhir2flat_units.py +++ b/tests/test_fhir2flat_units.py @@ -118,12 +118,12 @@ def test_explode_and_flatten_no_multiples(data_lists, expected): ), ], ) -def test_expandCoding(data, expected): +def test_condenseCoding(data, expected): # Create a mock DataFrame df = pd.DataFrame(data) # Call the function - result = f2f.expandCoding(df, "code.coding") + result = f2f.condenseCoding(df, "code.coding") # Check the result expected = pd.DataFrame(expected) From 1c9ff596d2410547edf686caddd591d99d36c960 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 30 Jul 2024 14:33:58 +0100 Subject: [PATCH 04/19] Seperate ingestion into ingest & validate. Correct some of the list/non list codes in tests --- fhirflat/flat2fhir.py | 44 +---- fhirflat/ingest.py | 24 +-- fhirflat/resources/base.py | 275 ++++++++++--------------------- fhirflat/util.py | 71 +++++++- tests/test_condition_resource.py | 37 ++--- tests/test_ingest.py | 167 ++++++++++--------- 6 files changed, 270 insertions(+), 348 deletions(-) diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 6930dd5..b201bf1 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -6,14 +6,9 @@ from fhir.resources.fhirprimitiveextension import FHIRPrimitiveExtension from fhir.resources.period import Period from fhir.resources.quantity import Quantity -from pydantic.v1 import BaseModel from pydantic.v1.error_wrappers import ValidationError -from .util import ( - get_fhirtype, - get_local_extension_type, - group_keys, -) +from .util import find_data_class, get_fhirtype, get_local_extension_type, group_keys def create_codeable_concept( @@ -174,43 +169,6 @@ def set_datatypes(k, v_dict, klass) -> dict: return {s.split(".", 1)[1]: v_dict[s] for s in v_dict} -def find_data_class(data_class: list[BaseModel] | BaseModel, k: str) -> BaseModel: - """ - Finds the type class for item k within the data class. - - Parameters - ---------- - data_class: list[BaseModel] or BaseModel - The data class to search within. If a list, the function will search for the - a class with a matching title to k. - k: str - The property to search for within the data class - """ - - if isinstance(data_class, list): - title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class] - result = [x for x, y in zip(data_class, title_matches, strict=True) if y] - if len(result) == 1: - return get_fhirtype(k) - else: - raise ValueError(f"Couldn't find a matching class for {k} in {data_class}") - - else: - k_schema = data_class.schema()["properties"].get(k) - - base_class = ( - k_schema.get("items").get("type") - if k_schema.get("items") is not None - else k_schema.get("type") - ) - - if base_class is None: - assert k_schema.get("type") == "array" - - base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]] - return get_fhirtype(base_class) - - def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) -> dict: """ Combines columns containing flattened FHIR concepts back into diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 470697d..41d41ec 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -431,6 +431,7 @@ def convert_data_to_flat( mapping_files_types: tuple[dict, dict] | None = None, sheet_id: str | None = None, subject_id="subjid", + validate: bool = True, ): """ Takes raw clinical data (currently assumed to be a one-row-per-patient format like @@ -513,20 +514,19 @@ def convert_data_to_flat( else: raise ValueError(f"Unknown mapping type {t}") - errors = resource.ingest_to_flat( - df, - os.path.join(folder_name, resource.__name__.lower()), - ) + flat_nonvalidated = resource.ingest_to_flat(df) - # flat_nonvalidated = resource.ingest_to_flat( - # df, - # # os.path.join(folder_name, resource.__name__.lower()), - # ) + if validate: + valid_flat, errors = resource.validate_fhirflat(flat_nonvalidated) - # valid_flat, errors = resource.validate_flat(flat_nonvalidated) - # valid_flat.to_parquet( - # f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" - # ) + valid_flat.to_parquet( + f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" + ) + else: + errors = None + flat_nonvalidated.to_parquet( + f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" + ) end_time = timeit.default_timer() total_time = end_time - start_time diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index ab08048..7aabfcc 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -1,15 +1,14 @@ from __future__ import annotations -import datetime import warnings from typing import ClassVar, TypeAlias -import numpy as np import orjson import pandas as pd from fhir.resources.domainresource import DomainResource as _DomainResource from pydantic.v1 import ValidationError +from fhirflat import util from fhirflat.fhir2flat import fhir2flat from fhirflat.flat2fhir import expand_concepts @@ -71,6 +70,8 @@ def create_fhir_resource( if not isinstance(data, dict): data: dict = orjson.loads(data) + data = {k: v for k, v in data.items() if v is not None} + data = cls.cleanup(data) data = expand_concepts(data, cls) @@ -85,55 +86,58 @@ def create_fhir_resource( except ValidationError as e: return e - # @classmethod - # def validate_fhirflat( - # cls, flat_df: pd.DataFrame - # ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: - # """ - # Takes a FHIRflat dataframe and converts the data into a list of populated - # FHIR resources. Returns a pandas Series of valid resources and a dataframe of - # the FHIRflat data that produced validation errors, with a `validation_error` - # column describing the error. - # If a single resource is found, it is returned as a single FHIR resource or - # raises a ValidationError. - - # Parameters - # ---------- - # flat_df: pd.DataFrame - # Pandas dataframe containing the FHIRflat data - - # Returns - # ------- - # resources: FHIRFlatBase or list[FHIRFlatBase] - # A list of populated FHIR resources - # errors: pd.Series or None - # A dataframe containing the flat_dict and validation errors. - # """ - # flat_df["fhir"] = flat_df.apply( - # lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 - # ).apply(lambda x: cls.create_fhir_resource(x)) - - # if len(flat_df) == 1: - # resource = flat_df["fhir"].iloc[0] - # if isinstance(resource, ValidationError): - # raise resource - # else: - # return resource, None - # else: - # resources = list(flat_df["fhir"]) - # errors = None - # if any(isinstance(r, ValidationError) for r in resources): - # validation_error_mask = flat_df["fhir"].apply( - # lambda x: isinstance(x, ValidationError) - # ) - - # errors = flat_df[validation_error_mask].copy() - # errors.rename(columns={"fhir": "validation_error"}, inplace=True) - - # valid_fhir = flat_df[~validation_error_mask] - # resources = valid_fhir["fhir"] - - # return resources, errors + @classmethod + def validate_fhirflat( + cls, df: pd.DataFrame + ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: + """ + Takes a FHIRflat dataframe and validates the data against the FHIR + schema. Returns a dataframe of valid resources and a dataframe of the + FHIRflat data that produced validation errors, with a `validation_error` + column describing the error. + + Parameters + ---------- + df: pd.DataFrame + Pandas dataframe containing the FHIRflat data + + Returns + ------- + valid_resources: pd.DataFrame + A dataframe containing the valid FHIR resources + errors: pd.DataFrame + A dataframe containing the flat_dict and validation errors. + """ + + flat_df = df.copy() + + flat_df["fhir"] = flat_df.apply( + lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 + ).apply(lambda x: cls.create_fhir_resource(x)) + + if len(flat_df) == 1: + resource = flat_df["fhir"].iloc[0] + if isinstance(resource, ValidationError): + raise resource + else: + return resource, None + else: + resources = list(flat_df["fhir"]) + errors = None + if any(isinstance(r, ValidationError) for r in resources): + validation_error_mask = flat_df["fhir"].apply( + lambda x: isinstance(x, ValidationError) + ) + + errors = flat_df[validation_error_mask].copy() + errors.rename(columns={"fhir": "validation_error"}, inplace=True) + + valid_fhir = flat_df[~validation_error_mask] + valid_fhirflat = valid_fhir.drop(columns=["fhir"]) + else: + valid_fhirflat = flat_df.drop(columns=["fhir"]) + + return valid_fhirflat, errors @classmethod def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: @@ -235,10 +239,12 @@ def fhir_format(row: pd.Series) -> pd.Series: return condensed_mapped_data @classmethod - def ingest_to_flat(cls, data: pd.DataFrame, filename: str) -> pd.DataFrame | None: + def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: """ - Takes a pandas dataframe and populates the resource with the data. - Creates a FHIRflat parquet file for the resources. + Takes a pandas dataframe containing the populated mapping file and a dictionary + representing the FHIRflat resource and creates the FHIRflat parquet file. + Performs data formatting on the date and coding columns to account for + simplifications parquet makes when saving. Parameters ---------- @@ -250,148 +256,43 @@ def ingest_to_flat(cls, data: pd.DataFrame, filename: str) -> pd.DataFrame | Non Returns ------- pd.DataFrame or None - A dataframe containing the flat_dict and validation errors. + A dataframe containing the FHIRflat data. """ data.loc[:, "flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) - # Creates a columns of FHIR resource instances - data["fhir"] = data["flat_dict"].apply(lambda x: cls.create_fhir_resource(x)) - - validation_error_mask = data["fhir"].apply( - lambda x: isinstance(x, ValidationError) - ) - - valid_fhir = data[~validation_error_mask].copy() - - # flattens resources back out - flat_df = valid_fhir["fhir"].apply(lambda x: x.to_flat()) + flat_df = pd.json_normalize(data["flat_dict"]) if not flat_df.empty: - # create FHIR expected date format - for date_cols in [ - x - for x in flat_df.columns - if ("date" in x.lower() or "period" in x.lower() or "time" in x.lower()) - ]: - # replace nan with None - flat_df[date_cols] = flat_df[date_cols].replace(np.nan, None) - - # convert datetime objects to ISO strings - # (stops unwanted parquet conversions) - # but skips over extensions that have floats/strings rather than dates - flat_df[date_cols] = flat_df[date_cols].apply( - lambda x: ( - x.isoformat() - if isinstance(x, datetime.datetime) - or isinstance(x, datetime.date) - else x - ) + # apply the coding column formatting in here + system_columns = flat_df.columns[flat_df.columns.str.endswith(".system")] + for coding_col in system_columns: + col = coding_col.removesuffix(".system") + flat_df = flat_df.apply( + lambda x, c=col: util.condense_codes(x, c), axis=1 ) - - for coding_column in [ - x - for x in flat_df.columns - if x.lower().endswith(".code") or x.lower().endswith(".text") - ]: - flat_df[coding_column] = flat_df[coding_column].apply( - lambda x: [x] if isinstance(x, str) else x + flat_df.drop(columns=system_columns, inplace=True) + + list_cols = [ + col + for col in flat_df.columns + if flat_df[col].apply(lambda x: isinstance(x, list)).any() + ] + list_lengths = [len(flat_df[x].dropna().iloc[0]) for x in list_cols] + long_list_cols = [ + x for x, y in zip(list_cols, list_lengths, strict=True) if y > 1 + ] + + if long_list_cols: + flat_df.rename( + columns={x: x + "_dense" for x in long_list_cols}, inplace=True ) - flat_df.to_parquet(f"{filename}.parquet") - data_errors = data[validation_error_mask].copy() - data_errors.rename(columns={"fhir": "validation_error"}, inplace=True) - return data_errors if not data_errors.empty else None - - # @classmethod - # def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: - # """ - # Takes a pandas dataframe containg the populated mapping file and a dictionary - # representing the FHIRflat resource and creates the FHIRflat parquet file. - # Performs data formatting on the date and coding columns to account for - # simplifications parquet makes when saving. - - # Parameters - # ---------- - # data: pd.DataFrame - # Pandas dataframe containing the data - # filename: str - # Name of the parquet file to be generated. - - # Returns - # ------- - # pd.DataFrame or None - # A dataframe containing the FHIRflat data. - # """ - - # data.loc[:, "flat_dict"] = cls.ingest_backbone_elements(data["flat_dict"]) - - # flat_df = pd.json_normalize(data["flat_dict"]) - - # if not flat_df.empty: - # # apply the coding column formatting in here - # system_columns = flat_df.columns[flat_df.columns.str.endswith(".system")] - # for coding_col in system_columns: - # col = coding_col.removesuffix(".system") - # flat_df = flat_df.apply(lambda x: condense_codes(x, col), axis=1) - # flat_df.drop(columns=system_columns, inplace=True) - - # # find and create dense columns - not working - # list_columns = flat_df.map(lambda x: isinstance(x, list)) - # list_lengths = [len(flat_df[x][0]) for x in list_cols] - # long_list_cols = [ - # x for x, y in zip(list_cols, list_lengths, strict=True) if y > 1 - # ] - - # if long_list_cols: - # flat_df.rename( - # columns={x: x + "_dense" for x in long_list_cols}, inplace=True - # ) - - # # format dates and columns - # flat_df = format_flat(flat_df) - - # # flat_df.to_parquet(f"{filename}.parquet") - # return flat_df - # return None - - # @classmethod - # def validate_flat(cls, flat_df: pd.DataFrame) - # -> tuple[pd.DataFrame, pd.DataFrame]: - # """ - # Takes a FHIRflat dataframe and validates the data against the FHIR - # schema. Returns a dataframe of valid resources and a dataframe of the - # FHIRflat data that produced validation errors, with a `validation_error` - # column describing the error. - - # Parameters - # ---------- - # flat_df: pd.DataFrame - # Pandas dataframe containing the FHIRflat data - - # Returns - # ------- - # valid_resources: pd.DataFrame - # A dataframe containing the valid FHIR resources - # errors: pd.DataFrame - # A dataframe containing the flat_dict and validation errors. - # """ - - # flat_df["fhir"] = flat_df.apply(lambda row: row.to_json(), axis=1).apply( - # lambda x: cls.create_fhir_resource(x) - # ) - - # validation_error_mask = flat_df["fhir"].apply( - # lambda x: isinstance(x, ValidationError) - # ) - - # errors = flat_df[validation_error_mask].copy() - # errors.rename(columns={"fhir": "validation_error"}, inplace=True) - - # valid_fhir = flat_df[~validation_error_mask] - # valid_fhir = valid_fhir.drop(columns=["fhir"]) - - # return valid_fhir, errors + # format dates and columns + flat_df = util.format_flat(flat_df, cls) + + return flat_df + return None @classmethod def fhir_bulk_import(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: diff --git a/fhirflat/util.py b/fhirflat/util.py index cdc6763..f778bde 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -74,9 +74,68 @@ def get_local_resource(t: str): return getattr(fhirflat, t) -def format_flat(flat_df): +def find_data_class(data_class, k): """ - Performs formatting ondates/lists in FHIRflat resources. + Finds the type class for item k within the data class. + + Parameters + ---------- + data_class: list[BaseModel] or BaseModel + The data class to search within. If a list, the function will search for the + a class with a matching title to k. + k: str + The property to search for within the data class + """ + + if isinstance(data_class, list): + title_matches = [k.lower() == c.schema()["title"].lower() for c in data_class] + result = [x for x, y in zip(data_class, title_matches, strict=True) if y] + if len(result) == 1: + return get_fhirtype(k) + else: + raise ValueError(f"Couldn't find a matching class for {k} in {data_class}") + + else: + k_schema = data_class.schema()["properties"].get(k) + + base_class = ( + k_schema.get("items").get("type") + if k_schema.get("items") is not None + else k_schema.get("type") + ) + + if base_class is None: + assert k_schema.get("type") == "array" + + base_class = [opt.get("type") for opt in k_schema["items"]["anyOf"]] + return get_fhirtype(base_class) + + +def code_or_codeable_concept(col_name, resource): + search_terms = col_name.split(".") + fhir_type = find_data_class(resource, search_terms[0]) + + if isinstance(fhir_type, list): + return code_or_codeable_concept(".".join(search_terms[1:]), fhir_type) + + if len(search_terms) == 2: # e.g. "code.code", "age.code" + schema = fhir_type.schema()["properties"] + codeable_concepts = [ + key + for key in schema.keys() + if "codeableconcept" in key.lower() or "coding" in key.lower() + ] + if codeable_concepts: + return True + else: + return False + else: + return code_or_codeable_concept(".".join(search_terms[1:]), fhir_type) + + +def format_flat(flat_df, resource): + """ + Performs formatting on dates/lists in FHIRflat resources. """ for date_cols in [ @@ -101,8 +160,10 @@ def format_flat(flat_df): for coding_column in [ x for x in flat_df.columns - if (x.lower().endswith(".code") or x.lower().endswith(".text")) - and "Quantity" not in x + if ( + (x.lower().endswith(".code") or x.lower().endswith(".text")) + and code_or_codeable_concept(x, resource) + ) ]: flat_df[coding_column] = flat_df[coding_column].apply( lambda x: [x] if isinstance(x, str) else x @@ -113,7 +174,7 @@ def format_flat(flat_df): def condense_codes(row, code_col): raw_codes = row[(code_col + ".code")] - if isinstance(raw_codes, (str, float)) and raw_codes == raw_codes: + if isinstance(raw_codes, (str, int, float)) and raw_codes == raw_codes: formatted_code = ( raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) ) diff --git a/tests/test_condition_resource.py b/tests/test_condition_resource.py index cb68b88..e8b94c1 100644 --- a/tests/test_condition_resource.py +++ b/tests/test_condition_resource.py @@ -106,28 +106,26 @@ } CONDITION_FLAT = { - "resourceType": ["Condition"], + "resourceType": "Condition", "extension.presenceAbsence.code": ["http://snomed.info/sct|410605003"], "extension.presenceAbsence.text": ["Present"], - "extension.prespecifiedQuery": [True], + "extension.prespecifiedQuery": True, "category.code": [ - [ - "http://snomed.info/sct|55607006", - "http://terminology.hl7.org/CodeSystem/condition-category|problem-list-item", # noqa: E501 - ] + "http://snomed.info/sct|55607006", + "http://terminology.hl7.org/CodeSystem/condition-category|problem-list-item", # noqa: E501 ], - "category.text": [["Problem", None]], + "category.text": ["Problem", None], "bodySite.code": ["http://snomed.info/sct|38266002"], - "bodySite.text": ["whole body"], - "onsetDateTime": [datetime.date(2013, 4, 2)], - "abatementString": ["around April 9, 2013"], - "recordedDate": [datetime.date(2013, 4, 4)], + "bodySite.text": "whole body", + "onsetDateTime": datetime.date(2013, 4, 2), + "abatementString": "around April 9, 2013", + "recordedDate": datetime.date(2013, 4, 4), "severity.code": ["http://snomed.info/sct|255604002"], "severity.text": ["Mild"], "code.code": ["http://snomed.info/sct|386661006"], - "code.text": ["Fever"], - "subject": ["Patient/f201"], - "encounter": ["Encounter/f201"], + "code.text": "Fever", + "subject": "Patient/f201", + "encounter": "Encounter/f201", } CONDITION_DICT_OUT = { @@ -209,11 +207,12 @@ def test_condition_to_flat(): fever.to_flat("test_condition.parquet") - assert_frame_equal( - pd.read_parquet("test_condition.parquet"), - pd.DataFrame(CONDITION_FLAT), - check_like=True, - ) + fever_flat = pd.read_parquet("test_condition.parquet") + expected = pd.DataFrame([CONDITION_FLAT], index=[0]) + expected = expected.reindex(sorted(expected.columns), axis=1) + # v, e = Condition.validate_fhirflat(expected) + + assert_frame_equal(fever_flat, expected) os.remove("test_condition.parquet") diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 1a05993..283a48c 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -17,9 +17,9 @@ import sys import shutil from pathlib import Path -from decimal import Decimal import numpy as np import pytest +from pydantic.v1 import ValidationError if sys.version_info < (3, 11): # tomllib was introduced in 3.11 import tomli # pragma: no cover @@ -448,10 +448,9 @@ def test_create_dict_one_to_one_dense_freetext(file, expected): ENCOUNTER_SINGLE_ROW_FLAT = { - "resourceType": "Encounter", - "id": "11", - "class.code": "https://snomed.info/sct|32485007", - "class.text": "Hospital admission (procedure)", + "id": 11, + "class.code": ["https://snomed.info/sct|32485007"], + "class.text": ["Hospital admission (procedure)"], "diagnosis_dense": [ { "condition": [ @@ -509,8 +508,8 @@ def test_create_dict_one_to_one_dense_freetext(file, expected): "subject": "Patient/2", "actualPeriod.start": "2021-04-01T18:00:00-03:00", "actualPeriod.end": "2021-04-10", - "admission.dischargeDisposition.code": "https://snomed.info/sct|371827001", - "admission.dischargeDisposition.text": "Patient discharged alive (finding)", + "admission.dischargeDisposition.code": ["https://snomed.info/sct|371827001"], + "admission.dischargeDisposition.text": ["Patient discharged alive (finding)"], "extension.timingPhase.code": ["https://snomed.info/sct|278307001"], "extension.timingPhase.text": ["On admission (qualifier value)"], } @@ -527,14 +526,19 @@ def test_load_data_one_to_one_single_row(): ) assert df is not None - Encounter.ingest_to_flat(df, "encounter_ingestion_single") + + flat_encounter = Encounter.ingest_to_flat(df) + expected_encounter = pd.DataFrame([ENCOUNTER_SINGLE_ROW_FLAT], index=[0]) + + _, error = Encounter.validate_fhirflat(flat_encounter) + + assert error is None assert_frame_equal( - pd.read_parquet("encounter_ingestion_single.parquet"), - pd.DataFrame([ENCOUNTER_SINGLE_ROW_FLAT], index=[0]), - check_dtype=False, + flat_encounter, + expected_encounter, + check_like=True, ) - os.remove("encounter_ingestion_single.parquet") def test_load_data_one_to_one_dense_single_row(): @@ -548,13 +552,15 @@ def test_load_data_one_to_one_dense_single_row(): ) assert df is not None - Encounter.ingest_to_flat(df, "encounter_ingestion_dense") - df_parquet = pd.read_parquet("encounter_ingestion_dense.parquet") + flat_diagnosis = Encounter.ingest_to_flat(df) + + _, e = Encounter.validate_fhirflat(flat_diagnosis) + assert e is None expected_diagnosis = [ { - "condition": [{"concept": {"coding": None, "text": "sepsis"}}], + "condition": [{"concept": {"text": "sepsis"}}], "use": [ { "coding": [ @@ -578,7 +584,6 @@ def test_load_data_one_to_one_dense_single_row(): "system": "https://snomed.info/sct", } ], - "text": None, } } ], @@ -596,23 +601,21 @@ def test_load_data_one_to_one_dense_single_row(): }, ] - assert all(df_parquet["diagnosis_dense"][0] == expected_diagnosis) - os.remove("encounter_ingestion_dense.parquet") + assert flat_diagnosis["diagnosis_dense"].iloc[0] == expected_diagnosis ENCOUNTER_SINGLE_ROW_MULTI = { - "resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"], "class.code": [ - "https://snomed.info/sct|371883000", - "https://snomed.info/sct|32485007", - "https://snomed.info/sct|32485007", - "https://snomed.info/sct|32485007", + ["https://snomed.info/sct|371883000"], + ["https://snomed.info/sct|32485007"], + ["https://snomed.info/sct|32485007"], + ["https://snomed.info/sct|32485007"], ], "class.text": [ - "Outpatient procedure (procedure)", - "Hospital admission (procedure)", - "Hospital admission (procedure)", - "Hospital admission (procedure)", + ["Outpatient procedure (procedure)"], + ["Hospital admission (procedure)"], + ["Hospital admission (procedure)"], + ["Hospital admission (procedure)"], ], "diagnosis_dense": [ None, @@ -745,7 +748,7 @@ def test_load_data_one_to_one_dense_single_row(): ["Final diagnosis (discharge) (contextual qualifier) (qualifier value)"], ], "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/4"], - "id": ["10", "11", "12", "13"], + "id": [10, 11, 12, 13], "actualPeriod.start": [ "2020-05-01", "2021-04-01T18:00:00-03:00", @@ -759,16 +762,16 @@ def test_load_data_one_to_one_dense_single_row(): "2022-06-20", ], "admission.dischargeDisposition.code": [ - "https://snomed.info/sct|371827001", - "https://snomed.info/sct|371827001", - "https://snomed.info/sct|419099009", - "https://snomed.info/sct|32485007", + ["https://snomed.info/sct|371827001"], + ["https://snomed.info/sct|371827001"], + ["https://snomed.info/sct|419099009"], + ["https://snomed.info/sct|32485007"], ], "admission.dischargeDisposition.text": [ - "Patient discharged alive (finding)", - "Patient discharged alive (finding)", - "Dead (finding)", - "Hospital admission (procedure)", + ["Patient discharged alive (finding)"], + ["Patient discharged alive (finding)"], + ["Dead (finding)"], + ["Hospital admission (procedure)"], ], "extension.timingPhase.code": [ ["https://snomed.info/sct|281379000"], @@ -796,38 +799,34 @@ def test_load_data_one_to_one_multi_row(): ) assert df is not None - Encounter.ingest_to_flat(df, "encounter_ingestion_multi") + + flat_encounter = Encounter.ingest_to_flat(df) + + _, e = Encounter.validate_fhirflat(flat_encounter) + assert e is None assert_frame_equal( - pd.read_parquet("encounter_ingestion_multi.parquet"), + flat_encounter, pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI), check_dtype=False, check_like=True, ) - os.remove("encounter_ingestion_multi.parquet") OBS_FLAT = { - "resourceType": [ - "Observation", - "Observation", - "Observation", - "Observation", - "Observation", - ], "category.code": [ - "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", - "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", - "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", - "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", - "http://terminology.hl7.org/CodeSystem/observation-category|vital-signs", + ["http://terminology.hl7.org/CodeSystem/observation-category|vital-signs"], + ["http://terminology.hl7.org/CodeSystem/observation-category|vital-signs"], + ["http://terminology.hl7.org/CodeSystem/observation-category|vital-signs"], + ["http://terminology.hl7.org/CodeSystem/observation-category|vital-signs"], + ["http://terminology.hl7.org/CodeSystem/observation-category|vital-signs"], ], "category.text": [ - "Vital Signs", - "Vital Signs", - "Vital Signs", - "Vital Signs", - "Vital Signs", + ["Vital Signs"], + ["Vital Signs"], + ["Vital Signs"], + ["Vital Signs"], + ["Vital Signs"], ], "effectiveDateTime": [ "2020-01-01", @@ -837,18 +836,18 @@ def test_load_data_one_to_one_multi_row(): "2021-02-02", ], "code.code": [ - "https://loinc.org|8310-5", - "https://loinc.org|8310-5", - "https://loinc.org|8310-5", - "https://loinc.org|8867-4", - "https://loinc.org|8867-4", + ["https://loinc.org|8310-5"], + ["https://loinc.org|8310-5"], + ["https://loinc.org|8310-5"], + ["https://loinc.org|8867-4"], + ["https://loinc.org|8867-4"], ], "code.text": [ - "Body temperature", - "Body temperature", - "Body temperature", - "Heart rate", - "Heart rate", + ["Body temperature"], + ["Body temperature"], + ["Body temperature"], + ["Heart rate"], + ["Heart rate"], ], "subject": ["Patient/1", "Patient/2", "Patient/3", "Patient/1", "Patient/2"], "encounter": [ @@ -858,7 +857,7 @@ def test_load_data_one_to_one_multi_row(): "Encounter/10", "Encounter/11", ], - "valueQuantity.value": [Decimal("36.2"), 37.0, 35.5, 120.0, 100.0], + "valueQuantity.value": [36.2, 37.0, 35.5, 120.0, 100.0], "valueQuantity.unit": [ "DegreesCelsius", "DegreesCelsius", @@ -891,13 +890,15 @@ def test_load_data_one_to_many_multi_row(): assert df is not None clean_df = df.dropna().copy() - Observation.ingest_to_flat(clean_df, "observation_ingestion") - full_df = pd.read_parquet("observation_ingestion.parquet") + flat_obs = Observation.ingest_to_flat(clean_df) + + _, e = Observation.validate_fhirflat(flat_obs) + assert e is None - assert len(full_df) == 33 + assert len(flat_obs) == 33 - df_head = full_df.head(5) + df_head = flat_obs.head(5) assert_frame_equal( df_head, @@ -905,7 +906,6 @@ def test_load_data_one_to_many_multi_row(): check_dtype=False, check_like=True, ) - os.remove("observation_ingestion.parquet") def test_convert_data_to_flat_missing_mapping_error(): @@ -998,7 +998,7 @@ def test_convert_data_to_flat_local_mapping(): shutil.rmtree(output_folder) -def test_ingest_to_flat_validation_errors(): +def test_validate_fhirflat_single_resource_errors(): df = pd.DataFrame( { "subjid": [2], @@ -1041,12 +1041,17 @@ def test_ingest_to_flat_validation_errors(): index=[0], ) - error_df = Encounter.ingest_to_flat(df, "encounter_date_error") - assert len(error_df) == 1 - assert ( - repr(error_df["validation_error"][0].errors()) - == "[{'loc': ('actualPeriod', 'start'), 'msg': 'invalid datetime format', 'type': 'value_error.datetime'}]" # noqa: E501 - ) + flat_df = Encounter.ingest_to_flat(df) + with pytest.raises(ValidationError, match="invalid datetime format"): + _, _ = Encounter.validate_fhirflat(flat_df) + # assert len(error_df) == 1 + # assert ( + # repr(error_df["validation_error"][0].errors()) + # == "[{'loc': ('actualPeriod', 'start'), 'msg': 'invalid datetime format', 'type': 'value_error.datetime'}]" # noqa: E501 + # ) + + +# TODO: add test for validate_fhirflat with multiple resources def test_convert_data_to_flat_local_mapping_errors(): @@ -1069,9 +1074,7 @@ def test_convert_data_to_flat_local_mapping_errors(): encounter_df = pd.read_parquet("tests/ingestion_output_errors/encounter.parquet") obs_df = pd.read_parquet("tests/ingestion_output_errors/observation.parquet") - expected_encounter_minus_errors = ( - pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI).iloc[:-1].dropna(axis=1, how="all") - ) + expected_encounter_minus_errors = pd.DataFrame(ENCOUNTER_SINGLE_ROW_MULTI).iloc[:-1] assert_frame_equal( encounter_df, From 415b3740207a19bdcd8c4136213e62b08a46111c Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 30 Jul 2024 16:32:14 +0100 Subject: [PATCH 05/19] Fix dense column identification, add test --- fhirflat/resources/base.py | 14 +++--- tests/test_ingest.py | 92 +++++++++++++++++++++++++++++++++++--- 2 files changed, 94 insertions(+), 12 deletions(-) diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 7aabfcc..59690e4 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -273,14 +273,16 @@ def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: ) flat_df.drop(columns=system_columns, inplace=True) - list_cols = [ - col - for col in flat_df.columns - if flat_df[col].apply(lambda x: isinstance(x, list)).any() + potential_dense_cols = [ + x for x in cls.backbone_elements.keys() if x in flat_df.columns + ] + list_lengths = [ + len(flat_df[x].dropna().iloc[0]) for x in potential_dense_cols ] - list_lengths = [len(flat_df[x].dropna().iloc[0]) for x in list_cols] long_list_cols = [ - x for x, y in zip(list_cols, list_lengths, strict=True) if y > 1 + x + for x, y in zip(potential_dense_cols, list_lengths, strict=True) + if y > 1 ] if long_list_cols: diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 283a48c..9270f44 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1044,14 +1044,94 @@ def test_validate_fhirflat_single_resource_errors(): flat_df = Encounter.ingest_to_flat(df) with pytest.raises(ValidationError, match="invalid datetime format"): _, _ = Encounter.validate_fhirflat(flat_df) - # assert len(error_df) == 1 - # assert ( - # repr(error_df["validation_error"][0].errors()) - # == "[{'loc': ('actualPeriod', 'start'), 'msg': 'invalid datetime format', 'type': 'value_error.datetime'}]" # noqa: E501 - # ) -# TODO: add test for validate_fhirflat with multiple resources +def test_validate_fhirflat_multi_resource_errors(): + df = pd.DataFrame( + { + "subjid": [1, 2], + "flat_dict": [ + { + "subject": "Patient/1", + "id": 11, + "actualPeriod.start": "2021-04-01", + "actualPeriod.end": "2021-04-10", + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001.0, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007.0, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002.0, 722863008.0], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001.0, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 + }, + { + "subject": "Patient/2", + "id": 12, + "actualPeriod.start": ["2021-04-01", None], + "actualPeriod.end": [None, "2021-04-10"], + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001.0, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007.0, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002.0, 722863008.0], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001.0, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 + }, + ], + }, + ) + flat_df = Encounter.ingest_to_flat(df) + + assert "diagnosis_dense" in flat_df.columns + + valid, errors = Encounter.validate_fhirflat(flat_df) + + assert len(valid) == 1 + assert len(errors) == 1 + assert ( + repr(errors["validation_error"][1].errors()) + == "[{'loc': ('actualPeriod', 'end'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}, {'loc': ('actualPeriod', 'start'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}]" # noqa: E501 + ) def test_convert_data_to_flat_local_mapping_errors(): From 5448d32a647b51c6e270e598a809a4a03aed9840 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 30 Jul 2024 17:08:06 +0100 Subject: [PATCH 06/19] Add flag to turn off validation in CLI --- fhirflat/ingest.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 41d41ec..299484e 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -18,6 +18,7 @@ import dateutil.parser import numpy as np import pandas as pd +from pyarrow.lib import ArrowTypeError import fhirflat from fhirflat.util import get_local_resource, group_keys @@ -459,11 +460,20 @@ def convert_data_to_flat( be named by resource, and contain the mapping for that resource. subject_id: str The name of the column containing the subject ID in the data file. + validate: bool + Whether to validate the FHIRflat files after creation. """ if not mapping_files_types and not sheet_id: raise TypeError("Either mapping_files_types or sheet_id must be provided") + if not validate: + warnings.warn( + "Validation of the FHIRflat files has been disabled. ", + UserWarning, + stacklevel=2, + ) + if not os.path.exists(folder_name): os.makedirs(folder_name) @@ -524,9 +534,19 @@ def convert_data_to_flat( ) else: errors = None - flat_nonvalidated.to_parquet( - f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" - ) + try: + flat_nonvalidated.to_parquet( + f"{os.path.join(folder_name, resource.__name__.lower())}.parquet" + ) + except ArrowTypeError as e: + warnings.warn( + f"Error writing {resource.__name__.lower()}.parquet: {e}\n" + "This is likely due to a validation error, re-run without " + "--no-validate.", + UserWarning, + stacklevel=2, + ) + continue end_time = timeit.default_timer() total_time = end_time - start_time @@ -577,6 +597,13 @@ def main(): default="subjid", ) + parser.add_argument( + "--no-validate", + help="Do the data conversion without validation", + dest="validate", + action="store_false", + ) + args = parser.parse_args() convert_data_to_flat( @@ -586,6 +613,7 @@ def main(): folder_name=args.output, sheet_id=args.sheet_id, subject_id=args.subject_id, + validate=args.validate, ) From ca283d3ad99d35eb776617c58f66d6cd7eb2f7f7 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 1 Aug 2024 14:42:19 +0100 Subject: [PATCH 07/19] Add validate CLI --- fhirflat/__main__.py | 6 +- fhirflat/ingest.py | 65 ++++++++++++++++++ fhirflat/resources/base.py | 7 +- fhirflat/util.py | 9 ++- .../invalid_flat_bundle/condition.parquet | Bin 0 -> 14360 bytes .../invalid_flat_bundle/encounter.parquet | Bin 0 -> 18309 bytes .../data/valid_flat_bundle/condition.parquet | Bin 0 -> 14975 bytes .../data/valid_flat_bundle/encounter.parquet | Bin 0 -> 26792 bytes tests/data/valid_flat_bundle/patient.parquet | Bin 0 -> 3911 bytes tests/test_ingest.py | 29 ++++++++ 10 files changed, 111 insertions(+), 5 deletions(-) create mode 100644 tests/data/invalid_flat_bundle/condition.parquet create mode 100644 tests/data/invalid_flat_bundle/encounter.parquet create mode 100644 tests/data/valid_flat_bundle/condition.parquet create mode 100644 tests/data/valid_flat_bundle/encounter.parquet create mode 100644 tests/data/valid_flat_bundle/patient.parquet diff --git a/fhirflat/__main__.py b/fhirflat/__main__.py index 7832ca3..7dcec77 100644 --- a/fhirflat/__main__.py +++ b/fhirflat/__main__.py @@ -1,6 +1,7 @@ import sys from .ingest import main as ingest_to_flat +from .ingest import validate_cli as validate def main(): @@ -10,16 +11,19 @@ def main(): Available subcommands: transform - Convert raw data into FHIRflat files + validate - Validate FHIRflat files against FHIR schemas """ ) sys.exit(1) subcommand = sys.argv[1] - if subcommand not in ["transform"]: + if subcommand not in ["transform", "validate"]: print("fhirflat: unrecognised subcommand", subcommand) sys.exit(1) sys.argv = sys.argv[1:] if subcommand == "transform": ingest_to_flat() + elif subcommand == "validate": + validate() else: pass diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 5e93bcc..875066a 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -580,6 +580,49 @@ def convert_data_to_flat( shutil.rmtree(folder_name) +def validate(folder_name: str, compress_format: str | None = None): + """ + Takes a folder containing (optionally compressed) FHIRflat files and validates them + against the FHIR. File names **must** correspond to the FHIR resource types they + represent. E.g. a file containing Patient resources must be named "patient.parquet". + """ + + if compress_format: + shutil.unpack_archive(folder_name, compress_format, folder_name) + directory = Path(folder_name).parents + else: + directory = folder_name + + for file in Path(directory).glob("*.parquet"): + df = pd.read_parquet(file) + resource = file.stem + resource_type = get_local_resource(resource, case_insensitive=True) + + valid_flat, errors = resource_type.validate_fhirflat(df, return_files=True) + + if errors is not None: + + valid_flat.to_parquet(os.path.join(directory, f"{resource}_valid.parquet")) + errors.to_csv( + os.path.join(directory, f"{resource}_errors.csv"), index=False + ) + error_length = len(errors) + print( + f"{error_length} rows in {file.name} have validation errors. " + f"Errors saved to {resource}_errors.csv. " + f"Valid rows saved to {resource}_valid.parquet" + ) + else: + print(f"{file.name} is valid") + print("Validation complete") + + if compress_format: + new_directory = directory + "_validated" + shutil.make_archive(new_directory, compress_format, new_directory) + shutil.rmtree(directory) + print(f"Validated files saved as {new_directory}.{compress_format}") + + def main(): parser = argparse.ArgumentParser( description="Convert data to FHIRflat parquet files", @@ -637,5 +680,27 @@ def main(): ) +def validate_cli(): + parser = argparse.ArgumentParser( + description="Validate FHIRflat parquet files against the FHIR schema", + prog="fhirflat validate", + ) + parser.add_argument("folder", help="File path to folder containing FHIRflat files") + + parser.add_argument( + "-c", + "--compress_format", + help="Format the folder is compressed in", + choices=["zip", "tar", "gztar", "bztar", "xztar"], + ) + + args = parser.parse_args() + + validate( + args.folder, + compress_format=args.compress_format, + ) + + if __name__ == "__main__": main() diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 59690e4..8d1f47d 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -88,7 +88,7 @@ def create_fhir_resource( @classmethod def validate_fhirflat( - cls, df: pd.DataFrame + cls, df: pd.DataFrame, return_files: bool = False ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: """ Takes a FHIRflat dataframe and validates the data against the FHIR @@ -100,6 +100,9 @@ def validate_fhirflat( ---------- df: pd.DataFrame Pandas dataframe containing the FHIRflat data + return_files: bool + If True, returns the valid FHIR resources & errors as a parquet file, + even if only one row is present in the dataframe. Returns ------- @@ -115,7 +118,7 @@ def validate_fhirflat( lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 ).apply(lambda x: cls.create_fhir_resource(x)) - if len(flat_df) == 1: + if len(flat_df) == 1 and return_files is False: resource = flat_df["fhir"].iloc[0] if isinstance(resource, ValidationError): raise resource diff --git a/fhirflat/util.py b/fhirflat/util.py index f778bde..f62c23e 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -70,8 +70,13 @@ def get_local_extension_type(t: str): raise AttributeError(f"Could not find {t} in fhirflat extensions") from ae -def get_local_resource(t: str): - return getattr(fhirflat, t) +def get_local_resource(t: str, case_insensitive: bool = False): + if case_insensitive is False: + return getattr(fhirflat, t) + else: + for a in dir(fhirflat): + if a.lower() == t.lower(): + return getattr(fhirflat, a) def find_data_class(data_class, k): diff --git a/tests/data/invalid_flat_bundle/condition.parquet b/tests/data/invalid_flat_bundle/condition.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f92c847aca2ed9fa029452739b5897f118b8f962 GIT binary patch literal 14360 zcmdU0U2G%O6&`y{b{DeCvh29d5|-5L4oj1b$99~J5j>2Y*s)2ni8t{dMUiJbPMnO# z#vaFU0I8~~3RS2=RaK!rRI3VARaGG#D6J~A>H|-R$7-K=KovrLM186dLOth>XFMK{ ze**EcHLNpp@44rG=l7m-v%AhEytIR6>A4l!PE!{t%Jt(*6h-B#<+_}o;wsgmG!>kg z^08h&P0<%==M2zYpvUO3$$9GfB`Z3Y{Q1;XWPM;C|&DDr&@_3)E1 zT@MTA_;S8^yQqk>LOCxwq+(4uElNZ&L?!Rqogdhl_Z-Z}r%)+__8SbyQdN`ml%}fl za}NXP#%@^>rx2Tk2Rz)Kbs3&vnbF#xoYkdtwWUJrdy{47;}re$mmx*0mWpy&D(^IBccq)NBxd zj{f90WjkeO?mVh%df}V~-O`^N&UYNn-#eJE#+h#(Mt{1m*ez}8q%Ui(_sI3rAk(_4 za!~XXeHG)nm3i#fPq(IA+5=Eucpmj|%6tDv>3QVmHs>#G%o~@PdlM)}Cl^aZSLy2-qKk{?VW%ZofRhix*HeHq&jQ-{P70Ul z-30dTlLoM&#+xL)4{gqmY|PzB=Bpl7e#xAVRKLtXTRD-dv^T!VBvTd9gQP=v_w_zgPNsq>f++jFt+t$T$d3 zvCtDF8Zw0$hdfwHedpRxQd?ypyYqK8=O1j$dsmsSo<^y)joLd}$UxEO>5DU1ypf{J zWwXKmPU<@O2@p@aNU>qFG54OCq^XH(W7LG}iB5Y|M4?zHiusMYSZxyLTz76VKYiwT z6SoOq0RJIF$i-*qaV>Tn<9~Sp#!LmG%YmcnHy|>+0Rv-5|KdlJw7pdfo-4~WQCWmT zce7X$37EHR&bMvMZ=YrUbA7T8PYdOkK>pPdB#ycq_>CajZzhocoN0p`uZlvsniunk z(}3~wY39Rez}RMDVrhx-GJQo;^%Y#Y{!53@$+sq1I;Nmu*P1oclI-3eU)wN|cI z1#z=kQK70{c6XNf*(`cln~Ra5CD38hjSowdIqNdISGv*H^bBR40;KIuH^1`hGB2E~ z;ki;&ntij&T`zNw?U!Bj4Tpeasjmz4CC#vBCLI8KwJ7Dmrsfpvz;PG(wv_GTc`Mnh zbI6-E=dW$d7Xjvj0Lo@$Js2bGPCjP8bDhy<+E3uw7NbCCjO@F?N!rnh?8mmZi*jB( z+?K#P(l)!jO(N%SHs?QV%)1NBH#aBS+}lq*_OTLX;%j$+a*fr5U9ps--4s23&EC~r zUZYkf0cF5$=tQtKf)qNo@;uHsT%fY8S89f3@kfJ?qOLX7JZ>w%?yVH z7PjlUVRs_}v@0j0SZDPTr7hBcvwPTG0csyk#}S&Qz$pDrFgT%rqVpiRHohW(ZOKeW z_SPoQ(jB?7t%1&Tatwjn3y&qA1E6#V3+1#=Oiq0L*nPb^dC=}Bm{Hv%{2w>Yqi%AU z2da%^POkrBM%kS)4}Sl{-0x=PWLpqsRBZz@qEeq3NnOr7aFs7JvU-)dpY^2kqe-Wo zIpe(hB>YBXV#YOA$;tU#&2{hbSKVqME0m>rNv^q{n|jIJRouE?o|$s16l&co62e#A za;_x861gr(KmmQXl;7?gQ#aaHpko6-Gf$9coxn8O?^@%dSCkhIVmqaC(&nY<| zf7OjeTpoUpf@`-nBR3Y@ufN^~(}}`u`rQjM>wsiM@SX6X+^^37%$CgVj^BE8@830T z+8wXsUuvo7P`e%Cz@)w7lMLbxYhab;7&$y^XbiGLYVc??gPZ^NteK zq2C#6w&*m|VEWOc?6^rCq{uE}6F$CNmXJqh($_`}DZeYBbljnK!|0keZ44*Lu%}Je z*2bIB?w!rEC%SR_p3`dM&RNsO-!Y6k?i;%|@0vF4ej?$bZ(iWKXeyH z-5IoQ_59R@km9!nyWUk;Hflo)oQtc%-{|ff8~Uu-oPw7 z%etZN&<+p7q;J-TbCheGOC)Zko>T7+U2i|@!H;ieo#VI-_=BGpIc|;%?Qju}+u_1H zWGocwm~Y_VQhUAuYw#UDgbSIEaa@Cgb?_6JEbfP44&FI#Efa^Qo3WBq&qkATY2Vrb zznoNfd9`?}7~*46u#igy_Ot2T0v}EKvZ=$uY9v^QNo*|^j_>ilLoc5SXf%@0jLpX) z3h)c;@~JK4QO~CiYb#-{m@P>XA59eaQbft#2D)g#%SRi?r;&j@GU>HKepzaexp3Ue zmn7hykYdHg4)F8x=};i;PXvT0^5u%WFaAn4wZ^LRUMU}q0FT6OY_Gf{HscasT06)_ zK`z7;jwh2#8Rb?u9xNm~upc3P%3mw$xWe(>Oew-ESSeV{m6Ch;Fqv1fAP?|hGrGPGgDYXQ zm!@B?m?;JK#pGHgAKj`Oc$adiWGx#B)-xc7$~%6LU6wX;>4daGbhAZdvckznrK5aA z(QS|(5#BubTuavec;$p7E^qY2I}*M6&3peIUsaKjG!1A}mYBoN|yU zRiunx^&>u&WHYG**ltPBQ-#|N%pHc9!5p$xAD~Y&&57qTkfp{iL*82M$WtW8f!u61 z9qW~!x@fH8UDw?S#yOlB5I=#VTq;;!kweXFdJVBi8>#pajPGS{_qYAA*57x(fV>q= z)&-w*z!w7zp_Ei*+KLnfUlQ`gAfGwbdMkgb<cVnd68s@4t>;UmkE)qn%61Ce%RR z>sf8Y+xKiS;{D#v>OM7mI9^EmwBF$mbz4gxg$n5=sb8)9v%e{6fv=$RpNV*^2>PJP2J<6k8?Dm*Fvi??G&^VjIwFg*LF4THsbWF3v5& zBMa>o^jGk{wh@M>eQdje*p{szX=o0YZjd&u0q@}&^4r8O(zJzkrlA#tmk_s02FX0> z*T}27f7%;_z69#xNU)YoXC*$e&`5h1kb?Aku<^3m?~z3z(hk13dDG)Tnx$1}Dq(9f z+U^%aq+eXasV*PI!T27m_hdADA)*N`gHr~4gdg@9JYm1c5&HEbj|bUrG`5f)8c+BT z0C`qi?su6G|xPIXA9BnSN@JGld&>X`@#g9*_{-761D&#eE1oT0{ zv##p1g_kTc-~~g0yl{r|{$w-bOKxF5M%FLk`sr0|eOO)8YyBSd0Q=cgVhM9FS-*BaRv#5Fr8lh!RqFUmA%eX)GYs1t%KzeK4@A3Heg2%E~bP?#McTN%y>4`zN z6~r^Wc|1I;M@YZw0oOy7wERbaXQRIpQ({dE{)4?j6X&OKzoR{HeG+dieu-N685ID_kby8Of0tAX@~99|Bp`JsUa4oQZZku`rc;6Vxw?-;pEDvtH80)ESG z`4dgJ=abRDKyHJ+UU-3WO;vK$SL&j2L#R}4kPFYL)`jMjdesSkQ4Ic;<8l9|7N3QG JbOZk;@PEJvi~axr literal 0 HcmV?d00001 diff --git a/tests/data/invalid_flat_bundle/encounter.parquet b/tests/data/invalid_flat_bundle/encounter.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c743369b5d0faec6707d1b7f0aed5790d57a031d GIT binary patch literal 18309 zcmeHPeP|=+nIFlH*SpzlHrpLV&N*Afw>$B6ZEHrdYEd@A;Zk zuQY2#Y5kxo(QePM=U-f&AG(+bGwjT6WCCDOw3~v|g0+1j~?`3?(v5yV0 z9QhO=*C3EzroA0_-CWS?=2##iubSZ%#0nO4I&jI67qfxEj^!w{-r7Ve4vbtIE*NcsB1!uW%ILNZBkJ`RQc>~ko z5X;U2(kt{>2h!&b<>2)>lIUy!$CF{ACPl6Tu7L~8`)5pD!{>^MP_IuFE21>0$o0k< zNs-Eu+8_paWx(@$H}lzJ%*TUhfMWq9gn8p5wjJpQo)Q|Lc5@B^%Oy zqt}vuc6&Z`Gj|_nK01q*d~h_jW=ztN1mq>g;PWJLS2YXzxv3|p=^&=E@npIuHQeK8 z%|WmEk%Ku}q6|FY9=J*|zr1Lw>^)O&7Ph6L7A(JUdER$1zk8DT-wUHNPRwj!|Jp^r z1>VsYHVD^|XMLebj$P+wZw4dJ zQtN?9c4m?btp_+ORga?7_z1AKC-$CeOExvvOT^DFw>7^c6dO%JS(R#XMVzWPgqmiA z&)l95-OM{*VgB?K`uT{0Y^|cu-!yD8^UYxZ!}&vgM5o(_*o1rbLKJKe(w`Bf0<4Y7 zsFyzK9X2?1Yxu>}Z4EDWXeg;#N&Lj+`5zZ^_i5&J59(*n)mEL~yXbF&leOqn?;chm z1G6m4sn@8h)7%_8H^;JbK6TkzEq(OWqg6|j!I7%v#TTsA(mimOV%`}wtrGrgv))XJ za=lm*YFkoNu2(B{xdBDUp^knq;Q6zg`S2O$rz3sq=)FL-HS*6~^p8v<=ieOz2kSZQ zGvSIpL6LW%mTtD43E)^5{}9WKLrfk-fAf=C-nfAQSWEp1r7(s1vB=x_t&0pTk#p_Mv{vCajGGg!P~1Pp)NUuDHsph{ZFqkKfc;GOz#PytvO$F zVR6$|OABv*6~nY9O`5so8-EFktGp>oHQ%nFG$qpWZ>c_Ea}1Y&-7BT99x~f|1?RR` z3f4^#Yys>y696xJPdWQ`LF#mF50ImZUU60|fpZwx+%02KZ#k|Lw z7J2tcQ5LpTD0}rgA_Q3H;_+;;0;L5DV#iA83_LDM)y5F4HX#%{#s+Z(3X9J3ogqjKv ze06zCtv@?XJwG!}4Og0t!VQeCZZkcvGs3Ar)H&P2-uqOKL}zRME_8C%JOdh672$vw z^xXl^f4iC2rkT$IXwU~y#@6mL^i5;C_*bQ;L1BkY2^f~4zt$0dPdku$od$sWR~Ea2 z6StV3-aI*L4%H9p4XI3w`5m|Cdv4~vIp&XHG^P_yX9a9c=%q&u6P};IY!1(H&_dXd zjm?IQF|YZl$0~2WbB;o*y!m&gRo?EKu>;Mn6z-p+KXQ3~=3;*Ex6DU>gH|wsSRmQz zdDlh%BdkxfvxUWv<555!lU3NqndO9}leVzB?i+`SfkhU&fzK@3@YwvUw+9GT1nct+*NvBCcJ7MV_uej;dvV{=w!9qu`3 zYNI&HY0FXJfhi7ScQ4ukyFPpCdjRFYzM zaHlU251TopMHt!MS-oy7R;uVJ6)%5%>N%YX>SfOU#e4{M{pv;oX#T;<}Z7b@3|ZB3^T)?Z?l&vYGlGY zR25WFsCz$r>g6%`qF&m+Q>-Y>vRWUz>HGROCVXSs?Hw{Ebv-4wFOR80SpxDg>nnF- zFcrR7Rm3|^qnhA54R}8fCpT3YznjB(YO`EDFb68+-5-2jmKuU6Gz1`5n~L)H6TY@= zS?ralJv&}G?f2Mv<#mpA$;G$czOF0zjr-M7lYl!hC+Wu6Nl4cZU3x`Z2D;=vpjSIy z4kE8V^$NGAIYAuolF}h4bc$ofV7yr1{7R4wtfNipLV7H*YQ6|zkJkv)&LVNdNhg8V z`4yI4v-YQbcNCR*=z|vW zHd5moo_^S@G$)V72$asu>_lwP92Xk0(x^#7qf&$J{_>b27ivQ702c<;1EE%{?4f;j zr5e6thEKWB6g$O^!5Uyp$6y%@Oa*X^@}B2Ysau(w+7}w$Hz)n%hhO7)@)v&bFY^2} zAKBt#Jio=yZ{Z_866qM(){vvA+XOC zG9m3*ur3$Mu}1#((hQ6p|LF}u8-Pp3k~_=cRA=jD7^Q3~38E#KX&#_WAq9CTF@|-M`)7#=a8E@pXsS@yDbBHG%;tKKo zxx_r`8!8nt8=xaE3z;=1T-ChBXD^ot?c}n_O>t4#({yQH2O$gJd^Mlh-^m8rcyodMQZcyF zwAjH8UtP#1*-S9q6f((O@U3_PIN5x*r0eI5Q-rwE{qeAIw#1Cp2fY}J!L&7A?C_n0 zpG&!5ipyp8*YerrY9X-JECy4hqUsojed!-op5!2w#Z0We5l%uC#dN6q9pnS)1^B#I02_#LGS<#}HBy3}P%6T|P-xfSeXv)mVAkq1 zOX@4syhd?RvDVPtT)C>`f?8Y@GHEuKNtML-0`?xL?fdmj=s(iIvL&vdmc-Ld9V7UU zgPtU=0nS$bcK^DGwJHyNiuhawpJkxuaS@1#q>_LhXT*SAd=|jhVyt1-g6uK&pT4+M z=(mfGbpggwNC;XB2xDrcDPqp{&y5a$cGLHXtz*e?ecgPJ&&1e5a0%-ftU33qIdvKO z{0>=nLM?>P%e&$MS*r$MEkJ4$S;uZKaEWccxiY^D>s4NE%%^Q=NI*Q49or~F8!DIMHpF%z~@;$$Meg49PW3JHi7r6Yp}PN=OZvb%5UI~1NIU; zi+h7L{J4TlA{Dr3`-PR(wBH|wyM+~)2vbognWODDaAU#qxA6k`Q_2TI*k@Y_HI4O| z#6Cz`xUuT@M^;t8AK9;Mkevzu!y9tN3)!DQ4(2l~c{aPc?e`zFLJuIHtgC_9&W%mK zKYx$%*O7j@OMbLV|0sb<9(L%Ddnj0g==~FLgtqh92&>01nWP~9xphOo7&=08s%~FG zh4zU6pLB!VCJ>K5jQZd}6deir;f2^wn_9$s5)W}p4d7rO1^qC;7Tv@7E2}l#A1nk0 z*(}19@FQMLAuo+Tfd;irDQNE6y0Po`=gj$g5J)@*8vGCf0)8H16 zbblsEJYk?(@Y9w@_+gCm5%fE(6Tl2FMnRsR;KhMxipaCri{Pub zIT3zc`bIZUery#k{o)s+U@;kjwGgqBAMv(Yv%0{WOl#`(MEe3tmr-oD7s z5|1x#173f{z#B~wc_jWU@ksa)SI03gNN?pil()Yg?#OTZcp%VGUtI410NxMvWNo!& z#4nMD2O+w?Br6Mes|BGZAMY*oC5iKcrhEtgka%iIu(u|4Db@nKH;O19xH8!BD|!AF;aB(T^`l>W zS@L}Y>C>x0-QHS!Vdhxl3l6WqBG@`2eb=c*+!B7F^|}7}(G_otEug;h4 zi{@~W~jm@Sn`k9Yo64bBk$jI zTd{~oH2L?7pLYMuB7LX@sY&8Ti|=`opIZDNJd&i5p7RFi6QoA+%TY}2wJv*D>L)VM z`hU-PftqAt{|(JNO6(bd9Ozp=(RPbTPTwQ)vxxrya=;q@^F&@d{>dT;&rP97>gQx+ zAB3z>9%s&X$nUQBT3ClkHx}SR{IACsR(5NDCH<15UOZWc{bWnuPl8Y~pEc$mEx+cA zNFVI7kJitKzfin(lp9yT_E8?`!3a>-ws6S+^@fD}TDwUs*!!E%MY_*v33~_yVA+;O td3{x(_EJ-7OctxvNpkS*(+{?N+L1TBRts<7-}fKtkAC=9(C{Dd{2ve3Nw@$2 literal 0 HcmV?d00001 diff --git a/tests/data/valid_flat_bundle/condition.parquet b/tests/data/valid_flat_bundle/condition.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f213484721ce977ee26bc82d686915dabc7ac32c GIT binary patch literal 14975 zcmdU0TWs6r6{Zx`Nm?h(vQU##H}#x^^%}>z*m9&SeW2t>mTD`uZ0pt)hIp|}o4QmN z%W_*_7=~dOilG>aVc5eE48<@E!_bGWD~2N2!ybmBk3*mOPz*(}hdmU-Fcd{m?3|w> zMN-tojg@9fLXrH>fBx@W|8ssjD{RtBJ7^C*w?x}%>O4iczITD5sC=bVFA5WExgyCE z!RZN~$2&_?^m*Dj4Rq(|5qe~No|?K~Mdz}73rtT>*x_oo+3XbascoDd9rIHo=f+%* z=ec}MEQrP0TCE}#w`sf6?)=E+{HKk{FsF{X&e${vMA3E%F6ZEjqTo3~$5jC@|M)3* zw^OT?Z@S&pVyPesGg5J@NCaFj{sJEJU}!+k?RT6 z!}mutJp|5hC84<{)x;UTB!~`Ks@6`5GEods$sg>_x9!ZE4(7cRsFZ%|^#^3Bs&RTk zRn^(KM*(zWrzDFLh|R(S9&L}hboVe#Z|(O_Yf`$}Qr5(S8WHRboAdWJ=EGCWU1Dt= z9|KQIxc{)xAKI4Cr|8R{hH7G^AQek;X}dYIBj1=QRkqz> zQ2Sc53cl#(OGQDdNu}a-9(;4VRB66ku9P^i#&xvIb+A%FbrMHAAmS}2IR@@s&8Nzt z`O^tI`jg{??S!4V`?#j*z!?>~r9U~GZ#bO4b}*lhGG9K5{`5exTiVh|UsPT1v8j_F z(~6>UQ1k?S1>?GvdF)dsTSG4O2B^>7LOqZ6=I7_V;C`yt=+fv!zd5eH6tHk-a{2SO+R~rK zoMPz+cIR7m=eu_16Po$*0y;w9#Rh<5sVmq%R9&5(cmyCv#C@@1kyRcc11{wY==8bd zY z;~>16g`ObMkRi-C_*S_BBHa@((1-L-o*Ad@tzz&@saO?j z3$W>~O9hdDdClhhg^l^;cbI=qjrZVbp&S#)KUsprw=M@>5oG(x1oH3GZIGiCkuOyQ zQ9ztJj2~ZP-n|AG+e}O>EiqoCFR6+=<2en>hSlQ3tkHRD>a;fKYTufm{?4sAp$ni^ zOZ5sbt~bjHRK?5g%`iWlK`(1_(KECJI%vA_VTm$lT}F3HH`<)8p{!AWw4LeZXS15j zfiqRySEO3AXLh;gWj^rq$}ZZ113fb4uHKZ$pYBaoPr(r-bJpKvb{HN zC7X2)dDZ58-Nt<4XWsUsYVB9Zbu7$K z+6(5PE8v--mEFv6N@M0hmafkKJn6JEr=0)21V1ktn|6(q^F<+Fb^YzaOOr~?%$MYP zp;(=~Ir02tM;$%+!t}(XLZOt{A|ZTfvY0Q3Fr`?RWuSml30c@QzEc+3tU*5ugp>86 zgthP_P#5cka#I^9k&Xl37sOg#$k*~f{?a7Yivs+V3FmICN3I7ZUwNes#)x9l^ph!M zW`JZxa9{XP?pLM(W=rNt!_R~|*Y6lMt&Tsec)q2g7Y5W(cMMn)_Q0SHn=l;BLLIAX zZH8)QqXjyKO{*V^2DC*~5xZbehlheeJ;3j@0BVMub>Ps@)>efx$&PEQL#DOw)ACwt z*D0}K=$KdSc~)%!r7yh+-?4b4@dOvtp`EC!wlJEhKmG7g8qS&fDYApugpVtgWaME? z`s!O<%I`=hh7;ya7#+i=g<&TdbhqhPT7Pcbxw3imSQl=e0$VL?95pR`pJ8k`jqO~# zW7xF%v4o4Zh;?*y3=ai^at3Olf%brD-TTu#x?e}d(pidi?io_CHa~|rjlre8Hb)n! zpDJR$R;rL1OP07wzS6`JtK7_2Dy0T8gEM*j5g@+x`e!^do=K>5)a^<)=$rB3808ve zlgT^jo66y+>tD~f@zdB|Wmy*g;paJ)onu4WY=mXE+3+@b7YZ518~F0JHXg${_zWMy zg^b5ow!y+Y_(mq7Eb@Y9pxg%f(2ZC@uIHkuxr{Hd&n>2ETya^tBZat_9NfyM{d>90 z&K4I<`Eu!lt>s8?D<*rYv2c8s^Bs7(v|pu>`DScB7O4S0{|=YlKpu4=eNbHrvr?`g z%Um?M#T6p8+#1kD{a!BGKt7EutdY$mwuD8wLB_&yFISL(e^QP~jcwrP)$i`^#Bha1Ufexj{GH zhL{R^#7sy=9I--B$`?|*LYR!#av%@z@MJZ89R!!cXfIX2tduPT_rz49EJQczI^KnR zI#ta@g7qxOq419H=N9Gld?qO`5#4MMnXGVfQTb4a)HEAphJ;rDpKHn58*ecwi;JsW z@eW0=KJ)=$pD*t9(Q9G{tpU$_< zG{w*kS4i&gUVY3Ql=;~tr};q2L;Qp*rfU46tj{U?*+N;)&MJPyrBj}4ItjMBt>vk$ zwFc%6UCdw(*{Ju?riIPKXiE4h`(!u25*L%O_a(}lZ?4WS#$|psBxijx=IFhR zOiG5F*wx<#oonTXu$S#~%HC(OH^%olUqgxQfnqlZ_DOk?uz%)K$uia>UH0&OkX06L zV=hQ-};kEqc6P<6h0rru22G zg?}cz`n~nR*u6(@5sq(Vd}>c~fHtpY4ntd+CaJ5f;-<_MSAB`Dxks)G3t7uvLx~%& zfN_zXg?P>+_jw6=4&1Awvcwl6O%dyN=ySw$q{$b8vr2x+K^`SJmh>~r>l^ju@LVHS zB>hPnBlxSb&mfOjb7?mQbnzgzZ!DXIf1wubGTcIo2TfiGdcP_*fQt?61$pfC_E{K@ zvI}tAfIsko=QtaNn{8|h=df*jZWTyFWa=G!PLO_a4yo7iqBNCr*PvB}hY-6%URLo+ z`bqN0Ch;rd4MN`pb$cXO&1G^j7YQ^n-T)4feiRm7RQgeT#Uf;Qb9lq;Mw;6Z(!@gZ ziw}5FK0JJd4u!PR`?1k=pue`K;#p9*>>;g!hfKvu>;f*ZsEdCkp5SFTi}QUe`_O;G z7SgN%fV5-JshYoO5p-#a^Lpq%R#O9;JVWcwGlP?!BYoSy)E;g%xl; zoxejc!l$ioVoK08*xw81ATp1kxPtNr=G+h}@DytKD~T^%{Kt_K@$oh6O?F7Y#Bn{} zI)oaD?^e7ZrS7~+iFf^c(jOvG3g`Q^`Cak@;t#=DNUDZ&AfLc`*8(cuzWE{A(HjGk za0BNDO#14VAACrEunXqV%S-fYnIE#c{$0xtEX24wxC|G)zDdyN^li=$9;6o_#`X5? zh;M`v)tw2cKF)^raDG{vufz{AiY|T>e$hTodJd q<;A*KyUv%(*U2I7MC*`tLOH&L-{ON`OZ^Iesn4E)-!q2)dip;D@h>v~ literal 0 HcmV?d00001 diff --git a/tests/data/valid_flat_bundle/encounter.parquet b/tests/data/valid_flat_bundle/encounter.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a1cdf7a9f8f827680d85e0cfc67d93aefe31d848 GIT binary patch literal 26792 zcmeHQU2Gdyb|z_6v0d3w5|g1_HOmUlGz;6}k3^fyx?2xPiK3z(ind6R+0-RDBXXpf zA;TX@WD;NzEEdKh2=Y)EMiFcmi^U=j!6FYq5EOYRieM2GMIRPLk%yot7C|41pa_C( zABsHmoIC%+ONz2Z$%~eV$DFz6p7Wh^&pqefbB9iv%>>LYpAAH#flwe20yUp=PaF2W?s&m>P#^Ds6d?G?2pJm(xt_ui`tpe>_k=P+ zkxNBR5}Hk+CePrv*6mumQRZjLHI5$>g=XssUgRZSRt)l?%kzngd1st??+>Pq2p`pW zx7-QMc)r)CK!>;}$U?a&`qt!fO{`Tqz64)~VIs(S4L3E2-aD|BYOr5V9B8l>zu)RL z*t;&zFI>!jKj)APQQ`QS@V}t$nGY0HQ19q(;R0P?C z_@_@ZzkTY^h#?RygGDhsVnDj&e$JSw@aSB?G{>9{<~_F_UCcAA!L#~oqtCD(cuLHd z7nomP_%1E{^$d4|CzH0tbCmFL)@b#bz60Feknj_)ZoPhh=^p-p)AO;DdHY%BHy&ct zJ~F9paMOKD^WNuLmjKsozOgHm`3(PR8@$agbsBRa<(iMo6SCn z>;F#D71%jZl&wUvKk+K8?*^_o+?|I`1_5psAH&4O!)EHTDjGcC#axtHM zS6B7klblej$hD@>yvy-&)4I$!Q`W#(lIfWSzr{CTD0t%=#Pl)Ge>**2Ihl`MV7~ZM zG|j>As3!5cr!C)bqhLUC;Lh>UpYdQ+yoq0R%gX z)?=6>h?;LZJ@=i=-(F?jyozdS7^uqp!2K7RGBd=N^_=6=E$@7GF(EX|)ncQ<8&4_P zK0_h%K11P1x5vl9qqN5poY1U`#SSsaeV6B57jr+ryz3{EVrZaLqdjCq)(v{CQ4@KJ z{f3NeRzAZJWz9*R>da z@h{zAjrqFJgbB2^6)!e;Kfhm;>ST=U%VIZ`&`W;|e9g;;|GBA~*L#jP3j`avJWAyL z*Y7cJeGhF!XW35@Rm(~D1x?G-xHE#!g$MFK?DbShKd2nmWeOBGO zUxpnG#AnW1U{iWJP4TAR=@r}c+OiXytaEA#?RLcjN@3qS=%CUQnB9WY38reh||WThba~Nu=8p* zIv1IX9vS=K%JiP^tPfaFn@p&O+d|-cr^6C{I(HK7sz;>G=mI^Q%qfs~oCr7Rp?8n z`#+qTN{z2yl>yV8D(};Fa!sC>n!Z-e*KBXWZo;?Ks7bz(0I7{{Kxwfq^KGdn3q@a1 z=6tZ(fMi%<$`!s21KD7G}e z#mtZjv?njDH7Z3}==QHQpSMzYZTB|Z_qGk(Pd1e*V7gJ;6*#E7Aao@`+!L5js|el3 zt%o~RiJrAXS}E~hYQtKGWeE*JCd}<#%bgAWk0k^C6U}yMn=dO)u2x4y=AU*DwuOg{ zBNg(nIV~8Ojo@1GFsDd*pS}r1rvP<(bK2K+UFpb~7Wmk}eD-dQ`A|k1^jjM0!iIoD zwN=ldQA)4j!I56aw^eKSWI={{N3qkSq_?R-^xlE3RQr6;w%doE_>?oEubRwTO|;KI zK?lH~+QqW61f=9+cXxds4xJ%IMWZ?R5^sA z=Uj^l!hXs`RRs;FsUm*TqllTup%u~2Q4DmZ!vHE8x?$B`hmjA%VXwQ=1Ol0+yn77C z(5O2fMD8NwdOccm3!^S=XpKeQq1dU#--Bq!6%Qn3VY6Fkby4#(wNht(WUWtWEoZ?I zhAQ!ciO`o<3SAb|7QSwPKO-56)on$ zC<^w>=*#KMXiDD9V^vgU9*Yt#bDSks&-*WWT+C_DJ2x*lz+bUUB6G#2_f6kx(@Jd@ zet*%Hb=viT=h*W6iO+WCxov}%SDNYE^=EGfnrPGMeeTknIh;e9wY$a zbQ^x}0WWV&1GU_i>K(nIMow4oxx}}MT(MOI^4F%}fR?;KiG4TjEL@9Bzww3v#)@Lv z@_QfnRT6TuV4_mk5Q6Ao;i1}ZTm_V>%2xK|2S*1$&}ohmg!bM{&o6%{U-Vy)P^K8G#VL||$#Hl^K_F?gHeg3G z{S0%OCDNzkVA@fj6DW%@<(W;|BhCQj<pgHMCunZQh78aAn#cbghRGa+nqcqiCQW-a%k^7@tc z-;#d(Tw#kW%dW#8{Jg}nb8M`_F0gEcjaSICSj^JChY#oV_Cy8y!aaNl7t)?$**zBe z!QX_*IugJ=d{RIayOEN_b|IOa+YGJlmKL+ElDs0U39(X2jBXWk;hn-}b*q%jh6=g; zt(Ar7R!R&sQ}LzkQfNO|%7rx=vD`^rPc5{7U$|PzZ6J>}m)mbH$628uiDD_4*(ym3 zt-@`fONN7`I@H`^3+ zT=Au9el`=#=l0hNn@jalXuVwke2Sbx9c-|(8PfD)g*;&3atqCkWVTbxMR%5^Y`YLo z_0tP6K9o)+Iom8P0`|qhSj)4SYFSHY^6V;%$y__1Lq3^1rP*vJAKGZ=L(v@-C(WOlATyAg&9;j<^ouO_ zFn-l<3_OFjME`C%B<_}kFxW0w=N49Xxy_~R!tHPX=AGi- za;#dCU|fS{+wX%dG9@k-t`ps}0b)16*eWlI=DD<+hhJ;vXBD3)<+1^=U6o7D>+^H# z_8!ieez;)%uLd9vKp(_OeBmHnXnkno@<{W1nDHphF4YSmvE6_En*A{fbC|2EI-UsL zvXqTk{K@d^gKXX@Y*_rsh@S)ervP&r;_k@igI%9P%m=$ZCpaILs^w62=T2y@Uu+T} ze~E?psN!cdd&^qvQR3Ut%+u8-$n72wH@8a=)61RkQ2sEQc%tHzGa|p(H#ccGPjH+w z^?Z(NmmtSLtlfdU-&J!8W$yK*rNs>;|Af?hxp0)RI=ctC4Rcz$u(Mh%OMUaFSWb#v zTvOc%6%zT)C1o9QY`zM4Fqv(uxvm52D9k6JLN+FrC2_Y94;#-m^feU4?hnO>HYQRj z8Le`Oyt;<$U1Juc{oVZLdT8|!>l@73=r?42Bf~lr)7I5kA0Y_b^Vup8$XR1aqkw@8EoD#(Q!(72+Ml1Ki()U{4){xd7{s@OEJ{ z7FZsPAM(KhwQ#PitRybHCR6cR<{S#IMliyteIfl z8SSWb!e%kI)Pc2dnAEf6SQx%{O0#7$XJD<>QP!(iLrm?j-k#fA-ev`?mzXeyV9t^C z9^tW_YOe@$dnq{p_0>H5iK#YU!LL|!jbJS|!tP*2b_wn`;!uSRVQuy$tkJT8I9xhx znPr#u7O_%`-NMLgsRE_j*bdfi7qGy)vYvqZJd}6Y8`yPa9lFNz$Z8GmS1UjY_dr^} zvT%ZwedqE1Rp7zS6Fj6|oJp7be!N}BnlQ^Q<3%YVlW+6*6l%Y;dN2}zU{J7vSWyS% zvw?rbQ_)?&AHKvAF~XV@p_TeG5sgM-(FW|I$*1vQ`c4Rn;&WR6SOy;wqpJ05a!;5* z{uWDwJzI`KDSb}YXOEyzKoJ5WI!U_17_c!?&s1|i-mhT)`Sg}*j}6jEp;rW8SA~fP zYkeGNmgSJXyFKgoN8oOLfq;VIJ>K9&`M{X6u`ZWWYS-*i7wDrEl!pzfekS{nUTM5S z{hi&l;GYnWpx(@`VE-%Y2lR(=f&NmqtJF%f8(6=^XKWh#`&nE6CN?SKgWte) z;Dhr9@?U#dE>z)+fUNJK#x9l@*OeHAM2YP=;1|*OspE}CR{0TApb^*vnsU*!_EOdY zNZGuS_WM`T+lR8hi1a~}Pwc+}hbF31na)F$`L~KqR>uKIBmG_? zY|#&9KYSE=?5Z3R74e~UA8Om1X|`VM0>3DG1zZ=p45@E?2kDCh5A<4B<`;1?C0-Cy z!QmBftl?$go{H420ecwvUEkL9>ob2)$ygn7UJKc?Z1+0&+q#CI-6s#{@M;<_3Gc1; z7;0Y*@w24jNwJIRhU$+iWRTX7sVP2UAZM}Gf0!Bv_F-A!iV`2=Y;qg)F~tuEs_-%S z1I8Mtm)Pn!NEAeTVZHw#|0nST&c>>RP&NUBDY%$NoU92tN{vlj@lu$Z4azS?o{jy-MVfB&6CKu_W=LaH7gz z!~R$HH2HnDnIELi zi!#~d9i*?JJh;>BV;WClA0z*Qec+r4<`P{rO2JR24&)aJ)=_wFOL9{L``^&}tMLGhVxA8qf5Sep05!xc#BnmzWk1BH zIbtuMR^t;r|C9J+sXK$2RexS|_a83)t#m=&m371ma{9$@;@Wu$>QFgYmKxukaP3U;=f zPm%ox2wzQO=r|<)g)U5;NX4(jKg~bMAe-hd4icM0e}prBs4;5irS|h7)$ikFWe`><&`1$Cq3YwK^onE Oj{izI9BKG}6#Xy5HKd~e literal 0 HcmV?d00001 diff --git a/tests/data/valid_flat_bundle/patient.parquet b/tests/data/valid_flat_bundle/patient.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c27c24b17f8a8e3918050c6daacb690e2b2e141c GIT binary patch literal 3911 zcmcgv&2QsW5O+*AZ6QFlT1N@Wp^DUK58Wk6n`N^CBqwcBhqN?qYbRz=<*z1gY$t0w zNz>IzZ~}2c91!A&I3oTCjvP4h2k^epZZokhP^~i&CX#$W-CY^0;*$kfuWisk=}?HK z(m#ZfOf-?FBI}8G)Y08;-%<6GQBP;+7#;g26#G7u{xH3|+*M)=z)%cL0a=F^MFGFg z+?jw|O@={sp2=jFAPb{BPUsw_Uthb%L>n4G@>wYMWhniePX9t9mf+@>Lt)9Lz6=G* z<6)$8D1N_njUv+0ZB2IqNd69|e+v^N0S8NwtTNF_KElzTHz7xJRf>mE>eCn;?V6iI z^w&y~p~tmaQw+yz@5-KEsINn@??UOXR?>e&lK5!9+yE*6So}Mr$VSqd+B+VxyYVkpo~8`AOqXYB*X&!in|ieI z$8bHbd6k9xStDsgvciPl56Moe}5y*zK&B zb3-?^vssxB>&%0Zhw=Nip>{PNLEq|)ri!j|p{pJuvvg0^WKRb3rzzbAbHi@o-qVxv zy@#o10Wh;DQo;F(o@VeYXkQ5#=Iz-Q034q-WoU~Lec8>%@h9bCtZVtr#+%N;#gH?q zxu{UUmo4%_D(8mZV&EkPbKw=3o4C)O>9L74&|xGv>261ty{+p}y zM=ZO;7F%qYWm{~iMP$X|jDCnh*fZAGhPyrwlz1U}j%9}|?1MKtDV+j~JV;cqFTTfF zW?!lZJB{4oK-m{O#Xc}THj0Wqsyecm?{F907H3!6ilv#HF>G;GzO9I-Xw%okOSe{H z4RxDut9Bh^>~PEv)DriA+ud`yQuRu!2t)L9(8y}2Z{`ZFCT9DMoF7|iOk?(&V%9`k z#THzpgz==FBwltJxg~KcWx!s(YHag)wSt&fLlg5o@ZXV+IgAY$vqN1dWb{Un_~opE zAzQ+QRw8;&YVd9Fk!cfQ0{xQI&@eLw0==e3g0DwWGZt%Ut{oi8aT zVDF7ygl9(ozvZXeQ*FM>Lr;u*$dX0DmCA)c4i{{OO}Hz~#$m8F!1I$klT{kUe6UZI zT=j`09%lTrXHBhAM*H@(_xCd{_M>8zy>YEOU~fVUQ`)Nzl-#A+6n9A7CCfCG3V;4G zw##bUv~7U9@ZmeTP1!p zOBks+bD$X5!`$WJ@y@Vz!5Y}BU^}@7IjK>-lEp%w__*Nu2gc5jvokmsS$2PHhw}{Q z-64{a3WOzOmkdpusqEj zP4~B!?k}OV66lFKTfun?IN?2hYA4|7AuxMo0*V|0=lA4k!lG!y{OL#yKv=!P&?*vC z6nSEIij>T6!m=fN!M$bKgI%zESjS|55aOyVej%XoG)VoF;IE=Qcz$H}tEQSOWL4|X z6mY%}OWX=BQSUz=Q1OkP>^$k~-aWO~yGOnRH^yIb8~)c{;xYV`2!5!~qwrH1enS5P D@VFzY literal 0 HcmV?d00001 diff --git a/tests/test_ingest.py b/tests/test_ingest.py index b3eea81..aaa2cbe 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -9,6 +9,7 @@ write_metadata, checksum, main, + validate, ) from fhirflat.resources.encounter import Encounter from fhirflat.resources.observation import Observation @@ -1225,3 +1226,31 @@ def test_convert_data_to_flat_local_mapping_errors(): ) shutil.rmtree(output_folder) + + +def test_validate_valid(capsys): + folder = "tests/data/valid_flat_bundle" + + validate(folder) + + captured = capsys.readouterr() + assert "encounter.parquet is valid" in captured.out + assert "condition.parquet is valid" in captured.out + assert "patient.parquet is valid" in captured.out + assert "Validation complete" in captured.out + + +def test_validate_invalid(capsys): + folder = "tests/data/invalid_flat_bundle" + + validate(folder) + + captured = capsys.readouterr() + assert "encounter.parquet have validation errors" in captured.out + assert "condition.parquet have validation errors" in captured.out + assert "Validation complete" in captured.out + + Path.unlink(os.path.join(folder, "encounter_errors.csv")) + Path.unlink(os.path.join(folder, "encounter_valid.parquet")) + Path.unlink(os.path.join(folder, "condition_errors.csv")) + Path.unlink(os.path.join(folder, "condition_valid.parquet")) From 6258eb473e936797e4a27160cb750ecac8192b6e Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 1 Aug 2024 15:49:25 +0100 Subject: [PATCH 08/19] Add some tests of the validate_cli --- .../ingestion_output_errors/encounter.parquet | Bin 0 -> 18137 bytes tests/ingestion_output_errors/fhirflat.toml | 5 ++ tests/ingestion_output_errors/sha256sums.txt | 1 + tests/test_ingest.py | 55 ++++++++++++++++++ 4 files changed, 61 insertions(+) create mode 100644 tests/ingestion_output_errors/encounter.parquet create mode 100644 tests/ingestion_output_errors/fhirflat.toml create mode 100644 tests/ingestion_output_errors/sha256sums.txt diff --git a/tests/ingestion_output_errors/encounter.parquet b/tests/ingestion_output_errors/encounter.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0b186b5a43a4564864016453085dbeb91909370f GIT binary patch literal 18137 zcmeHPZD=F+nIFlH*ZY#qc9T)$)JYU|cj`UAXhyPZX`9lFY)h6;*4kQIZ>8OiG-GS* z(MZuqeoF%(giw0r2qi52aFnGSC4^AQ5srm}qa5WvgkH&~qaQ*(lyDzHIgaB#gb5$dv} z)_vU6;}qqG%XFai^)yYy zqx7YbAT@Ml#C=X{6t<*dop!p0UH|QH{ilPeGQ+)9jZ6Siigr?PxdMMt6gkUC!k!o^9oYLw- zP50O@ovy!kGQT~?{G7PoMyaiRRr>p;b)o5Y>mI-CCbn_x$mMx3CTKMKC)t@vznvv$ zg3;Q~oDVKG&AHNMj-=`y{M_OClY{xg1?J=P7=XR>+FJCsgWgDBXhQZZz4ZccS*q8o z+D)HNQ!8ajoRZazicc%nf9MZ!f!Q$2vbF2f=xiv!&ax~A4tkFEwx!YwhbnOQ7BTe# zjweDS^(b;3a12~#K7PVtBJ_;3SC>>xuBcOWxh$)jDAo(D5lYSN@2mSqjGdd6OcXp}c( zspi=cl!kQUIS{#}dV~!xR)$V4pFgCx`#J2}Te~fde|qXr8tcGrH;w=6bp6iB{QODg z-c_XWp~2ZQ_)`a6G8v5jtY1I|X1NfAA?MZNN}# z%j9d%G?~0mR0K`y5rD6g0DRZY{NvMo1MoP2Y}xyXgMI@Fu%6FR(Sy;|s8{s~(NnEe zijvrTnV#Y4>_AsCi zgaR{umi6N*c-HR^%|O)*Sr5<>0HVg|i3$3Wxg;JOJ8wz4hfM|0|Ea_PTnDZ=aFI6f z>wmdOIS1aLn7dOJ8~2|OWnoi=DAVqUFh{pf$mfd{@Fd1x+XQnCUKgcmeF%aQ{M^m|>wP*VYdN0T>I?b$Kvn=@n7aHUZ%jN=sDNz*qCMc5UD+IzRCcYm#G#NKTFQ{eR2 zJO>t66=9zcdVj$6ME$=h*%>?ZvW%TQ%r@`PJ^DGd{(0|hwe^2)i zb@yrj?N2Ut2B%|~VtyYwJvN85eXTB)37Kz_O~u_=<~K7)W-pw*4X~xqO^=!sUYfvc z4uv^b^sqr2&W6o4w{_RW%Ds0lP{_*XFIud;(>G%W8XYOzKS$qlxZZa#|NkxK-@k<{ zSU_4J+1mM*gT4#XfgUW2{}Y=Bt7>>{-5V=x z8=B@%La+NowDfe({kfBLPg~OeY|!_PKXWL$athh#7vhz#81*6 z-DTt6BJ)A)1j~kte#T6ui(}^?%$5a3sL2APL;I3wXzZNP_N$$H4%a&l=F$cUH|H2-d$n-?G|!y62NR(JH=Ys z89ZmHdf0~T2icxZvJcds4!Ztfkojbt`PCXa=`0y&=j@R$W`dM^2=;eUS)fNL`qFi$ zbv%8YN{mA6(u4304t(`Sn?q4k4*PZ;G8&CRr@POo8zkva;zbxMIOry;cRcGRP>hqf zL$#ncj-jRHz~%V8jycv*cGAZHahSSeqU&6}*&zYvSx2IS6g)NeVTXx3QQX|0fmh+l zSOFf^b4ZT>vVUuhFmpB1`p%dUn^A(x&cg=y(!d-#-`OGU5&kwa#e?@gPN=wsM@C1! zc=Ck5$6`+POC{#CzRh7y@q3js|4jvRhQ9+~&e*9sbC%AgzcS~$%o#l%J<|u3%wNX2 z-&gs-?Q9*3F@HS;(tj+&e5HGxwb2iT+n7{1Zx9e@rHOvgVKAn1sqDCg%LsdZ) z1?9QIvbi%y2TueN|`N8vSo7>7#QptLqT6dq*j)d_%Q z*yxqT?!okT9plJqeP+7bYU_22(OrYI{qjopRF{SJG94g}M}B`C@C&eR)7wxO5+{qH z_iwMd(e#(?z=glu`mRCH_qrWr8=hSZ*^h`D<)flAWlsZL^JhZcXxiIOZ*Z2Dnql{tMhW}vi3S36=;=A_V&6-G@G>XjOq_!YTO6KeZ7?^pMQ zTCK8+{Om|I{4p!O<$_b}6zhfQ*lS>DY4lIQKi*d<_a#1+x|O-9e?{TGKk38IE}!Cg zeii=3&)@R=G#}aIqddRK&ux--kw{znF1}o_+86jG{TZ%A3u%w@{4Nju;BTTNHwSI- z3?B{TBeU_c(#Xfs(^-FVr?8l=7u03>mK-Ta`%5(;6Woq(RW{@5QmIfDmAJgS87~J* zh0Hq2G{nrFmYCz^Vjxv2s;lsh-!-0higWRq`24CCpIbT*W9ePgvz+BbY|r__nwa4l zS-(DZK5LEH$YnSMWeaLrE6m||`D#A1x1IIdj2nvr_UJ)2kO~%Kh>4fQOt1?2xAV8- zI5uG9cBOQfm9i0{H(n0QLOH!9&XM+dKAS3m4mNii@Gfd{J`>z3_*2zFxk&V-)2Ucu zZByRpz+T)3`Dh*ZGjSmg=~mrkA(Phf(XeGxTupMha#hI%R%OD|N#lg7r!L#a+xG6W}@19 zEWIye!rO^*x?ynE?cZ{AOI%Es3ekvCEGs+txu8-gf4RQSXG<)^@Bv_LBvgpw4CELW zseo^lL^-*Whdk-Qw<0a3l!BTB-%K~em{N!QSJz{uQqjNOXvrzNSkZ)RlFbCt4IzVL zq(!4vaD{9n*qWaO|I*8O;Pe=A(-#-imT4oGZTtakv2>&8S9S_=kQFi<PSNDy@04)0)$TOq$JQQYA6AVAaTt z+q;`kPn&h1oYcS{2Z#7$s{lBQ`=lP}wf9lsR8!!Co;Ae8`3U)pZbO_O7=GS{Slj|1 z@8|vL1-RcW1d>&VXN&=??Z!OTIj_LHlu6Y=UeBF!s#^3XA#aw{{5sqRB0HcTVhHl8 zC6DatP$8xqh|xM%C@W;V?1RijJ!<)pG4}Tv%y}Hxr004b@aFuG=P<{o;Jz3O9|&2f z9i$Fe`SoSz{kD*as)aIQNp3(*fjsV6yL!s@@sE+WsgtZ#EjX63X2SZU88d})QYkL3 zLT$A;wW(RDYAz!-FlP1K4?;ZazCe9Be|lG(dz7`S4IkGrHymb9r~D&~spW=4Z1G0eIjQM5kFW`l2O6Ir6SA0Hr zh|FV0p0D6V-;cmn#@M7R;8rEp9cLtOpeeYr<@4yocEOQ6*+?3jNCFKU4hw+5Yj1u8SNYvp8 zY0GbCJjRKAD+dN%miSMPKLVYE4$+4-iH~4jY~`~NmR}*`X}JOQ8^!Ox%lSyO_uaP6KTlHE!N9^6P^!MNw32LcFHIyGmd&vCKFaF`2 z4vMUQuT%`cdIHyV@EpMYdutHN$C;mqe8>l>11ikUA(URi-uN1q$@pYYwnKQS_bQ+4F`$ z+jnno#{B~5^sPXS9bW&M^HBoj{cCV(@n35_us}t+&Zyw*WgDCM*gW(f&%uTG)9?pr z)BQ=LZsWD(xY3qQDjus_s)_~1W*hX1L}p8?kR6Y>B6 literal 0 HcmV?d00001 diff --git a/tests/ingestion_output_errors/fhirflat.toml b/tests/ingestion_output_errors/fhirflat.toml new file mode 100644 index 0000000..2103d2d --- /dev/null +++ b/tests/ingestion_output_errors/fhirflat.toml @@ -0,0 +1,5 @@ +[metadata] +N = NA +generator = "fhirflat/0.1.0" +checksum = "437a5f50450c7671084451fc83d2c049239bcc8686cdd663823589ec689d872a" +checksum_file = "sha256sums.txt" diff --git a/tests/ingestion_output_errors/sha256sums.txt b/tests/ingestion_output_errors/sha256sums.txt new file mode 100644 index 0000000..2ff9e43 --- /dev/null +++ b/tests/ingestion_output_errors/sha256sums.txt @@ -0,0 +1 @@ +8980474796eea6a656ae2d6cf5c6d31f290d97e2bbe39eaa0469d2ebed8e3df1 encounter.parquet diff --git a/tests/test_ingest.py b/tests/test_ingest.py index aaa2cbe..3b378ef 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -10,6 +10,7 @@ checksum, main, validate, + validate_cli, ) from fhirflat.resources.encounter import Encounter from fhirflat.resources.observation import Observation @@ -938,6 +939,45 @@ def test_convert_data_to_flat_wrong_mapping_type_error(): ) +def test_convert_data_to_flat_no_validation_warning(): + mappings = { + Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", + } + resource_types = {"Encounter": "one-to-one"} + + with pytest.warns( + UserWarning, match="Validation of the FHIRflat files has been disabled" + ): + convert_data_to_flat( + "tests/dummy_data/combined_dummy_data.csv", + folder_name="tests/ingestion_output", + date_format="%Y-%m-%d", + timezone="Brazil/East", + mapping_files_types=(mappings, resource_types), + validate=False, + ) + shutil.rmtree("tests/ingestion_output") + + +# TODO: write a working version of this (needs data like the private ones) +# def test_convert_data_to_flat_no_validation_invalid_file_warning(): +# mappings = { +# Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", +# } +# resource_types = {"Encounter": "one-to-one"} + +# with pytest.warns(UserWarning, match="This is likely due to a validation error"): +# convert_data_to_flat( +# "tests/dummy_data/combined_dummy_data_error.csv", +# folder_name="tests/ingestion_output_errors", +# date_format="%Y-%m-%d", +# timezone="Brazil/East", +# mapping_files_types=(mappings, resource_types), +# validate=False, +# ) +# shutil.rmtree("tests/ingestion_output_errors") + + def test_generate_metadata(): meta = generate_metadata("tests/bundle") assert meta[0]["checksum"] == METADATA_CHECKSUM @@ -1021,6 +1061,8 @@ def test_convert_data_to_flat_local_mapping_zipped(): os.remove("tests/ingestion_output.zip") +# This don't run intermittantly - because of the "#NAME" error i get with the googele sheets +# Turns out this is an issue with custom functions in Google Sheets, not a Python thing. def test_main(capsys, monkeypatch): # Simulate command line arguments monkeypatch.setattr( @@ -1042,6 +1084,19 @@ def test_main(capsys, monkeypatch): shutil.rmtree("fhirflat_output") +def test_validate_cli(capsys, monkeypatch): + # Simulate command line arguments + monkeypatch.setattr( + "sys.argv", + ["ingest.py", "tests/data/valid_flat_bundle"], + ) + validate_cli() + captured = capsys.readouterr() + assert "encounter.parquet is valid" in captured.out + assert "condition.parquet is valid" in captured.out + assert "validation errors" not in captured.out + + def test_validate_fhirflat_single_resource_errors(): df = pd.DataFrame( { From 6079bf7c5595630dd72716f0a92f896b79ceecee Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 15:24:34 +0100 Subject: [PATCH 09/19] test compression --- fhirflat/ingest.py | 12 ++- tests/data/valid_flat_compressed.zip | Bin 0 -> 10492 bytes tests/test_base.py | 140 +++++++++++++++++++++++++ tests/test_ingest.py | 150 +++------------------------ 4 files changed, 161 insertions(+), 141 deletions(-) create mode 100644 tests/data/valid_flat_compressed.zip create mode 100644 tests/test_base.py diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 875066a..df7ec14 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -588,8 +588,8 @@ def validate(folder_name: str, compress_format: str | None = None): """ if compress_format: - shutil.unpack_archive(folder_name, compress_format, folder_name) - directory = Path(folder_name).parents + directory = Path(folder_name).with_suffix("") + shutil.unpack_archive(folder_name, extract_dir=directory) else: directory = folder_name @@ -617,8 +617,12 @@ def validate(folder_name: str, compress_format: str | None = None): print("Validation complete") if compress_format: - new_directory = directory + "_validated" - shutil.make_archive(new_directory, compress_format, new_directory) + new_directory = str(directory) + "_validated" + shutil.make_archive( + new_directory, + format=compress_format, + root_dir=directory, + ) shutil.rmtree(directory) print(f"Validated files saved as {new_directory}.{compress_format}") diff --git a/tests/data/valid_flat_compressed.zip b/tests/data/valid_flat_compressed.zip new file mode 100644 index 0000000000000000000000000000000000000000..c172605d22dd0978e38f8babdc1c20036f48deaf GIT binary patch literal 10492 zcmZ{qbx<8Zkf(7cKyY^n?(UKVhY;M|dAK~>-91=vf4IB5ySux?!!CPUceho$H#1dT zJu|=RzoxqSn^uyAhQWe>fIxsawt&>UwPE?;@y{)>fPldGw`*!=3~;e?Hg#mSH*|Dx zF?GJs(zPK}ccrmZb2{V&_aT+{Q-(syOvP;&KY?;0 z4#k`W@!wPceoTfTEEKloNRf@7C_`bXq?oX(!%Ag#3?_)Pqeh#2^3*V zub?^OP+L>A(5|1N%1Qh3#ZAs~eDfZ}1qgYT4(QV9~2(VTWz$ zK|lmes;`{s1UG2e;HYXV!f$gQQqbW7FUD1rpPwBe2hj~{~mX`rW;D>B|KXl z@PnzZeiJ@GxpgR+w($9L2;?xoF9Mni4euXwJ0t_&9SsY5OfEGpd6qqK)~1-|wYwDG zCt7|CNO8_5EX;7t2QN5HH)=AmvKu%7qv9d^a^`>+ty_c7y1$bfw7E<*6%t;M%zDb+ z*KJ;><-G5)9x{;oOmgtf-Le-9SgfkcT{`>EFh0-mPnQ%GqwunMc96-xU!+z}S)=Ek zw96i`n_)iqt4=TjLrb0N!5Kt`BDDvlRV=@{+MnNR&ee{p(ME}6!vdVNs&t<+j(0>F zFokr`($05C(`|G9l)7c-`jG!lb+`f;1Fp0uFbC%!eFdKLao zE;owY#z}=&Sqr_#Uq9?2wXnOB%#P=E)xJp~)wTg>m-x2Id3&BHf2X9^s~goFSnAYT zz=2OMkMx2K{dlOg$INWn*MS#Vv6N<~<&SM&Iu3{;*bK9yM&RfKvpSxm4M^SRdHNQ3 z%(Mh_|D1*3`Mi$0*yEUYn^Nie=t~@w&x|`$Q*;y0Uh~CSeU?GJh3s~*kBcP4#tSsK zGDkFV{Y5#Wwbe0g&g2;N9=h_VZ+5D#UP7v$Zu2`2l(9%^muEJeEW?;L(4b}Lk@-f| zq9?S^?9TiLCxRrEu$gF);b%zMO~y18 zBm0|b*;keg!&Z2nNECppi$-{vK)u`D{0*YA;-f=aGLmd9~UNH2&-B*6$E-)9EZHY&4NUlU?d0#(~yWL~eY23KM zq2lY73_L3%fmTi@)%>g|A9pz)vqEw!2^*i13TXdL$MNvBxZ!KPcZ#T)ZwgtwO|0U^ zT>TIxBg+CzQLwxt+6Ab@llfHW$V9X_JzlkR=~UI9av3mAw7_7#KE606mFCZf`LHr! zIoaJF^kWe7I+CMNaZ>5YYhF=JQA=eP3s}L^s^We!RXN?Lyhn{*qpM?nnwcnI)=!X4 zx%a(ShN?a>19dI*v!l?KbkYe3r+fI(M5+Y%G`Niq(_+?QacvN63sP(-ifnpS=rL~- zT4mv_|Eh(0y57}cBDAD@gj6B5O+Cu3qPIAql8UqU(Efw=mtlLw{HptiQ&HhJR_WZXeJ_o`x&c6#}87 zH2l44dJ@Dyy;jelL>1l4_cm+vwkx{qkbAebnQzX2_xZAF(6mcURn9kWH?#$}<>RI; zD}|#MKHkN;cKN4Nac_R3IX*pXH{xM{Il?>uSLw^eC3P0`wa#&2HDPx2P3k=BWd7`+ ztFn0q3+B8&%^(KY0-NXabhXU4H$pqr++M0&R?NAa<&m?AUq6sa<>+8>S_wSy<>7I% zo($un!KSeF5eRg!N=f&_@a;5)`7dW=yAJ^!uXJ`gmC5Tg}r9kzfp z#hZq7=0^!ve}gSKhU?cg-Fk{%o^X!-ETLe*w+t6*f<2n4kvvA0yV$A_7CSmZQJr(` zKKKruKk!t@KlC$BAQmszKLwSycELpOMaJq+)4Owa>nHugBtsXnD+Tb}^z2U8SLh1( zE5{Bo`@Ue+a><9_LVxzWOX-M8Cu7rgA!*}yYP_+FONA7t^s2cBYDgQm@r=5tw+SBC zmgWA}GU^oEx*Hnjx}vlU7n5=o$qi$gxHj44?flV!LqTkl+fTZ=tMhsOD0;@@*9##< zxMvXz7b(foc#&-+p#^1s@KK-gS5ixPrz(%7X^>?Cm;3>KvtJ2T|Ne)qyzHgo=(e%M zQr%P8{7|xG5B}9GU71=xRk|f##6oL+uR1UMSUuHj;S4MAby_daTi6Qkg5x~qm@piu z4g8H~oKph6l5Y#eNqm%bE178^(E04SNpJz}ffct}`qVWppJTd-9eRnEvIJWD=6K{! z>kg^*;*whK4f@P$r)24x*cZ8uy?bVTN8WmSi~NR_X?N_J<}T`%Lv?%tOb>Yg+Jx8^;==nT^1hA$MeX&p@`s=1L!#pK+pTEK=b=8=n0H`MgI$k^sv93|)4Yc>yU zX0?`dR;rZF@jNURw+bEoHLv&;E6#7W0MjQFqthnZy?7NVk^PphP$-8`sf^JNvR4wX3JXn8Ulcr|hP z@OPzi_Aaf9^DeD3kNqxH+U#X@NN#hNH<*V>VTU+$YnBI6;Xvl2rjtnIo$z8SL12F? zIdT!R^G5K3aAdBF&1?K>U*Tl%dRGw|Q*4ATy&QkvqsTvzb0Y*YLjBh9ihQ|&&a^cQl`M-8jSMgXAPn~CSp9yhwQ(YU6oeQH3yHQMH_b()->0DwB zML8oh4Otg|gyfRXBpgO!T02@J)tNHZMMc`Cbk3I!-%HH6$hm>)1LQvcc6irKtCyFf z5O}97auZFwlA|O1p+cgl4@ZV#eV!d` z>-ptFf_2={2Ob9oe1k%rb$NcQxJgCU(J@E7W-ok2nk~v|3WJ(F&RV<3wt=yb=>3@{ zq+53_`amRNaE=cYIeC`lm3%^*R1Z6gU$r5^%<&@k*|+k>&x(N+);B+8B069y(yQXe zSi(^M(L&tN>b<19q&o(%k~rfRS`sRmGSUC5QhG5iJ9?)6=!+&WV;*c~@ke2H^vLv& zULiC!zvLV>up?*<%0K&q2HckW2rnyr(82)w^kZfV;v+Jc3PTJ6Q_7#-o`W7KT8Foz z^(vh7fYOpre`UAnMO2vnIZ|%WEp3ej_im@2HkFUrF8FPrWIG}6tR}GiXgO=@xs_8q z7rO%kb%bn6ebW3fE($J#FNJ%_QkWJ=TsV{da3C&o4YIX*DhrJ#KGO{6x$gCbf)L|R zGeGEYG7@T75mC8Q6UyWA&8NHRO8y1OkSg`?E_>O!MQ*+Dz73X}2nxT{@&2#Yt`-|P z%V4ad;dFWMLE#*sriZ7{Axn{GbrXj7Aj1tk=s|<;OunTqeb2;BL~YD7A)07gWpHGn z8!q%U8|1^81B+rqLv;~6$3(LyvW_(uy$*Da(?i&kI7zB1DD;A<{t;r#7g(XcZ`TeGa8pfHq#(Hd6S`Y$rlSCZ{|z#bE^YQ@u}i^244nUw9FE1c2b3 zKm5*<6Vl(^32hdZu_QuYqPAD>o;P6Gm~D3X)(FFda9x*>HhH)8Z0Q_-wRr$qlON8x zYDqo~;b})Mzeb&my3e7~1P=bJo68I|Y}N=XkMWkINlbAh7nyDrcy~+G_(8}<0+#`J@Fe$wDa=&ar9&^gckxeyR(!G3ctl-5-+Kv^ zx`w%aZS*;R65rW3VA6zkOGmaj>Jj?qh?rcU4@V??zA$EgpmL5%RH#PFwlwJ_P!0Zo zhU86h*!Gza`bM?2#T*dPy!$n3jU=82?KW->x$@@Cko)m$yP^Pwncd#@bXRb|Lt&v7 zAII@~JDf2|Z|fEbktehze3eM{BoO}(&7m`-C8guzp2g2W$8Hqyt`6@x`Cx~GHw zz5H{ZS7Oz&klp^eKmBMhrQ@$yJ63h}b8LFg9rpbdm5njf0WPnL(;M04*Jf*vczO1? zIR!TsS`@)H*7TlNAM7!#rw`cud_o@w^z7`9#8KH~h=?(q^A&GOug36OeJ!s0pU z&xjQ{Sx#13@e-5JCasD5vh{|x*M671dQ`6c;^#+?`GMyGagY7C{yddO;O= zNkZ*6D-JD(8O|)Jt@IGO42!J`gk|~8DsVchM^2C418bi=_z!NNhJ}K_TbYoKdv(Ni=FI}&*2lv z5CphtL7gDA(<%33#v*ZDjI6G_!G6%tiH_#QE&Q)~8B!&9<)CgL-9C0RY}ee0j_QNu zrAx%Nt~aYSo(O5)k8|U!22vpa`A4*5O*~?b=*v%nH3l?m9rA?vLk9P1q`bw}TmC!m zl99Dz0{<_&FPCd$WHOf2JI;30UubH({Ejy)ylg%z(XILw3fs_>cUfY@B)3rxq8FBj za~(HD-?hVhO_8G&V=n+bG$8A1g$M{j9YJ1kQbGUV#QNF4(OUHF2 z)h}`Fgz%j^H-WS@a33@bQx}BWRwK+lU_2fW!FA;OBi@#EHxjg5x|-A^Y-`HkF(a70aDz?;+mL1;5tH%;sNR5CvVPvuQ=b)4Kd{6 zV<-F++Av=;;^Vs3$>AARV;A$kciw;YBaD+&Y`m*+Hsi|WqdpfORV=(oQ{Ji{!6nj3{$V{Km$UoNTK9&Z<(AsHLzi&$2 ztC|~)Kk!630?R_KY|vFvZeHu?eXYX0Z``$n?I_yI+jQF<21vPRVCh2wB;XfuF&>qV zH*t*hCEj@4P{CHO*av}2fUr`cesuyz#E@V1I5Xfa665zO`p4OYtQm0ZGtbve#3f%A zt&qDN{?k|AyiyvoQPVOIrpe$^O3mrIDkHWvmGd#cF zzaL)BJn@NQEB>9%LH8{|&8<^&d_0US3h~Vskaor!x;lPkW(7iyj8O<`bVb)3Ug6DD zkyP9t>h}So(xLiarPgyq-x#bXH{Y!mSV=7fNrj$bzCligbMf-DOmk$%!T`xl+)URW ze2zO>TQ}jAfv$X^M+|$w9I5+Pgei$<3PV|!>O zZ<@Hxr-7|e6*|>Q7C<|=p+s4@fkdMW5~(C!fDw~F8zOVQKje#GvK&82mHscMJp9Kg zmm5FB)lnfJeEI*AQyK&8Oe~!(0e1g~Q!Z%8+Yl~c=*>ph&_?W)>7_Vgg(hD_6Rzzh zd^MeM`9ekgA(cR_wX08nuuiJXz?6#OE2$jrpGOywhY;}>AtEodmoDI+M-j@27}0&4 z8%i-;GyE;|FG2tb@ZTvy0NL@l_N8O0PQoua`f!89bspcX@eCe-nL0Egnvi(i{rOkh z;!=kQ$B%6yDdiT&nj!>{Xer|e^Cwqo>cjXnmXLw~2y#mkm_(A5nGedW+jY_c17^D(PqBj^3?C{AAalx~E+)Hb$fz)JqHZCK6F^2Wl7uaBLV0VIN$J^cPVoW>8JpHrT zU%)1$80I2y)!=2pa0n@3MFS!q-m)RX4`mP?=jBgThd(SxyJ3u~WwcW1Fd;$vimpQ$ zu7?j8hZ~rIkS6$L(yN8Rr{m{+)oNSJqj|_RnkLnGMxAUO zOj~KpDR-CXLGKFrP%X!Ykr8C7g?Wxyq3h$gW6`NH2Ndva4#ZCf{M#%R((JQHKiVxl zAy(V$at=eY6=~0(`5b#)dneaD$Kw?LIF5M{IJ9%`K3^c*t>E1a^bN&qII^!{$q42Zd_M)GFaBxJ6l$yQzI3MVb8VGmzEM^D@c5Z-~Q59=ZqahSaR z9&_CDLKPw?6;Hr@63t`ZqN31dP1u-h`dTFv(J;sU!kiqM7?z=6v6arNR>K@a4KP`# znS47g)wW%w5rt$^G;)g|nKR;He>^frz?rjjh}kX!Bl4pt+7|DdD=N;qRtnM4W7o7o z6asbW!(hf4xQD82#;dx9-i~BgvQh|bpk7j_hnvD8n*JbZE83EMI#j|sRxsWU)RG~E z&H3HrmyZjmdzRMsyJvU)HS|gnLbWEph-fGgDR#-Ld#FP_C0(y9gH~b~&`QT$BBY*) zGkq#fCslM)e-=F$j9hxFa3S2d_AzPRB4c!2hVwQDHMsRiu}RB&)57c2U}a1MSe8&gK9-N1utQr3V>gy zU6p3Yqxe3d;3t}=+9y!H51Pn0l;6Y$da&6zrv*+x8r2qzzu3>7d9!=A)QwhkLGsJm ztD>@euS=2leEi$cCv&Dmty6uHN|4UL%aap}LTSiLo(=Rq5%#bJ;R z7koL3f}uaw-yzYI=!ffvm(n&jZG&;TqE&E0sq0TQ6to~T?{?3L1#tr_HU0~?xU~x7 zJmICF>o;iQ?)58wF-kOUs3PW~IUu|T9tLXzr4Xxgc((zo1yf|OH>IUTW81SG6{_pe zgI>{GSvaRbxljfXGM)EJXbok@Yi9mmBD2ZI;~22mxq)5Z5O$;^Eu(D%E}cs30%CsP zx*6WQ&&fKwt6^5kS}8Qwy473-N=9j*kW!HuXwLm#z;FwzEuaD3p`isg5NA<4$p8Ch zS$M6*0RM!HjEi04TuJA;UE;l>DVsV|igYZstPUT1F^-8$v`&joMLt##o+1wj=rwoF zDd5s%x+;3F;2Hm&ZwM;9|6*SL;h^MhOR?>yDq9~J2I`sgGaD7g47$gQj0j;LI#*B= z8BYG5--qS&9uKrrR}?&L{7g)>HHQMRd& z3^(yzyCn(0ld-wEOCb&5Zs=+tdISuZG=4=MaO1t|43&N1BuNMK9~$k@AK%$Jm7>lo zWBz(gE>sd7#W6{bq-tyyZ~oH(?S+vO8ac15CDTYE$6}(wP$Ba)EKY^O|8=b<%Sg|r-xO3`n;Pw>kDAAlt~34{ z5W4$OS%CNod0T8opAf@XuCrzTZBeGz_zR?he!Z`67c~ z<%?^A0o^~Sf}7~0CI@fYkHPRZZu+XDSwxTUezSZA1!f9Tgq^ZZ_!IISY_g+2wH-I~ zh?-F#Vt`IZb5$=%)%j!i*(mpOu04mtmO6nt^ya1m*Y*YG($4Nc8W_b~P&)uzoALO# ztsJnXLtZoC8P)hC`z9AK=R`L0)t8rA^B}*rrs&)hiJFfiA3Ro=U_+Dar@?sYVs)lK zY#qv`)|~UcyE4gf4TS6b-Ap%MSyHMHPXO;zx8hu(?R~n<2fBlPg51x)B1^qpN=Y)H z%+M2Kyw#%jfjAR*fBg7)JCu!Ey%cW}{WR*CxnF)4To7SN$#}C#H6Q82=S1aE?p3xw z%V1+gqd8W{`kBy6b|(`>vn{Os&0%wefHZpSO-^aj-SpnrjWR1KchGsFsc%&4fP5X* zQ>R4x%it3);2ptpKlGVq1!mupJDtu@&!wjw-rdv?Kb>&ZY=wGW$CDA08f1~vM)1@UXCtb3 zlf1d`y6AKQBAk>BkyI#xA>jINZkmQj#b%i}PZ zu%aIOBEJ3fd}Boxn+$nRw&7=R+Ia!4a#T&_8}OD*=AY`wa{fTGYu*^H!=sU~VI`eD z(!3qo+x3t8b*4d9`{hmDjY|-@EM)^HTS}g;+ONQ3$Rl7t3GSey+>_NeKax;Vy5ImH zx`Z2_OKUjdly%KA&U3>Y>cCdN*xUBK5ze;~9sPm}>7+c&H`}GE>^Tt8TP6C23T*#n z_cyQLZLvCAGy^{Qi|upr)iC%uImldyzBZb6I!DgIX);e_>(!fIbRs-WMsbs`_liAE>FW^ZKHr8sdB(r( za7BVT)ysa%QnAT;pjgkeX~IBhv2?Z5?Mo87OAi(iIosq|*;_jwKtynHHLi)JjkK6* z^I7oH(|Vr2O8}(Oy5S4cDsL8#xs{?xN+UWJ$dKg2cl)~N^{q7tOY)l;!)X3^d1-Wi zC`Z%V-`a8@Z4`EH2ar|R)2x3S9C`@YcpJp)Yq>(E7Jx+!#wda1Uf#+y7MEqzJNA z9wPH?O{>#*5$auURO`bgJ)M1!+I6Mi#`&TGpc{m*t%#8bg8qEqU6*{sfzeZ{KZ7^h zXoA!=NVcpOm7OQ>s?6z%%aa(olrYPeUVfeMdxW1^br!gkRsm>*NplKHSZ-p$_m&Y_ zs8oWV@c*w0yZ%>&kxXg!0`$kLZy2FTob{FA^5X;r$%*y)uG0mKj-Q zO_dox0#in&mHEN)bYO3~7iC9FpOR;|xLmwH0fxanf2;KjFQbN;DnrCv6J!dHSdsGu zzYjtb>u;sm=P?DczA$3FvnP3p(fZ~FPF4-}D%(1dASVqC;~Ct%Rw70Qx1JD~c}9$E z?d$_~4E$Q;QqqRQqoL7 zffeoJ!0ufTtbmZnVutIIgFyI@b&jB7ray{Vc;o5fME{^xuPQMnjNMYB;lj;)Bm5Xv zI*&H1eBwWBmYUPs8|x+)kkMCNr>wv@#~ovCDqLxdb_#LR1}fx1%@JF&L>-JMa zH?ETWm9_rcxHcgSd>D5fvTCK1H@_^tEdH8`x&4jWGe?RTJJLSyS<>gcX>VB((4`0h ze;INxo$L}mY-Wn7a`K2NcWU8}zj=%5jGfA=>73tsT*{}D)bFNM%(h4>|C>fjKz#lZ z3OjU3A5gDq`sBF^+PA#Se(2xja#FG>cRZ}HU#-AhHMtx~l z8FfhM{KoCvgu)II(RsSp*%34JELIuDW}u_} z`dxb>nP2P6O;QR)PLApdU$Z#L+a2zLJX=0;$sVJ`zmNNmbVm(Qn`-$d`EMEQ<$~eb z_O&UeKa0P|919iFbCZ29b4BoS6G)*jSKv~!v5ZmJl}?~biaA}B_G=S{cx`{OPe9w= z`-!_+#931FD|lP;?pfdX=r+!H*ccd5&fucz(N)~NiB9Oad(!q8USs`)y`RDGXr(X) zvQ6>sU@Ww%U1kl+a>!Qr^7?Sn-s_R0J0zR@&dZanPM)|JbZ7HzXlh^9kXrDw$HmL+ z{!`m0f@uRggTkKlazWX&G=IO2rS4>DTenp8skp|FDP}XhUhRNm?qz;?mqTh(@daFO zi{McYh_d$OS?B@IXSVZ1z&~;uERQPG6xSTN%5PG<{vujigt^&QwK>YNofvtnUw1i* z_;^3Nqb;1i!m=XRUx5UR3Ja9!5Y3kjMENRs!5u=|s?a*1{2(~4*qCG7*W%q0SwBrx z;6t7=I?dE9)M{5cuu1LbWpaM1sPyTPH`L#6-pD0>q&Q?< zwGbysU=l-+Gh4GSk!xh*q}g0+9Y9v2uM-G=QFJX_U}VkI@LpjJe7aXVBY8mpp^7+K ze;l!VVEJGb0Z#7_-st;S?nRVjA)&CK|NqT`|1!Y;9tHmYVg6g|;D43>PYmxr#Sjn$ r{w4on{5PWaU*-Q(^7(&qW{Uroij-tw;r`chxfk#*{rVz literal 0 HcmV?d00001 diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..0186bf3 --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,140 @@ +from fhirflat.resources.encounter import Encounter +import pandas as pd +import pytest +from pydantic import ValidationError + + +def test_validate_fhirflat_single_resource_errors(): + df = pd.DataFrame( + { + "subjid": [2], + "flat_dict": [ + { + "subject": "Patient/2", + "id": 11, + "actualPeriod.start": "NOT A DATE", + "actualPeriod.end": "2021-04-10", + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001.0, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007.0, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002.0, 722863008.0], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001.0, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 + } + ], + }, + index=[0], + ) + + flat_df = Encounter.ingest_to_flat(df) + with pytest.raises(ValidationError, match="invalid datetime format"): + _, _ = Encounter.validate_fhirflat(flat_df) + + +def test_validate_fhirflat_multi_resource_errors(): + df = pd.DataFrame( + { + "subjid": [1, 2], + "flat_dict": [ + { + "subject": "Patient/1", + "id": 11, + "actualPeriod.start": "2021-04-01", + "actualPeriod.end": "2021-04-10", + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001.0, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007.0, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002.0, 722863008.0], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001.0, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 + }, + { + "subject": "Patient/2", + "id": 12, + "actualPeriod.start": ["2021-04-01", None], + "actualPeriod.end": [None, "2021-04-10"], + "extension.timingPhase.system": "https://snomed.info/sct", + "extension.timingPhase.code": 278307001.0, + "extension.timingPhase.text": "On admission (qualifier value)", + "class.system": "https://snomed.info/sct", + "class.code": 32485007.0, + "class.text": "Hospital admission (procedure)", + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.condition.concept.code": [38362002.0, 722863008.0], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + "admission.dischargeDisposition.system": "https://snomed.info/sct", + "admission.dischargeDisposition.code": 371827001.0, + "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 + }, + ], + }, + ) + flat_df = Encounter.ingest_to_flat(df) + + assert "diagnosis_dense" in flat_df.columns + + valid, errors = Encounter.validate_fhirflat(flat_df) + + assert len(valid) == 1 + assert len(errors) == 1 + assert ( + repr(errors["validation_error"][1].errors()) + == "[{'loc': ('actualPeriod', 'end'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}, {'loc': ('actualPeriod', 'start'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}]" # noqa: E501 + ) diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 3b378ef..e2c3a33 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -22,7 +22,6 @@ from pathlib import Path import numpy as np import pytest -from pydantic.v1 import ValidationError if sys.version_info < (3, 11): # tomllib was introduced in 3.11 import tomli # pragma: no cover @@ -1097,142 +1096,6 @@ def test_validate_cli(capsys, monkeypatch): assert "validation errors" not in captured.out -def test_validate_fhirflat_single_resource_errors(): - df = pd.DataFrame( - { - "subjid": [2], - "flat_dict": [ - { - "subject": "Patient/2", - "id": 11, - "actualPeriod.start": "NOT A DATE", - "actualPeriod.end": "2021-04-10", - "extension.timingPhase.system": "https://snomed.info/sct", - "extension.timingPhase.code": 278307001.0, - "extension.timingPhase.text": "On admission (qualifier value)", - "class.system": "https://snomed.info/sct", - "class.code": 32485007.0, - "class.text": "Hospital admission (procedure)", - "diagnosis.condition.concept.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.condition.concept.code": [38362002.0, 722863008.0], - "diagnosis.condition.concept.text": [ - "Dengue (disorder)", - "Dengue with warning signs (disorder)", - ], - "diagnosis.use.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.use.code": [89100005.0, 89100005.0], - "diagnosis.use.text": [ - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - ], - "admission.dischargeDisposition.system": "https://snomed.info/sct", - "admission.dischargeDisposition.code": 371827001.0, - "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 - } - ], - }, - index=[0], - ) - - flat_df = Encounter.ingest_to_flat(df) - with pytest.raises(ValidationError, match="invalid datetime format"): - _, _ = Encounter.validate_fhirflat(flat_df) - - -def test_validate_fhirflat_multi_resource_errors(): - df = pd.DataFrame( - { - "subjid": [1, 2], - "flat_dict": [ - { - "subject": "Patient/1", - "id": 11, - "actualPeriod.start": "2021-04-01", - "actualPeriod.end": "2021-04-10", - "extension.timingPhase.system": "https://snomed.info/sct", - "extension.timingPhase.code": 278307001.0, - "extension.timingPhase.text": "On admission (qualifier value)", - "class.system": "https://snomed.info/sct", - "class.code": 32485007.0, - "class.text": "Hospital admission (procedure)", - "diagnosis.condition.concept.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.condition.concept.code": [38362002.0, 722863008.0], - "diagnosis.condition.concept.text": [ - "Dengue (disorder)", - "Dengue with warning signs (disorder)", - ], - "diagnosis.use.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.use.code": [89100005.0, 89100005.0], - "diagnosis.use.text": [ - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - ], - "admission.dischargeDisposition.system": "https://snomed.info/sct", - "admission.dischargeDisposition.code": 371827001.0, - "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 - }, - { - "subject": "Patient/2", - "id": 12, - "actualPeriod.start": ["2021-04-01", None], - "actualPeriod.end": [None, "2021-04-10"], - "extension.timingPhase.system": "https://snomed.info/sct", - "extension.timingPhase.code": 278307001.0, - "extension.timingPhase.text": "On admission (qualifier value)", - "class.system": "https://snomed.info/sct", - "class.code": 32485007.0, - "class.text": "Hospital admission (procedure)", - "diagnosis.condition.concept.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.condition.concept.code": [38362002.0, 722863008.0], - "diagnosis.condition.concept.text": [ - "Dengue (disorder)", - "Dengue with warning signs (disorder)", - ], - "diagnosis.use.system": [ - "https://snomed.info/sct", - "https://snomed.info/sct", - ], - "diagnosis.use.code": [89100005.0, 89100005.0], - "diagnosis.use.text": [ - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 - ], - "admission.dischargeDisposition.system": "https://snomed.info/sct", - "admission.dischargeDisposition.code": 371827001.0, - "admission.dischargeDisposition.text": "Patient discharged alive (finding)", # noqa: E501 - }, - ], - }, - ) - flat_df = Encounter.ingest_to_flat(df) - - assert "diagnosis_dense" in flat_df.columns - - valid, errors = Encounter.validate_fhirflat(flat_df) - - assert len(valid) == 1 - assert len(errors) == 1 - assert ( - repr(errors["validation_error"][1].errors()) - == "[{'loc': ('actualPeriod', 'end'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}, {'loc': ('actualPeriod', 'start'), 'msg': 'invalid type; expected datetime, string, bytes, int or float', 'type': 'type_error'}]" # noqa: E501 - ) - - def test_convert_data_to_flat_local_mapping_errors(): output_folder = "tests/ingestion_output_errors" mappings = { @@ -1295,6 +1158,19 @@ def test_validate_valid(capsys): assert "Validation complete" in captured.out +def test_validate_compress(capsys): + folder = "tests/data/valid_flat_compressed.zip" + + validate(folder, compress_format="zip") + + captured = capsys.readouterr() + assert "patient.parquet is valid" in captured.out + assert "Validation complete" in captured.out + + assert Path("tests/data/valid_flat_compressed_validated.zip").exists() + Path.unlink("tests/data/valid_flat_compressed_validated.zip") + + def test_validate_invalid(capsys): folder = "tests/data/invalid_flat_bundle" From b951c788eb128053f681462e4fa399b2f4daab61 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 15:30:24 +0100 Subject: [PATCH 10/19] fix validationerror import --- tests/ingestion_output_errors/encounter.parquet | Bin 18137 -> 0 bytes tests/ingestion_output_errors/fhirflat.toml | 5 ----- tests/ingestion_output_errors/sha256sums.txt | 1 - tests/test_base.py | 2 +- 4 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 tests/ingestion_output_errors/encounter.parquet delete mode 100644 tests/ingestion_output_errors/fhirflat.toml delete mode 100644 tests/ingestion_output_errors/sha256sums.txt diff --git a/tests/ingestion_output_errors/encounter.parquet b/tests/ingestion_output_errors/encounter.parquet deleted file mode 100644 index 0b186b5a43a4564864016453085dbeb91909370f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 18137 zcmeHPZD=F+nIFlH*ZY#qc9T)$)JYU|cj`UAXhyPZX`9lFY)h6;*4kQIZ>8OiG-GS* z(MZuqeoF%(giw0r2qi52aFnGSC4^AQ5srm}qa5WvgkH&~qaQ*(lyDzHIgaB#gb5$dv} z)_vU6;}qqG%XFai^)yYy zqx7YbAT@Ml#C=X{6t<*dop!p0UH|QH{ilPeGQ+)9jZ6Siigr?PxdMMt6gkUC!k!o^9oYLw- zP50O@ovy!kGQT~?{G7PoMyaiRRr>p;b)o5Y>mI-CCbn_x$mMx3CTKMKC)t@vznvv$ zg3;Q~oDVKG&AHNMj-=`y{M_OClY{xg1?J=P7=XR>+FJCsgWgDBXhQZZz4ZccS*q8o z+D)HNQ!8ajoRZazicc%nf9MZ!f!Q$2vbF2f=xiv!&ax~A4tkFEwx!YwhbnOQ7BTe# zjweDS^(b;3a12~#K7PVtBJ_;3SC>>xuBcOWxh$)jDAo(D5lYSN@2mSqjGdd6OcXp}c( zspi=cl!kQUIS{#}dV~!xR)$V4pFgCx`#J2}Te~fde|qXr8tcGrH;w=6bp6iB{QODg z-c_XWp~2ZQ_)`a6G8v5jtY1I|X1NfAA?MZNN}# z%j9d%G?~0mR0K`y5rD6g0DRZY{NvMo1MoP2Y}xyXgMI@Fu%6FR(Sy;|s8{s~(NnEe zijvrTnV#Y4>_AsCi zgaR{umi6N*c-HR^%|O)*Sr5<>0HVg|i3$3Wxg;JOJ8wz4hfM|0|Ea_PTnDZ=aFI6f z>wmdOIS1aLn7dOJ8~2|OWnoi=DAVqUFh{pf$mfd{@Fd1x+XQnCUKgcmeF%aQ{M^m|>wP*VYdN0T>I?b$Kvn=@n7aHUZ%jN=sDNz*qCMc5UD+IzRCcYm#G#NKTFQ{eR2 zJO>t66=9zcdVj$6ME$=h*%>?ZvW%TQ%r@`PJ^DGd{(0|hwe^2)i zb@yrj?N2Ut2B%|~VtyYwJvN85eXTB)37Kz_O~u_=<~K7)W-pw*4X~xqO^=!sUYfvc z4uv^b^sqr2&W6o4w{_RW%Ds0lP{_*XFIud;(>G%W8XYOzKS$qlxZZa#|NkxK-@k<{ zSU_4J+1mM*gT4#XfgUW2{}Y=Bt7>>{-5V=x z8=B@%La+NowDfe({kfBLPg~OeY|!_PKXWL$athh#7vhz#81*6 z-DTt6BJ)A)1j~kte#T6ui(}^?%$5a3sL2APL;I3wXzZNP_N$$H4%a&l=F$cUH|H2-d$n-?G|!y62NR(JH=Ys z89ZmHdf0~T2icxZvJcds4!Ztfkojbt`PCXa=`0y&=j@R$W`dM^2=;eUS)fNL`qFi$ zbv%8YN{mA6(u4304t(`Sn?q4k4*PZ;G8&CRr@POo8zkva;zbxMIOry;cRcGRP>hqf zL$#ncj-jRHz~%V8jycv*cGAZHahSSeqU&6}*&zYvSx2IS6g)NeVTXx3QQX|0fmh+l zSOFf^b4ZT>vVUuhFmpB1`p%dUn^A(x&cg=y(!d-#-`OGU5&kwa#e?@gPN=wsM@C1! zc=Ck5$6`+POC{#CzRh7y@q3js|4jvRhQ9+~&e*9sbC%AgzcS~$%o#l%J<|u3%wNX2 z-&gs-?Q9*3F@HS;(tj+&e5HGxwb2iT+n7{1Zx9e@rHOvgVKAn1sqDCg%LsdZ) z1?9QIvbi%y2TueN|`N8vSo7>7#QptLqT6dq*j)d_%Q z*yxqT?!okT9plJqeP+7bYU_22(OrYI{qjopRF{SJG94g}M}B`C@C&eR)7wxO5+{qH z_iwMd(e#(?z=glu`mRCH_qrWr8=hSZ*^h`D<)flAWlsZL^JhZcXxiIOZ*Z2Dnql{tMhW}vi3S36=;=A_V&6-G@G>XjOq_!YTO6KeZ7?^pMQ zTCK8+{Om|I{4p!O<$_b}6zhfQ*lS>DY4lIQKi*d<_a#1+x|O-9e?{TGKk38IE}!Cg zeii=3&)@R=G#}aIqddRK&ux--kw{znF1}o_+86jG{TZ%A3u%w@{4Nju;BTTNHwSI- z3?B{TBeU_c(#Xfs(^-FVr?8l=7u03>mK-Ta`%5(;6Woq(RW{@5QmIfDmAJgS87~J* zh0Hq2G{nrFmYCz^Vjxv2s;lsh-!-0higWRq`24CCpIbT*W9ePgvz+BbY|r__nwa4l zS-(DZK5LEH$YnSMWeaLrE6m||`D#A1x1IIdj2nvr_UJ)2kO~%Kh>4fQOt1?2xAV8- zI5uG9cBOQfm9i0{H(n0QLOH!9&XM+dKAS3m4mNii@Gfd{J`>z3_*2zFxk&V-)2Ucu zZByRpz+T)3`Dh*ZGjSmg=~mrkA(Phf(XeGxTupMha#hI%R%OD|N#lg7r!L#a+xG6W}@19 zEWIye!rO^*x?ynE?cZ{AOI%Es3ekvCEGs+txu8-gf4RQSXG<)^@Bv_LBvgpw4CELW zseo^lL^-*Whdk-Qw<0a3l!BTB-%K~em{N!QSJz{uQqjNOXvrzNSkZ)RlFbCt4IzVL zq(!4vaD{9n*qWaO|I*8O;Pe=A(-#-imT4oGZTtakv2>&8S9S_=kQFi<PSNDy@04)0)$TOq$JQQYA6AVAaTt z+q;`kPn&h1oYcS{2Z#7$s{lBQ`=lP}wf9lsR8!!Co;Ae8`3U)pZbO_O7=GS{Slj|1 z@8|vL1-RcW1d>&VXN&=??Z!OTIj_LHlu6Y=UeBF!s#^3XA#aw{{5sqRB0HcTVhHl8 zC6DatP$8xqh|xM%C@W;V?1RijJ!<)pG4}Tv%y}Hxr004b@aFuG=P<{o;Jz3O9|&2f z9i$Fe`SoSz{kD*as)aIQNp3(*fjsV6yL!s@@sE+WsgtZ#EjX63X2SZU88d})QYkL3 zLT$A;wW(RDYAz!-FlP1K4?;ZazCe9Be|lG(dz7`S4IkGrHymb9r~D&~spW=4Z1G0eIjQM5kFW`l2O6Ir6SA0Hr zh|FV0p0D6V-;cmn#@M7R;8rEp9cLtOpeeYr<@4yocEOQ6*+?3jNCFKU4hw+5Yj1u8SNYvp8 zY0GbCJjRKAD+dN%miSMPKLVYE4$+4-iH~4jY~`~NmR}*`X}JOQ8^!Ox%lSyO_uaP6KTlHE!N9^6P^!MNw32LcFHIyGmd&vCKFaF`2 z4vMUQuT%`cdIHyV@EpMYdutHN$C;mqe8>l>11ikUA(URi-uN1q$@pYYwnKQS_bQ+4F`$ z+jnno#{B~5^sPXS9bW&M^HBoj{cCV(@n35_us}t+&Zyw*WgDCM*gW(f&%uTG)9?pr z)BQ=LZsWD(xY3qQDjus_s)_~1W*hX1L}p8?kR6Y>B6 diff --git a/tests/ingestion_output_errors/fhirflat.toml b/tests/ingestion_output_errors/fhirflat.toml deleted file mode 100644 index 2103d2d..0000000 --- a/tests/ingestion_output_errors/fhirflat.toml +++ /dev/null @@ -1,5 +0,0 @@ -[metadata] -N = NA -generator = "fhirflat/0.1.0" -checksum = "437a5f50450c7671084451fc83d2c049239bcc8686cdd663823589ec689d872a" -checksum_file = "sha256sums.txt" diff --git a/tests/ingestion_output_errors/sha256sums.txt b/tests/ingestion_output_errors/sha256sums.txt deleted file mode 100644 index 2ff9e43..0000000 --- a/tests/ingestion_output_errors/sha256sums.txt +++ /dev/null @@ -1 +0,0 @@ -8980474796eea6a656ae2d6cf5c6d31f290d97e2bbe39eaa0469d2ebed8e3df1 encounter.parquet diff --git a/tests/test_base.py b/tests/test_base.py index 0186bf3..fea4edf 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,7 +1,7 @@ from fhirflat.resources.encounter import Encounter import pandas as pd import pytest -from pydantic import ValidationError +from pydantic.v1 import ValidationError def test_validate_fhirflat_single_resource_errors(): From 721fd72cc83d0f9b6e7b219b7a3fe68e11b8f30d Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 15:40:56 +0100 Subject: [PATCH 11/19] Add test for flat_fields --- tests/test_base.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_base.py b/tests/test_base.py index fea4edf..c0bc7c2 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,9 +1,29 @@ from fhirflat.resources.encounter import Encounter +import fhirflat import pandas as pd import pytest from pydantic.v1 import ValidationError +def test_flat_fields(): + p = fhirflat.Patient() + ff = p.flat_fields() + + assert ff == [ + "id", + "extension", + "gender", + "birthDate", + "deceasedBoolean", + "deceasedDateTime", + "maritalStatus", + "multipleBirthBoolean", + "multipleBirthInteger", + "generalPractitioner", + "managingOrganization", + ] + + def test_validate_fhirflat_single_resource_errors(): df = pd.DataFrame( { From 226d1c67a57b5593ababa823feed4c6fca82eabb Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 16:01:07 +0100 Subject: [PATCH 12/19] add test to_flat_no_filename --- tests/test_base.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_base.py b/tests/test_base.py index c0bc7c2..88c523c 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -3,6 +3,7 @@ import pandas as pd import pytest from pydantic.v1 import ValidationError +from pathlib import Path def test_flat_fields(): @@ -24,6 +25,23 @@ def test_flat_fields(): ] +def test_to_flat_no_filename(): + PATIENT_DICT_INPUT = { + "id": "f001", + "active": True, + "name": [{"text": "Micky Mouse"}], + "gender": "male", + "deceasedBoolean": False, + "address": [{"country": "Switzerland"}], + "birthDate": "1996-05-30", + } + + p = fhirflat.Patient(**PATIENT_DICT_INPUT) + flat_p = p.to_flat() + + assert isinstance(flat_p, pd.Series) + + def test_validate_fhirflat_single_resource_errors(): df = pd.DataFrame( { From 26f05775d0605ef6ec6d1919a1322e588488bb1a Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 16:16:10 +0100 Subject: [PATCH 13/19] Add test for pyarrow error --- .../combined_dummy_data_parquet_error.csv | 3 ++ tests/test_ingest.py | 33 +++++++++---------- 2 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 tests/dummy_data/combined_dummy_data_parquet_error.csv diff --git a/tests/dummy_data/combined_dummy_data_parquet_error.csv b/tests/dummy_data/combined_dummy_data_parquet_error.csv new file mode 100644 index 0000000..13ceff9 --- /dev/null +++ b/tests/dummy_data/combined_dummy_data_parquet_error.csv @@ -0,0 +1,3 @@ +subjid,visitid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,non_encounter_field,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_not_dengue,outco_secondiag_oth,outco_date,outco_outcome,daily_date,vital_highesttem_c,vital_hr,vital_rr,vital_systolicbp,vital_diastolicbp,vital_spo2,vital_fio2spo2_02110,vital_fio2spo2_pcnt,vital_capillaryr,vital_avpu,vital_gcs,vital_urineflow +1,10,2020-05-01,0,,,,,,,cough,,,7,2020-01-01,36.2,120.0,30.0,70.0,120.0,5.0,,75.0,1.0,1.0,1.0,150.0 +2,11,2021-04-01,1,,,fish,1.0,,2.0,,,2021-04-10,1,2021-02-02,37.0,100.0,40.0,80.0,130.0,6.0,10.0,85.0,0.0,2.0,1.0,200.0 diff --git a/tests/test_ingest.py b/tests/test_ingest.py index e2c3a33..da019b8 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -958,23 +958,22 @@ def test_convert_data_to_flat_no_validation_warning(): shutil.rmtree("tests/ingestion_output") -# TODO: write a working version of this (needs data like the private ones) -# def test_convert_data_to_flat_no_validation_invalid_file_warning(): -# mappings = { -# Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", -# } -# resource_types = {"Encounter": "one-to-one"} - -# with pytest.warns(UserWarning, match="This is likely due to a validation error"): -# convert_data_to_flat( -# "tests/dummy_data/combined_dummy_data_error.csv", -# folder_name="tests/ingestion_output_errors", -# date_format="%Y-%m-%d", -# timezone="Brazil/East", -# mapping_files_types=(mappings, resource_types), -# validate=False, -# ) -# shutil.rmtree("tests/ingestion_output_errors") +def test_convert_data_to_flat_no_validation_invalid_file_warning(): + mappings = { + Encounter: "tests/dummy_data/encounter_dummy_mapping.csv", + } + resource_types = {"Encounter": "one-to-one"} + + with pytest.warns(UserWarning, match="This is likely due to a validation error"): + convert_data_to_flat( + "tests/dummy_data/combined_dummy_data_parquet_error.csv", + folder_name="tests/ingestion_output_errors", + date_format="%Y-%m-%d", + timezone="Brazil/East", + mapping_files_types=(mappings, resource_types), + validate=False, + ) + shutil.rmtree("tests/ingestion_output_errors") def test_generate_metadata(): From f9fbcdc3d5cc6454d18e0216931a5a1d2a1593c2 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 2 Aug 2024 16:47:32 +0100 Subject: [PATCH 14/19] Test util.codense_codes --- fhirflat/util.py | 6 ++-- tests/test_utils.py | 81 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/fhirflat/util.py b/fhirflat/util.py index f62c23e..f6d0be8 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -184,9 +184,7 @@ def condense_codes(row, code_col): raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) ) codes = row[code_col + ".system"] + "|" + formatted_code - elif np.isnan(raw_codes) or raw_codes is None: - codes = None - else: + elif isinstance(raw_codes, list): formatted_codes = [ c if (isinstance(c, str) or c is None) else str(int(c)) for c in raw_codes ] @@ -194,6 +192,8 @@ def condense_codes(row, code_col): s + "|" + c for s, c in zip(row[code_col + ".system"], formatted_codes, strict=True) ] + else: + codes = None row[code_col + ".code"] = codes return row diff --git a/tests/test_utils.py b/tests/test_utils.py index 163420a..872d4f2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,12 +6,16 @@ get_fhirtype, get_local_extension_type, get_local_resource, + condense_codes, ) from fhir.resources.quantity import Quantity from fhir.resources.codeableconcept import CodeableConcept from fhir.resources.medicationstatement import MedicationStatementAdherence from fhirflat.resources.extensions import dateTimeExtension, Duration +from fhirflat import MedicationStatement +import pandas as pd +import numpy as np def test_group_keys(): @@ -78,3 +82,80 @@ def test_get_local_extension_type_raises(): def test_get_local_resource(): result = get_local_resource("Patient") assert result == fhirflat.Patient + + +def test_get_local_resource_case_insensitive(): + result = get_local_resource("medicationstatement", case_insensitive=True) + assert result == MedicationStatement + + +@pytest.mark.parametrize( + "input, expected", + [ + ( + ( + pd.Series( + { + "test.system": "http://loinc.org", + "test.code": "1234", + "test.display": "Test", + } + ), + "test", + ), + pd.Series( + { + "test.system": "http://loinc.org", + "test.code": "http://loinc.org|1234", + "test.display": "Test", + } + ), + ), + ( + ( + pd.Series( + { + "test.system": "http://loinc.org", + "test.code": np.nan, + "test.display": "Test", + } + ), + "test", + ), + pd.Series( + { + "test.system": "http://loinc.org", + "test.code": None, + "test.display": "Test", + } + ), + ), + ( + ( + pd.Series( + { + "test.system": ["http://loinc.org", "http://snomed.info/sct"], + "test.code": ["1234", 5678], + "test.display": "Test", + } + ), + "test", + ), + pd.Series( + { + "test.system": ["http://loinc.org", "http://snomed.info/sct"], + "test.code": [ + "http://loinc.org|1234", + "http://snomed.info/sct|5678", + ], + "test.display": "Test", + } + ), + ), + ], +) +def test_condense_codes(input, expected): + row, col = input + result = condense_codes(row, col) + + pd.testing.assert_series_equal(result, expected) From 5eaba4eaea9670839158721d19abe454f0ed9000 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 5 Aug 2024 12:23:50 +0100 Subject: [PATCH 15/19] Add types to util functions --- fhirflat/flat2fhir.py | 11 ++++++++--- fhirflat/util.py | 17 +++++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index b201bf1..20ec377 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -8,7 +8,12 @@ from fhir.resources.quantity import Quantity from pydantic.v1.error_wrappers import ValidationError -from .util import find_data_class, get_fhirtype, get_local_extension_type, group_keys +from .util import ( + find_data_class, + get_fhirtype, + get_local_extension_type, + group_keys, +) def create_codeable_concept( @@ -16,7 +21,7 @@ def create_codeable_concept( ) -> dict[str, list[str]]: """Re-creates a codeableConcept structure from the FHIRflat representation.""" - # for reading in from ingestion pipeline + # for creating backbone elements if name + ".code" in old_dict and name + ".system" in old_dict: raw_codes: str | float | list[str | None] = old_dict.get(name + ".code") if raw_codes is not None and not isinstance(raw_codes, list): @@ -24,7 +29,7 @@ def create_codeable_concept( raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) ) codes = [old_dict[name + ".system"] + "|" + formatted_code] - elif raw_codes is None: + elif not raw_codes: codes = raw_codes else: formatted_codes = [ diff --git a/fhirflat/util.py b/fhirflat/util.py index f6d0be8..9e21e56 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -1,16 +1,23 @@ # Utility functions for FHIRflat +from __future__ import annotations + import datetime import importlib import re from collections.abc import KeysView from itertools import groupby +from typing import TYPE_CHECKING import fhir.resources import numpy as np +import pandas as pd import fhirflat from fhirflat.resources import extensions +if TYPE_CHECKING: + from .resources.base import FHIRFlatBase + def group_keys(data_keys: list[str] | KeysView) -> dict[str, list[str]]: """ @@ -79,7 +86,9 @@ def get_local_resource(t: str, case_insensitive: bool = False): return getattr(fhirflat, a) -def find_data_class(data_class, k): +def find_data_class( + data_class: FHIRFlatBase | list[FHIRFlatBase], k: str +) -> FHIRFlatBase: """ Finds the type class for item k within the data class. @@ -116,7 +125,7 @@ def find_data_class(data_class, k): return get_fhirtype(base_class) -def code_or_codeable_concept(col_name, resource): +def code_or_codeable_concept(col_name: str, resource: FHIRFlatBase) -> bool: search_terms = col_name.split(".") fhir_type = find_data_class(resource, search_terms[0]) @@ -138,7 +147,7 @@ def code_or_codeable_concept(col_name, resource): return code_or_codeable_concept(".".join(search_terms[1:]), fhir_type) -def format_flat(flat_df, resource): +def format_flat(flat_df: pd.DataFrame, resource: FHIRFlatBase) -> pd.DataFrame: """ Performs formatting on dates/lists in FHIRflat resources. """ @@ -177,7 +186,7 @@ def format_flat(flat_df, resource): return flat_df -def condense_codes(row, code_col): +def condense_codes(row: pd.Series, code_col: str) -> pd.Series: raw_codes = row[(code_col + ".code")] if isinstance(raw_codes, (str, int, float)) and raw_codes == raw_codes: formatted_code = ( From 998823a26f2baf1045428395c2e52335f42bef34 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Mon, 5 Aug 2024 12:46:53 +0100 Subject: [PATCH 16/19] Add documentation --- docs/howto/conversion-data.md | 42 +++++++++++++++++++++++++++++++++++ tests/test_ingest.py | 3 ++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/docs/howto/conversion-data.md b/docs/howto/conversion-data.md index b3ed308..c477c02 100644 --- a/docs/howto/conversion-data.md +++ b/docs/howto/conversion-data.md @@ -28,3 +28,45 @@ The equivalent function to the CLI described above can be used as ``` fhirflat.convert_data_to_flat("data_file_path", "sheet_id", "%Y-%m-%d", "Brazil/East") ``` + +## Conversion without validation + +If you wish to convert your data into FHIRflat, but not perform validation to check the +converted data conforms to the FHIR spec, you can add the `--no-validate` flag: + +```bash +fhirflat transform data-file google-sheet-id date-format timezone-name --no-validate +``` + +The equivalent library function is +```python +fhirflat.convert_data_to_flat(, , , , validate=False) +``` + +We strongly recommend you don't do this unless necessary for time constraints; some +errors in conversion can cause the parquet file to fail to save (e.g. if columns contain +mixed types due to errors which would be caught during validation). + +Data which is already in a FHIRflat format can be validated against the schema using + +```bash +fhirflat validate +``` + +where `folder_name` is the path to the folder containing your flat files. The files **must** +be named according to the corresponding FHIR resource, e.g. the folder containing flat +Encounter data must be named `encounter.parquet`. + +The folder can be provided in a compressed format, e.g. zipped; you can specifiy this +using +```bash +fhirflat validate -c "zip" +``` + +The output folder of validated data will be compressed using the same format. + +The equivalent library function is + +```python +fhirflat.validate(, compress_format="zip") +``` diff --git a/tests/test_ingest.py b/tests/test_ingest.py index da019b8..ae272a4 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1059,7 +1059,8 @@ def test_convert_data_to_flat_local_mapping_zipped(): os.remove("tests/ingestion_output.zip") -# This don't run intermittantly - because of the "#NAME" error i get with the googele sheets +# This doesn't run intermittantly - because of the "#NAME" error i get with the google +# sheets # Turns out this is an issue with custom functions in Google Sheets, not a Python thing. def test_main(capsys, monkeypatch): # Simulate command line arguments From 5f8395388c12ee9a73c984685518a5733b878187 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 6 Aug 2024 12:28:58 +0100 Subject: [PATCH 17/19] Edits after review --- fhirflat/ingest.py | 2 +- fhirflat/resources/base.py | 58 ++++++++++++++++---------------------- fhirflat/util.py | 7 ++--- 3 files changed, 29 insertions(+), 38 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index df7ec14..5d05d22 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -598,7 +598,7 @@ def validate(folder_name: str, compress_format: str | None = None): resource = file.stem resource_type = get_local_resource(resource, case_insensitive=True) - valid_flat, errors = resource_type.validate_fhirflat(df, return_files=True) + valid_flat, errors = resource_type.validate_fhirflat(df, return_frames=True) if errors is not None: diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 8d1f47d..7477317 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -88,8 +88,8 @@ def create_fhir_resource( @classmethod def validate_fhirflat( - cls, df: pd.DataFrame, return_files: bool = False - ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: + cls, df: pd.DataFrame, return_frames: bool = False + ) -> tuple[FHIRFlatBase | pd.DataFrame, None | pd.DataFrame]: """ Takes a FHIRflat dataframe and validates the data against the FHIR schema. Returns a dataframe of valid resources and a dataframe of the @@ -98,18 +98,23 @@ def validate_fhirflat( Parameters ---------- - df: pd.DataFrame + df Pandas dataframe containing the FHIRflat data - return_files: bool - If True, returns the valid FHIR resources & errors as a parquet file, - even if only one row is present in the dataframe. + return_frames + If True, returns the valid FHIR resources & errors as dataframes, + even if only one row is present in the source. Returns ------- - valid_resources: pd.DataFrame + valid_resources A dataframe containing the valid FHIR resources - errors: pd.DataFrame + errors A dataframe containing the flat_dict and validation errors. + + Raises + ------ + ValidationError + If a single FHIR resource is present and is invalid. """ flat_df = df.copy() @@ -118,7 +123,7 @@ def validate_fhirflat( lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 ).apply(lambda x: cls.create_fhir_resource(x)) - if len(flat_df) == 1 and return_files is False: + if len(flat_df) == 1 and return_frames is False: resource = flat_df["fhir"].iloc[0] if isinstance(resource, ValidationError): raise resource @@ -149,7 +154,7 @@ def from_flat(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: Parameters ---------- - file: str + file Path to the parquet FHIRflat file containing clinical data Returns @@ -201,13 +206,9 @@ def ingest_backbone_elements(cls, mapped_data: pd.Series) -> pd.Series: Parameters ---------- - mapped_data: pd.Series + mapped_data Pandas series of FHIRflat-like dictionaries ready to be converted to FHIR format. - - Returns - ------- - pd.Series """ def fhir_format(row: pd.Series) -> pd.Series: @@ -251,14 +252,11 @@ def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: Parameters ---------- - data: pd.DataFrame - Pandas dataframe containing the data - filename: str - Name of the parquet file to be generated. + data + Pandas dataframe containing the raw data Returns ------- - pd.DataFrame or None A dataframe containing the FHIRflat data. """ @@ -279,13 +277,11 @@ def ingest_to_flat(cls, data: pd.DataFrame) -> pd.DataFrame | None: potential_dense_cols = [ x for x in cls.backbone_elements.keys() if x in flat_df.columns ] - list_lengths = [ - len(flat_df[x].dropna().iloc[0]) for x in potential_dense_cols - ] + long_list_cols = [ x - for x, y in zip(potential_dense_cols, list_lengths, strict=True) - if y > 1 + for x in potential_dense_cols + if any(flat_df[x].apply(lambda y: isinstance(y, list) and len(y) > 1)) ] if long_list_cols: @@ -307,12 +303,8 @@ def fhir_bulk_import(cls, file: str) -> FHIRFlatBase | list[FHIRFlatBase]: Parameters ---------- - file: str + file Path to the .ndjson file containing FHIR data - - Returns - ------- - FHIRFlatBase or list[FHIRFlatBase] """ resources = [] @@ -333,9 +325,9 @@ def fhir_file_to_flat(cls, source_file: str, output_name: str | None = None): Parameters ---------- - source_file: str + source_file Path to the FHIR resource file. - output_name: str (optional) + output_name Name of the parquet file to be generated, optional, defaults to {resource}.parquet """ @@ -368,7 +360,7 @@ def to_flat(self, filename: str | None = None) -> None | pd.Series: Parameters ---------- - filename: str + filename Name of the parquet file to be generated. """ diff --git a/fhirflat/util.py b/fhirflat/util.py index 9e21e56..b475236 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -125,13 +125,12 @@ def find_data_class( return get_fhirtype(base_class) -def code_or_codeable_concept(col_name: str, resource: FHIRFlatBase) -> bool: +def code_or_codeable_concept( + col_name: str, resource: FHIRFlatBase | list[FHIRFlatBase] +) -> bool: search_terms = col_name.split(".") fhir_type = find_data_class(resource, search_terms[0]) - if isinstance(fhir_type, list): - return code_or_codeable_concept(".".join(search_terms[1:]), fhir_type) - if len(search_terms) == 2: # e.g. "code.code", "age.code" schema = fhir_type.schema()["properties"] codeable_concepts = [ From c8be988aad40b22117a3266050d7392cc4838dc5 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 6 Aug 2024 12:35:30 +0100 Subject: [PATCH 18/19] edit doc file --- docs/howto/conversion-data.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/howto/conversion-data.md b/docs/howto/conversion-data.md index c477c02..69adc2b 100644 --- a/docs/howto/conversion-data.md +++ b/docs/howto/conversion-data.md @@ -60,7 +60,7 @@ Encounter data must be named `encounter.parquet`. The folder can be provided in a compressed format, e.g. zipped; you can specifiy this using ```bash -fhirflat validate -c "zip" +fhirflat validate -c "zip" ``` The output folder of validated data will be compressed using the same format. @@ -68,5 +68,5 @@ The output folder of validated data will be compressed using the same format. The equivalent library function is ```python -fhirflat.validate(, compress_format="zip") +fhirflat.validate(, compress_format="zip") ``` From 6ba56175985d22390af0d49a7a40fc2a478f1491 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 6 Aug 2024 14:31:23 +0100 Subject: [PATCH 19/19] Change compression description --- fhirflat/ingest.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index 5d05d22..7ff35ff 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -585,9 +585,16 @@ def validate(folder_name: str, compress_format: str | None = None): Takes a folder containing (optionally compressed) FHIRflat files and validates them against the FHIR. File names **must** correspond to the FHIR resource types they represent. E.g. a file containing Patient resources must be named "patient.parquet". + + Parameters + ---------- + folder_name + The path to the folder containing the FHIRflat files, or compressed file. + compress_format + The format to compress the validated files into. """ - if compress_format: + if Path(folder_name).is_file(): directory = Path(folder_name).with_suffix("") shutil.unpack_archive(folder_name, extract_dir=directory) else: @@ -694,7 +701,7 @@ def validate_cli(): parser.add_argument( "-c", "--compress_format", - help="Format the folder is compressed in", + help="Format to compress the output into", choices=["zip", "tar", "gztar", "bztar", "xztar"], )