From ee719019e5b43fb69b034d621b7bfb4efb27d7cf Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Tue, 11 Jun 2024 10:35:12 +0100 Subject: [PATCH] Force list elements to be padded with 'None' to maintain same lengths for backbone elements --- fhirflat/flat2fhir.py | 14 +- fhirflat/ingest.py | 34 +++- fhirflat/resources/base.py | 8 +- .../data_multirow_encounter_freetext.csv | 3 + ...a_multirow_encounter_freetext_maindiag.csv | 3 + ...ta_multirow_encounter_freetext_secdiag.csv | 3 + tests/dummy_data/encounter_dummy_mapping.csv | 4 +- tests/test_flat2fhir_units.py | 55 ++++++ tests/test_ingest.py | 162 +++++++++++++++++- 9 files changed, 275 insertions(+), 11 deletions(-) create mode 100644 tests/dummy_data/data_multirow_encounter_freetext.csv create mode 100644 tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv create mode 100644 tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py index 08a7271..1eb85e0 100644 --- a/fhirflat/flat2fhir.py +++ b/fhirflat/flat2fhir.py @@ -17,24 +17,27 @@ def create_codeable_concept( - old_dict: dict[str, list[str] | str | float], name: str + old_dict: dict[str, list[str] | str | float | None], name: str ) -> dict[str, list[str]]: """Re-creates a codeableConcept structure from the FHIRflat representation.""" # for reading in from ingestion pipeline if name + ".code" in old_dict and name + ".system" in old_dict: - raw_codes: str | float | list[str] = old_dict.get(name + ".code") - if not isinstance(raw_codes, list): + raw_codes: str | float | list[str | None] = old_dict.get(name + ".code") + if raw_codes is not None and not isinstance(raw_codes, list): formatted_code = ( raw_codes if isinstance(raw_codes, str) else str(int(raw_codes)) ) codes = [old_dict[name + ".system"] + "|" + formatted_code] + elif raw_codes is None: + codes = raw_codes else: formatted_codes = [ - c if isinstance(c, str) else str(int(c)) for c in raw_codes + c if (isinstance(c, str) or c is None) else str(int(c)) + for c in raw_codes ] codes = [ - [s + "|" + c] + s + "|" + c for s, c in zip( old_dict[name + ".system"], formatted_codes, strict=True ) @@ -217,7 +220,6 @@ def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) -> group_classes = {} for k in groups.keys(): - group_classes[k] = find_data_class(data_class, k) expanded = {} diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py index f868d72..35bcec3 100644 --- a/fhirflat/ingest.py +++ b/fhirflat/ingest.py @@ -15,7 +15,7 @@ import numpy as np import pandas as pd -from fhirflat.util import get_local_resource +from fhirflat.util import get_local_resource, group_keys # 1:1 (single row, single resource) mapping: Patient, Encounter # 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ... @@ -165,6 +165,38 @@ def create_dict_wide( result[key].append(snippet[key]) else: result[key] = [result[key], snippet[key]] + + # Keys that were not previously in the result still need to be added + remaining_keys = set(snippet.keys()) ^ duplicate_keys + if remaining_keys: + key_length = max(len(result[k]) for k in duplicate_keys) + empty_list = [None] * (key_length - 1) + for key in remaining_keys: + result[key] = [*empty_list, snippet[key]] + + # Check for existing keys that might need to be extended + snippet_keys = list(snippet.keys()) + result_groups = group_keys(result.keys()) + for k_list in result_groups.values(): + if set(snippet_keys).issubset(set(k_list)): + relevant_result = { + k: ( + [result[k]] + if not isinstance(result[k], list) + else result[k] + ) + for k in k_list + } + all_vals_same_length = ( + len(set(map(len, relevant_result.values()))) == 1 + ) + if not all_vals_same_length: + target_length = max(map(len, relevant_result.values())) + for k, v in relevant_result.items(): + if len(v) < target_length: + result[k] = relevant_result[k] + [None] * ( + target_length - len(v) + ) return result diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 8382df8..58ff8ec 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -115,7 +115,12 @@ def fhir_format(row: pd.Series) -> pd.Series: continue else: backbone_list = [] - for i in range(len(next(iter(condensed_dict.values())))): + # assert all lists are the same length - if not different parts + # of the backbone element may be incorrectly grouped together + assert len(set(map(len, condensed_dict.values()))) == 1 + + # iterate through and split the element into individual levels + for i in range(max(len(x) for x in condensed_dict.values())): first_item = { k.lstrip(b_e + "."): v[i] for k, v in condensed_dict.items() @@ -271,6 +276,7 @@ def to_flat(self, filename: str | None = None) -> None | pd.Series: if filename: flat_df.to_parquet(filename) + return None else: assert flat_df.shape[0] == 1 return flat_df.loc[0] diff --git a/tests/dummy_data/data_multirow_encounter_freetext.csv b/tests/dummy_data/data_multirow_encounter_freetext.csv new file mode 100644 index 0000000..7ce611e --- /dev/null +++ b/tests/dummy_data/data_multirow_encounter_freetext.csv @@ -0,0 +1,3 @@ +subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome +1,2021-10-22,1,2021-10-20,18:40,,,,,,, +1,,,,,1,,3,1,RTI,2021-10-23,1.0 diff --git a/tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv b/tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv new file mode 100644 index 0000000..0bf4bb9 --- /dev/null +++ b/tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv @@ -0,0 +1,3 @@ +subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome +1,2021-10-22,1,2021-10-20,18:40,,,,,,, +1,,,,,0,sepsis,2,,,2021-10-23,1.0 diff --git a/tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv b/tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv new file mode 100644 index 0000000..83f4b59 --- /dev/null +++ b/tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv @@ -0,0 +1,3 @@ +subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome +1,2021-10-22,1,2021-10-20,18:40,,,,,,, +1,,,,,99,,,1,secondary Dengue,2021-10-23,1.0 diff --git a/tests/dummy_data/encounter_dummy_mapping.csv b/tests/dummy_data/encounter_dummy_mapping.csv index ae99d6b..ec85ea0 100644 --- a/tests/dummy_data/encounter_dummy_mapping.csv +++ b/tests/dummy_data/encounter_dummy_mapping.csv @@ -9,7 +9,7 @@ dates_admdate,,,,,,,,,,+,,,,,,,,,, dates_admtime,,,,,,,,,,+,,,,,,,,,, outco_denguediag,"1, Yes",,,,,,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, ,"0, No",,,,,,,,,,,,,,,,,,, -,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,, +,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, outco_date,,,,,,,,,,,,,,,,,,,, outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding) ,"2, Still hospitalised",,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure) @@ -22,4 +22,4 @@ outco_denguediag_main,,,,,,,,,,,,,,,https://snomed.info/sct,89100005,Fina outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, ,"2, Dengue with warning signs",,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, ,"3, Severe dengue",,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,, -outco_secondiag_oth,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, \ No newline at end of file +outco_secondiag_oth,,,,,,,,,,,,,,,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,, diff --git a/tests/test_flat2fhir_units.py b/tests/test_flat2fhir_units.py index ccd2fb0..7a26f78 100644 --- a/tests/test_flat2fhir_units.py +++ b/tests/test_flat2fhir_units.py @@ -78,6 +78,61 @@ def test_create_codeable_concept(data_groups, expected): assert result == expected +@pytest.mark.parametrize( + "data_groups, expected", + [ + ( + ( + { + "code.code": ["1234"], + "code.system": ["http://loinc.org"], + "code.text": ["Test"], + }, + "code", + ), + { + "coding": [ + { + "system": "http://loinc.org", + "code": "1234", + "display": "Test", + } + ] + }, + ), + ( + ( + { + "code.code": ["1234", "5678"], + "code.system": ["http://loinc.org", "http://snomed.info/sct"], + "code.text": ["Test", "Snomed Test"], + }, + "code", + ), + { + "coding": [ + { + "system": "http://loinc.org", + "code": "1234", + "display": "Test", + }, + { + "system": "http://snomed.info/sct", + "code": "5678", + "display": "Snomed Test", + }, + ] + }, + ), + ], +) +def test_create_codeable_concept_ingestion(data_groups, expected): + data, groups = data_groups + result = f2f.create_codeable_concept(data, groups) + + assert result == expected + + @pytest.mark.parametrize( "data_class, expected", [ diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 1447246..34007c2 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -264,7 +264,6 @@ def test_create_dict_one_to_one_single_row(): def test_create_dict_missing_data_warning(): - with pytest.warns(UserWarning, match="No data found for the Observation resource"): create_dictionary( "tests/dummy_data/encounter_dummy_data_single.csv", @@ -339,6 +338,104 @@ def test_create_dict_one_to_one_multirow_condense(): ) +@pytest.mark.parametrize( + ("file, expected"), + [ + ( + "tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv", + { + "diagnosis.condition.concept.code": [None, 722863008.0], + "diagnosis.condition.concept.system": [None, "https://snomed.info/sct"], + "diagnosis.condition.concept.text": [ + "sepsis", + "Dengue with warning signs (disorder)", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + ], + }, + ), + ( + "tests/dummy_data/data_multirow_encounter_freetext.csv", + { + "diagnosis.condition.concept.code": [38362002.0, 20927009.0, None], + "diagnosis.condition.concept.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + None, + ], + "diagnosis.condition.concept.text": [ + "Dengue (disorder)", + "Dengue hemorrhagic fever (disorder)", + "RTI", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 89100005.0, 85097005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Secondary diagnosis (contextual qualifier) (qualifier value)", # noqa: E501 + ], + }, + ), + ( + "tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv", + { + "diagnosis.condition.concept.system": ["https://snomed.info/sct", None], + "diagnosis.condition.concept.code": [261665006.0, None], + "diagnosis.condition.concept.text": [ + "Unknown (qualifier value)", + "secondary Dengue", + ], + "diagnosis.use.system": [ + "https://snomed.info/sct", + "https://snomed.info/sct", + ], + "diagnosis.use.code": [89100005.0, 85097005.0], + "diagnosis.use.text": [ + "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "Secondary diagnosis (contextual qualifier) (qualifier value)", + ], + }, + ), + ], +) +def test_create_dict_one_to_one_dense_freetext(file, expected): + df = create_dictionary( + file, + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + date_format="%Y-%m-%d", + timezone="Brazil/East", + ) + + assert df is not None + dict_out = df["flat_dict"][0] + + diagnosis_cols = [ + "diagnosis.condition.concept.system", + "diagnosis.condition.concept.code", + "diagnosis.condition.concept.text", + "diagnosis.use.system", + "diagnosis.use.code", + "diagnosis.use.text", + ] + + # only interested in the diagnosis (backbone element) columns + assert {k: dict_out[k] for k in diagnosis_cols} == expected + + ENCOUNTER_SINGLE_ROW_FLAT = { "resourceType": "Encounter", "id": "11", @@ -429,6 +526,69 @@ def test_load_data_one_to_one_single_row(): os.remove("encounter_ingestion_single.parquet") +def test_load_data_one_to_one_dense_single_row(): + df = create_dictionary( + "tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv", + "tests/dummy_data/encounter_dummy_mapping.csv", + "Encounter", + one_to_one=True, + date_format="%Y-%m-%d", + timezone="Brazil/East", + ) + + assert df is not None + Encounter.ingest_to_flat(df, "encounter_ingestion_dense") + + df_parquet = pd.read_parquet("encounter_ingestion_dense.parquet") + + expected_diagnosis = [ + { + "condition": [{"concept": {"coding": None, "text": "sepsis"}}], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + { + "condition": [ + { + "concept": { + "coding": [ + { + "code": "722863008", + "display": "Dengue with warning signs (disorder)", + "system": "https://snomed.info/sct", + } + ], + "text": None, + } + } + ], + "use": [ + { + "coding": [ + { + "code": "89100005", + "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)", # noqa: E501 + "system": "https://snomed.info/sct", + } + ] + } + ], + }, + ] + + assert all(df_parquet["diagnosis_dense"][0] == expected_diagnosis) + os.remove("encounter_ingestion_dense.parquet") + + ENCOUNTER_SINGLE_ROW_MULTI = { "resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"], "class.code": [