Skip to content

Commit

Permalink
Force list elements to be padded with 'None'
Browse files Browse the repository at this point in the history
to maintain same lengths for backbone elements
  • Loading branch information
pipliggins authored Jun 11, 2024
1 parent 989c83f commit ee71901
Show file tree
Hide file tree
Showing 9 changed files with 275 additions and 11 deletions.
14 changes: 8 additions & 6 deletions fhirflat/flat2fhir.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,27 @@


def create_codeable_concept(
old_dict: dict[str, list[str] | str | float], name: str
old_dict: dict[str, list[str] | str | float | None], name: str
) -> dict[str, list[str]]:
"""Re-creates a codeableConcept structure from the FHIRflat representation."""

# for reading in from ingestion pipeline
if name + ".code" in old_dict and name + ".system" in old_dict:
raw_codes: str | float | list[str] = old_dict.get(name + ".code")
if not isinstance(raw_codes, list):
raw_codes: str | float | list[str | None] = old_dict.get(name + ".code")
if raw_codes is not None and not isinstance(raw_codes, list):
formatted_code = (
raw_codes if isinstance(raw_codes, str) else str(int(raw_codes))
)
codes = [old_dict[name + ".system"] + "|" + formatted_code]
elif raw_codes is None:
codes = raw_codes
else:
formatted_codes = [
c if isinstance(c, str) else str(int(c)) for c in raw_codes
c if (isinstance(c, str) or c is None) else str(int(c))
for c in raw_codes
]
codes = [
[s + "|" + c]
s + "|" + c
for s, c in zip(
old_dict[name + ".system"], formatted_codes, strict=True
)
Expand Down Expand Up @@ -217,7 +220,6 @@ def expand_concepts(data: dict[str, str], data_class: type[_DomainResource]) ->
group_classes = {}

for k in groups.keys():

group_classes[k] = find_data_class(data_class, k)

expanded = {}
Expand Down
34 changes: 33 additions & 1 deletion fhirflat/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import numpy as np
import pandas as pd

from fhirflat.util import get_local_resource
from fhirflat.util import get_local_resource, group_keys

# 1:1 (single row, single resource) mapping: Patient, Encounter
# 1:M (single row, multiple resources) mapping: Observation, Condition, Procedure, ...
Expand Down Expand Up @@ -165,6 +165,38 @@ def create_dict_wide(
result[key].append(snippet[key])
else:
result[key] = [result[key], snippet[key]]

# Keys that were not previously in the result still need to be added
remaining_keys = set(snippet.keys()) ^ duplicate_keys
if remaining_keys:
key_length = max(len(result[k]) for k in duplicate_keys)
empty_list = [None] * (key_length - 1)
for key in remaining_keys:
result[key] = [*empty_list, snippet[key]]

# Check for existing keys that might need to be extended
snippet_keys = list(snippet.keys())
result_groups = group_keys(result.keys())
for k_list in result_groups.values():
if set(snippet_keys).issubset(set(k_list)):
relevant_result = {
k: (
[result[k]]
if not isinstance(result[k], list)
else result[k]
)
for k in k_list
}
all_vals_same_length = (
len(set(map(len, relevant_result.values()))) == 1
)
if not all_vals_same_length:
target_length = max(map(len, relevant_result.values()))
for k, v in relevant_result.items():
if len(v) < target_length:
result[k] = relevant_result[k] + [None] * (
target_length - len(v)
)
return result


Expand Down
8 changes: 7 additions & 1 deletion fhirflat/resources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,12 @@ def fhir_format(row: pd.Series) -> pd.Series:
continue
else:
backbone_list = []
for i in range(len(next(iter(condensed_dict.values())))):
# assert all lists are the same length - if not different parts
# of the backbone element may be incorrectly grouped together
assert len(set(map(len, condensed_dict.values()))) == 1

# iterate through and split the element into individual levels
for i in range(max(len(x) for x in condensed_dict.values())):
first_item = {
k.lstrip(b_e + "."): v[i]
for k, v in condensed_dict.items()
Expand Down Expand Up @@ -271,6 +276,7 @@ def to_flat(self, filename: str | None = None) -> None | pd.Series:

if filename:
flat_df.to_parquet(filename)
return None
else:
assert flat_df.shape[0] == 1
return flat_df.loc[0]
3 changes: 3 additions & 0 deletions tests/dummy_data/data_multirow_encounter_freetext.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome
1,2021-10-22,1,2021-10-20,18:40,,,,,,,
1,,,,,1,,3,1,RTI,2021-10-23,1.0
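3 changes: 3 additions & 0 deletions tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv
Original file line number Diff line number Diff line change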
@@ -0,0 +1,3 @@
subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome
1,2021-10-22,1,2021-10-20,18:40,,,,,,,
1,,,,,0,sepsis,2,,,2021-10-23,1.0
3 changes: 3 additions & 0 deletions tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
subjid,dates_enrolment,dates_adm,dates_admdate,dates_admtime,outco_denguediag,outco_denguediag_main,outco_denguediag_class,outco_secondiag,outco_secondiag_oth,outco_date,outco_outcome
1,2021-10-22,1,2021-10-20,18:40,,,,,,,
1,,,,,99,,,1,secondary Dengue,2021-10-23,1.0
4 changes: 2 additions & 2 deletions tests/dummy_data/encounter_dummy_mapping.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ dates_admdate,,,,,,,,,,<FIELD>+<dates_admtime>,,,,,,,,,,
dates_admtime,,,,,,,,,,<dates_admdate>+<FIELD>,,,,,,,,,,
outco_denguediag,"1, Yes",,,,,,,,,,,https://snomed.info/sct,38362002,Dengue (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,
,"0, No",,,,,,,,,,,,,,,,,,,
,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),,,,,,
,"99, Unknown",,,,,,,,,,,https://snomed.info/sct,261665006,Unknown (qualifier value),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,
outco_date,,,,,,,,,,,<FIELD>,,,,,,,,,
outco_outcome,"1, Discharged alive",,,,,,,,,,,,,,,,,https://snomed.info/sct,371827001,Patient discharged alive (finding)
,"2, Still hospitalised",,,,,,,,,,,,,,,,,https://snomed.info/sct,32485007,Hospital admission (procedure)
Expand All @@ -22,4 +22,4 @@ outco_denguediag_main,,,,,,,,,,,,,,<FIELD>,https://snomed.info/sct,89100005,Fina
outco_denguediag_class,"1, Uncomplicated dengue",,,,,,,,,,,https://snomed.info/sct,722862003,Dengue without warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,
,"2, Dengue with warning signs",,,,,,,,,,,https://snomed.info/sct,722863008,Dengue with warning signs (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,
,"3, Severe dengue",,,,,,,,,,,https://snomed.info/sct,20927009,Dengue hemorrhagic fever (disorder),https://snomed.info/sct,89100005,Final diagnosis (discharge) (contextual qualifier) (qualifier value),,,
outco_secondiag_oth,,,,,,,,,,,,,,<FIELD>,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,,
outco_secondiag_oth,,,,,,,,,,,,,,<FIELD>,https://snomed.info/sct,85097005,Secondary diagnosis (contextual qualifier) (qualifier value),,,
55 changes: 55 additions & 0 deletions tests/test_flat2fhir_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,61 @@ def test_create_codeable_concept(data_groups, expected):
assert result == expected


@pytest.mark.parametrize(
    "data_groups, expected",
    [
        # One coding, supplied as single-element lists.
        (
            (
                {
                    "code.code": ["1234"],
                    "code.system": ["http://loinc.org"],
                    "code.text": ["Test"],
                },
                "code",
            ),
            {
                "coding": [
                    {
                        "system": "http://loinc.org",
                        "code": "1234",
                        "display": "Test",
                    }
                ]
            },
        ),
        # Two codings drawn from different code systems.
        (
            (
                {
                    "code.code": ["1234", "5678"],
                    "code.system": ["http://loinc.org", "http://snomed.info/sct"],
                    "code.text": ["Test", "Snomed Test"],
                },
                "code",
            ),
            {
                "coding": [
                    {
                        "system": "http://loinc.org",
                        "code": "1234",
                        "display": "Test",
                    },
                    {
                        "system": "http://snomed.info/sct",
                        "code": "5678",
                        "display": "Snomed Test",
                    },
                ]
            },
        ),
    ],
)
def test_create_codeable_concept_ingestion(data_groups, expected):
    """Rebuild a codeableConcept from list-valued code/system/text columns.

    Each case supplies a ``(flat_dict, concept_name)`` pair where the
    ``.code``, ``.system`` and ``.text`` entries are parallel lists, and the
    ``coding`` structure that ``create_codeable_concept`` should return.
    """
    flat_data, concept_name = data_groups

    assert f2f.create_codeable_concept(flat_data, concept_name) == expected


@pytest.mark.parametrize(
"data_class, expected",
[
Expand Down
162 changes: 161 additions & 1 deletion tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,6 @@ def test_create_dict_one_to_one_single_row():


def test_create_dict_missing_data_warning():

with pytest.warns(UserWarning, match="No data found for the Observation resource"):
create_dictionary(
"tests/dummy_data/encounter_dummy_data_single.csv",
Expand Down Expand Up @@ -339,6 +338,104 @@ def test_create_dict_one_to_one_multirow_condense():
)


@pytest.mark.parametrize(
    "file, expected",
    [
        # Free-text main diagnosis: the first entry carries text only.
        (
            "tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv",
            {
                "diagnosis.condition.concept.code": [None, 722863008.0],
                "diagnosis.condition.concept.system": [None, "https://snomed.info/sct"],
                "diagnosis.condition.concept.text": [
                    "sepsis",
                    "Dengue with warning signs (disorder)",
                ],
                "diagnosis.use.system": [
                    "https://snomed.info/sct",
                    "https://snomed.info/sct",
                ],
                "diagnosis.use.code": [89100005.0, 89100005.0],
                "diagnosis.use.text": [
                    "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                    "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                ],
            },
        ),
        # Two coded diagnoses followed by a free-text secondary diagnosis.
        (
            "tests/dummy_data/data_multirow_encounter_freetext.csv",
            {
                "diagnosis.condition.concept.code": [38362002.0, 20927009.0, None],
                "diagnosis.condition.concept.system": [
                    "https://snomed.info/sct",
                    "https://snomed.info/sct",
                    None,
                ],
                "diagnosis.condition.concept.text": [
                    "Dengue (disorder)",
                    "Dengue hemorrhagic fever (disorder)",
                    "RTI",
                ],
                "diagnosis.use.system": [
                    "https://snomed.info/sct",
                    "https://snomed.info/sct",
                    "https://snomed.info/sct",
                ],
                "diagnosis.use.code": [89100005.0, 89100005.0, 85097005.0],
                "diagnosis.use.text": [
                    "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                    "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                    "Secondary diagnosis (contextual qualifier) (qualifier value)",  # noqa: E501
                ],
            },
        ),
        # Coded "unknown" diagnosis followed by a free-text secondary one.
        (
            "tests/dummy_data/data_multirow_encounter_freetext_secdiag.csv",
            {
                "diagnosis.condition.concept.system": ["https://snomed.info/sct", None],
                "diagnosis.condition.concept.code": [261665006.0, None],
                "diagnosis.condition.concept.text": [
                    "Unknown (qualifier value)",
                    "secondary Dengue",
                ],
                "diagnosis.use.system": [
                    "https://snomed.info/sct",
                    "https://snomed.info/sct",
                ],
                "diagnosis.use.code": [89100005.0, 85097005.0],
                "diagnosis.use.text": [
                    "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                    "Secondary diagnosis (contextual qualifier) (qualifier value)",
                ],
            },
        ),
    ],
)
def test_create_dict_one_to_one_dense_freetext(file, expected):
    """Diagnosis columns stay equal-length, with ``None`` filling the gaps.

    Each case ingests a two-row Encounter file through the dummy mapping and
    compares only the ``diagnosis.*`` entries of the first flat dict against
    the expected parallel lists.
    """
    ingested = create_dictionary(
        file,
        "tests/dummy_data/encounter_dummy_mapping.csv",
        "Encounter",
        one_to_one=True,
        date_format="%Y-%m-%d",
        timezone="Brazil/East",
    )
    assert ingested is not None

    first_flat_dict = ingested["flat_dict"][0]

    # Only the diagnosis (backbone element) columns are under test here.
    diagnosis_subset = {
        column: first_flat_dict[column]
        for column in (
            "diagnosis.condition.concept.system",
            "diagnosis.condition.concept.code",
            "diagnosis.condition.concept.text",
            "diagnosis.use.system",
            "diagnosis.use.code",
            "diagnosis.use.text",
        )
    }
    assert diagnosis_subset == expected


ENCOUNTER_SINGLE_ROW_FLAT = {
"resourceType": "Encounter",
"id": "11",
Expand Down Expand Up @@ -429,6 +526,69 @@ def test_load_data_one_to_one_single_row():
os.remove("encounter_ingestion_single.parquet")


def test_load_data_one_to_one_dense_single_row():
    """Round-trip a free-text main diagnosis through ingestion to parquet.

    Ingests the ``freetext_maindiag`` fixture, writes it with
    ``Encounter.ingest_to_flat`` and checks the ``diagnosis_dense`` entry of
    the first row read back from the parquet file.

    The parquet file is removed in a ``finally`` block so that a failing
    assertion does not leave the artefact behind in the working directory.
    """
    output_stem = "encounter_ingestion_dense"
    # The original test reads back "<stem>.parquet" after ingest_to_flat, so
    # the same derived path is used here for reading and for cleanup.
    parquet_path = f"{output_stem}.parquet"

    df = create_dictionary(
        "tests/dummy_data/data_multirow_encounter_freetext_maindiag.csv",
        "tests/dummy_data/encounter_dummy_mapping.csv",
        "Encounter",
        one_to_one=True,
        date_format="%Y-%m-%d",
        timezone="Brazil/East",
    )

    assert df is not None

    expected_diagnosis = [
        {
            "condition": [{"concept": {"coding": None, "text": "sepsis"}}],
            "use": [
                {
                    "coding": [
                        {
                            "code": "89100005",
                            "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                            "system": "https://snomed.info/sct",
                        }
                    ]
                }
            ],
        },
        {
            "condition": [
                {
                    "concept": {
                        "coding": [
                            {
                                "code": "722863008",
                                "display": "Dengue with warning signs (disorder)",
                                "system": "https://snomed.info/sct",
                            }
                        ],
                        "text": None,
                    }
                }
            ],
            "use": [
                {
                    "coding": [
                        {
                            "code": "89100005",
                            "display": "Final diagnosis (discharge) (contextual qualifier) (qualifier value)",  # noqa: E501
                            "system": "https://snomed.info/sct",
                        }
                    ]
                }
            ],
        },
    ]

    try:
        Encounter.ingest_to_flat(df, output_stem)

        df_parquet = pd.read_parquet(parquet_path)

        assert all(df_parquet["diagnosis_dense"][0] == expected_diagnosis)
    finally:
        # Guard on existence so a failure before the file was written is not
        # masked by a FileNotFoundError raised from the cleanup itself.
        if os.path.exists(parquet_path):
            os.remove(parquet_path)


ENCOUNTER_SINGLE_ROW_MULTI = {
"resourceType": ["Encounter", "Encounter", "Encounter", "Encounter"],
"class.code": [
Expand Down

0 comments on commit ee71901

Please sign in to comment.