Skip to content

Commit

Permalink
Deal with list fields (#20)
Browse files Browse the repository at this point in the history
Exports list fields containing more than one item into a single "_dense" column.
  • Loading branch information
pipliggins authored May 2, 2024
1 parent fd237d7 commit 97d63b4
Show file tree
Hide file tree
Showing 11 changed files with 319 additions and 142 deletions.
36 changes: 20 additions & 16 deletions fhirflat/fhir2flat.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def flatten_column(
raise ValueError("Input data must be a pandas DataFrame or Series.")


def explode_and_flatten(df, list_cols):
def explode_and_flatten(df, list_cols: list[str]):
"""
Recursively explodes and flattens a dataframe.
Columns containing a 'coding' or 'extension' list are left intact for later
Expand All @@ -46,21 +46,23 @@ def explode_and_flatten(df, list_cols):
df: flattened fhir resource
lists: list of columns containing lists in the dataframe
"""
try:
df = df.explode([n for n in list_cols])
except ValueError:
raise ValueError("Can't explode a dataframe with lists of different lengths")

if len(df) == 1:
# only one concept in each list
for lc in list_cols:
df = flatten_column(df, lc)
else:
raise NotImplementedError("Can't handle lists with more than one concept yet")
# for lc in list_cols:
# df = flatten_column(df, lc)

# check if any cols remain containing lists that aren't 'coding' chunks or extension
list_lengths = [len(df[x][0]) for x in list_cols]
long_list_cols = [x for x, y in zip(list_cols, list_lengths) if y > 1]

if long_list_cols:
df.rename(columns={x: x + "_dense" for x in long_list_cols}, inplace=True)
list_cols = [x for x in list_cols if x not in long_list_cols]

df = df.explode(list_cols)

assert len(df) == 1, "List with more than one concept has slipped through."

for lc in list_cols:
df = flatten_column(df, lc)

# check if any cols remain containing lists that aren't 'coding' chunks, extension
# or dense columns (lists of nested data we don't want to explode)
list_columns = df.map(lambda x: isinstance(x, list))
new_list_cols = [
col
Expand All @@ -69,6 +71,7 @@ def explode_and_flatten(df, list_cols):
list_columns[col].any()
and not col.endswith("coding")
and not col.endswith("extension")
and not col.endswith("_dense")
)
]
if new_list_cols:
Expand Down Expand Up @@ -270,7 +273,8 @@ def fhir2flat(resource: FHIRFlatBase, lists: list | None = None) -> pd.DataFrame
df = pd.json_normalize(resource.dict())

if lists:
list_cols = [n for n in lists if n in df.columns]
# extensions are dealt with separately while still in a list
list_cols = [n for n in lists if n in df.columns if n != "extension"]
if list_cols:
df = explode_and_flatten(df, list_cols)

Expand Down
8 changes: 8 additions & 0 deletions fhirflat/flat2fhir.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,14 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
else:
expanded[k] = set_datatypes(k, v_dict, group_classes[k])

dense_cols = {
k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense")
}
if dense_cols:
for old_k, new_k in dense_cols.items():
data[new_k] = data[old_k]
del data[old_k]

for k in keys_to_replace:
data.pop(k)
data.update(expanded)
Expand Down
2 changes: 1 addition & 1 deletion fhirflat/resources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def attr_lists(cls) -> list[str]:
return [
p.alias
for p in cls.element_properties()
if "typing.List" in str(p.outer_type_)
if "typing.List" in str(p.outer_type_) or "list" in str(p.outer_type_)
]

@classmethod
Expand Down
Binary file modified tests/data/encounter_flat.parquet
Binary file not shown.
Binary file modified tests/data/observation_flat.parquet
Binary file not shown.
124 changes: 92 additions & 32 deletions tests/test_encounter_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,36 +94,36 @@
]
}
],
# "diagnosis": [
# {
# "condition": [{"reference": {"reference": "Condition/stroke"}}],
# "use": [
# {
# "coding": [
# {
# "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
# "code": "AD",
# "display": "Admission diagnosis",
# }
# ]
# }
# ],
# },
# {
# "condition": [{"reference": {"reference": "Condition/f201"}}],
# "use": [
# {
# "coding": [
# {
# "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
# "code": "DD",
# "display": "Discharge diagnosis",
# }
# ]
# }
# ],
# },
# ],
"diagnosis": [
{
"condition": [{"reference": {"reference": "Condition/stroke"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "AD",
"display": "Admission diagnosis",
}
]
}
],
},
{
"condition": [{"reference": {"reference": "Condition/f201"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "DD",
"display": "Discharge diagnosis",
}
]
}
],
},
],
"account": [{"reference": "Account/example"}],
"dietPreference": [
{
Expand Down Expand Up @@ -194,8 +194,38 @@
"serviceProvider": "Organization/2",
"actualPeriod.start": datetime.date(2013, 3, 11),
"actualPeriod.end": datetime.date(2013, 3, 20),
# diagnosis.condition: ["Condition/stroke", "Condition/f201"],
# diagnosis.use.code: ["http://terminology.hl7.org/CodeSystem/diagnosis-role|AD", "http://terminology.hl7.org/CodeSystem/diagnosis-role|DD"], # noqa: E501
"diagnosis_dense": [
[
{
"condition": [{"reference": {"reference": "Condition/stroke"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "AD",
"display": "Admission diagnosis",
}
]
}
],
},
{
"condition": [{"reference": {"reference": "Condition/f201"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "DD",
"display": "Discharge diagnosis",
}
]
}
],
},
]
],
"admission.origin": "Location/2",
"admission.admitSource.code": "http://snomed.info/sct|309902002",
"admission.admitSource.text": "Clinical Oncology Department",
Expand Down Expand Up @@ -276,6 +306,36 @@
]
}
],
"diagnosis": [
{
"condition": [{"reference": {"reference": "Condition/stroke"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "AD",
"display": "Admission diagnosis",
}
]
}
],
},
{
"condition": [{"reference": {"reference": "Condition/f201"}}],
"use": [
{
"coding": [
{
"system": "http://terminology.hl7.org/CodeSystem/diagnosis-role", # noqa: E501
"code": "DD",
"display": "Discharge diagnosis",
}
]
}
],
},
],
"admission": {
"origin": {"reference": "Location/2"},
"admitSource": {
Expand Down
2 changes: 0 additions & 2 deletions tests/test_medicationadministration_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,6 @@ def test_medicationadministration_to_flat():


def test_medicationadministration_from_flat():
# 'dose' in this case is a simpleQuantity but nowhere does it state this in the json
# it just uses .code and is therefore assumed to be a codeableConcept.
meds = MedicationAdministration(**MEDS_DICT_OUT)

flat_meds = MedicationAdministration.from_flat(
Expand Down
2 changes: 0 additions & 2 deletions tests/test_medicationstatement_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,6 @@ def test_medicationstatement_to_flat():


def test_medicationstatement_from_flat():
# 'dose' in this case is a simpleQuantity but nowhere does it state this in the json
# it just uses .code and is therefore assumed to be a codeableConcept.
meds = MedicationStatement(**MEDS_DICT_OUT)

flat_meds = MedicationStatement.from_flat("tests/data/medicationstat_flat.parquet")
Expand Down
Loading

0 comments on commit 97d63b4

Please sign in to comment.