Deal with list fields (#20)

Exports list fields containing more than one item into a single "_dense" column.
globaldothealth · May 2, 2024 · 97d63b4 · 97d63b4
1 parent fd237d7
commit 97d63b4
Show file tree

Hide file tree

Showing 11 changed files with 319 additions and 142 deletions.
diff --git a/fhirflat/fhir2flat.py b/fhirflat/fhir2flat.py
@@ -37,7 +37,7 @@ def flatten_column(
         raise ValueError("Input data must be a pandas DataFrame or Series.")
 
 
-def explode_and_flatten(df, list_cols):
+def explode_and_flatten(df, list_cols: list[str]):
     """
     Recursively explodes and flattens a dataframe.
     Columns containing a 'coding' or 'extension' list are left intact for later
@@ -46,21 +46,23 @@ def explode_and_flatten(df, list_cols):
     df: flattened fhir resource
     lists: list of columns containing lists in the dataframe
     """
-    try:
-        df = df.explode([n for n in list_cols])
-    except ValueError:
-        raise ValueError("Can't explode a dataframe with lists of different lengths")
-
-    if len(df) == 1:
-        # only one concept in each list
-        for lc in list_cols:
-            df = flatten_column(df, lc)
-    else:
-        raise NotImplementedError("Can't handle lists with more than one concept yet")
-    # for lc in list_cols:
-    #     df = flatten_column(df, lc)
 
-    # check if any cols remain containing lists that aren't 'coding' chunks or extension
+    list_lengths = [len(df[x][0]) for x in list_cols]
+    long_list_cols = [x for x, y in zip(list_cols, list_lengths) if y > 1]
+
+    if long_list_cols:
+        df.rename(columns={x: x + "_dense" for x in long_list_cols}, inplace=True)
+        list_cols = [x for x in list_cols if x not in long_list_cols]
+
+    df = df.explode(list_cols)
+
+    assert len(df) == 1, "List with more than one concept has slipped through."
+
+    for lc in list_cols:
+        df = flatten_column(df, lc)
+
+    # check if any cols remain containing lists that aren't 'coding' chunks, extension
+    # or dense columns (lists of nested data we don't want to explode)
     list_columns = df.map(lambda x: isinstance(x, list))
     new_list_cols = [
         col
@@ -69,6 +71,7 @@ def explode_and_flatten(df, list_cols):
             list_columns[col].any()
             and not col.endswith("coding")
             and not col.endswith("extension")
+            and not col.endswith("_dense")
         )
     ]
     if new_list_cols:
@@ -270,7 +273,8 @@ def fhir2flat(resource: FHIRFlatBase, lists: list | None = None) -> pd.DataFrame
     df = pd.json_normalize(resource.dict())
 
     if lists:
-        list_cols = [n for n in lists if n in df.columns]
+        # extensions are dealt with separately while still in a list
+        list_cols = [n for n in lists if n in df.columns if n != "extension"]
         if list_cols:
             df = explode_and_flatten(df, list_cols)
 

diff --git a/fhirflat/flat2fhir.py b/fhirflat/flat2fhir.py
@@ -202,6 +202,14 @@ def expand_concepts(data: dict, data_class: type[_DomainResource]) -> dict:
         else:
             expanded[k] = set_datatypes(k, v_dict, group_classes[k])
 
+    dense_cols = {
+        k: k.removesuffix("_dense") for k in data.keys() if k.endswith("_dense")
+    }
+    if dense_cols:
+        for old_k, new_k in dense_cols.items():
+            data[new_k] = data[old_k]
+            del data[old_k]
+
     for k in keys_to_replace:
         data.pop(k)
     data.update(expanded)

diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py
@@ -29,7 +29,7 @@ def attr_lists(cls) -> list[str]:
         return [
             p.alias
             for p in cls.element_properties()
-            if "typing.List" in str(p.outer_type_)
+            if "typing.List" in str(p.outer_type_) or "list" in str(p.outer_type_)
         ]
 
     @classmethod

diff --git a/tests/data/encounter_flat.parquet b/tests/data/encounter_flat.parquet
diff --git a/tests/data/observation_flat.parquet b/tests/data/observation_flat.parquet
diff --git a/tests/test_encounter_resource.py b/tests/test_encounter_resource.py
@@ -94,36 +94,36 @@
             ]
         }
     ],
-    # "diagnosis": [
-    #     {
-    #         "condition": [{"reference": {"reference": "Condition/stroke"}}],
-    #         "use": [
-    #             {
-    #                 "coding": [
-    #                     {
-    #                         "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
-    #                         "code": "AD",
-    #                         "display": "Admission diagnosis",
-    #                     }
-    #                 ]
-    #             }
-    #         ],
-    #     },
-    #     {
-    #         "condition": [{"reference": {"reference": "Condition/f201"}}],
-    #         "use": [
-    #             {
-    #                 "coding": [
-    #                     {
-    #                         "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
-    #                         "code": "DD",
-    #                         "display": "Discharge diagnosis",
-    #                     }
-    #                 ]
-    #             }
-    #         ],
-    #     },
-    # ],
+    "diagnosis": [
+        {
+            "condition": [{"reference": {"reference": "Condition/stroke"}}],
+            "use": [
+                {
+                    "coding": [
+                        {
+                            "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                            "code": "AD",
+                            "display": "Admission diagnosis",
+                        }
+                    ]
+                }
+            ],
+        },
+        {
+            "condition": [{"reference": {"reference": "Condition/f201"}}],
+            "use": [
+                {
+                    "coding": [
+                        {
+                            "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                            "code": "DD",
+                            "display": "Discharge diagnosis",
+                        }
+                    ]
+                }
+            ],
+        },
+    ],
     "account": [{"reference": "Account/example"}],
     "dietPreference": [
         {
@@ -194,8 +194,38 @@
     "serviceProvider": "Organization/2",
     "actualPeriod.start": datetime.date(2013, 3, 11),
     "actualPeriod.end": datetime.date(2013, 3, 20),
-    # diagnosis.condition: ["Condition/stroke", "Condition/f201"],
-    # diagnosis.use.code: ["http://terminology.hl7.org/CodeSystem/diagnosis-role|AD", "http://terminology.hl7.org/CodeSystem/diagnosis-role|DD"],  # noqa: E501
+    "diagnosis_dense": [
+        [
+            {
+                "condition": [{"reference": {"reference": "Condition/stroke"}}],
+                "use": [
+                    {
+                        "coding": [
+                            {
+                                "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                                "code": "AD",
+                                "display": "Admission diagnosis",
+                            }
+                        ]
+                    }
+                ],
+            },
+            {
+                "condition": [{"reference": {"reference": "Condition/f201"}}],
+                "use": [
+                    {
+                        "coding": [
+                            {
+                                "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                                "code": "DD",
+                                "display": "Discharge diagnosis",
+                            }
+                        ]
+                    }
+                ],
+            },
+        ]
+    ],
     "admission.origin": "Location/2",
     "admission.admitSource.code": "http://snomed.info/sct|309902002",
     "admission.admitSource.text": "Clinical Oncology Department",
@@ -276,6 +306,36 @@
             ]
         }
     ],
+    "diagnosis": [
+        {
+            "condition": [{"reference": {"reference": "Condition/stroke"}}],
+            "use": [
+                {
+                    "coding": [
+                        {
+                            "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                            "code": "AD",
+                            "display": "Admission diagnosis",
+                        }
+                    ]
+                }
+            ],
+        },
+        {
+            "condition": [{"reference": {"reference": "Condition/f201"}}],
+            "use": [
+                {
+                    "coding": [
+                        {
+                            "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",  # noqa: E501
+                            "code": "DD",
+                            "display": "Discharge diagnosis",
+                        }
+                    ]
+                }
+            ],
+        },
+    ],
     "admission": {
         "origin": {"reference": "Location/2"},
         "admitSource": {

diff --git a/tests/test_medicationadministration_resource.py b/tests/test_medicationadministration_resource.py
@@ -160,8 +160,6 @@ def test_medicationadministration_to_flat():
 
 
 def test_medicationadministration_from_flat():
-    # 'dose' in this case is a simleQuanitity but nowhere does it state this in the json
-    # it just uses .code and is therefore assumed to be a codeableConcept.
     meds = MedicationAdministration(**MEDS_DICT_OUT)
 
     flat_meds = MedicationAdministration.from_flat(

diff --git a/tests/test_medicationstatement_resource.py b/tests/test_medicationstatement_resource.py
@@ -188,8 +188,6 @@ def test_medicationstatement_to_flat():
 
 
 def test_medicationstatement_from_flat():
-    # 'dose' in this case is a simleQuanitity but nowhere does it state this in the json
-    # it just uses .code and is therefore assumed to be a codeableConcept.
     meds = MedicationStatement(**MEDS_DICT_OUT)
 
     flat_meds = MedicationStatement.from_flat("tests/data/medicationstat_flat.parquet")