Bugfix when LLM doesn't return all schema fields during field mapping

globaldothealth · Dec 20, 2024 · 4edb6b3 · 4edb6b3
1 parent 707d405
commit 4edb6b3
Showing 2 changed files with 24 additions and 20 deletions.
diff --git a/src/adtl/autoparser/mapping.py b/src/adtl/autoparser/mapping.py
@@ -213,7 +213,7 @@ def match_values_to_schema(self) -> pd.DataFrame:
 
         values_tuples = []
         for f in self.target_fields:
-            s = self.common_values_mapped[f]
+            s = self.common_values_mapped.get(f)
             t = self.target_values[f]
             if s and t:
                 values_tuples.append((f, s, t))
@@ -260,6 +260,10 @@ def create_mapping(self, save=True, file_name="mapping_file") -> pd.DataFrame:
         mapped_vals = self.match_values_to_schema()
 
         mapping_dict.drop(columns=["source_type"], inplace=True)
+
+        # reindex to add in any schema fields that weren't returned by the LLM
+        mapping_dict = mapping_dict.reindex(self.target_fields)
+
         mapping_dict["target_values"] = mapping_dict.index.map(self.target_values)
         mapping_dict["value_mapping"] = mapping_dict.index.map(mapped_vals)
 
@@ -269,23 +273,22 @@ def create_mapping(self, save=True, file_name="mapping_file") -> pd.DataFrame:
                 f"The following schema fields have not been mapped: {list(unmapped)}",
                 UserWarning,
             )
+
+        # turn lists & dicts into strings for consistency with saved CSV
+        mapping_dict["target_values"] = mapping_dict["target_values"].apply(
+            lambda x: (", ".join(str(item) for item in x) if isinstance(x, list) else x)
+        )
+        mapping_dict["value_mapping"] = mapping_dict["value_mapping"].apply(
+            lambda x: (
+                ", ".join(f"{k}={v}" for k, v in x.items())
+                if isinstance(x, dict)
+                else x
+            )
+        )
+
         if save is False:
             return mapping_dict
         else:
-            # turn lists & dicts into strings to save to CSV
-            mapping_dict["target_values"] = mapping_dict["target_values"].apply(
-                lambda x: (
-                    ", ".join(str(item) for item in x) if isinstance(x, list) else x
-                )
-            )
-            mapping_dict["value_mapping"] = mapping_dict["value_mapping"].apply(
-                lambda x: (
-                    ", ".join(f"{k}={v}" for k, v in x.items())
-                    if isinstance(x, dict)
-                    else x
-                )
-            )
-
             # Write to CSV
             if not file_name.endswith(".csv"):
                 file_name += ".csv"
@@ -324,6 +327,10 @@ def create_mapping(
         Which LLM to use, currently only 'openai' is supported.
     config
         Path to a JSON file containing the configuration for autoparser.
+    save
+        Whether to save the mapping to a CSV file.
+    file_name
+        Name of the file to save the mapping to.
 
     Returns
     -------

diff --git a/tests/test_autoparser/test_mapper.py b/tests/test_autoparser/test_mapper.py
@@ -344,11 +344,8 @@ def test_class_create_mapping_no_save():
             "source_description": "Pet Animal",
             "source_field": "AnimalDeCompagnie",
             "common_values": "Oui, Non, non",
-            "target_values": ["True", "False", "None"],
-            "value_mapping": {
-                "oui": "True",
-                "non": "False",
-            },
+            "target_values": "True, False, None",
+            "value_mapping": "oui=True, non=False",
         },
         name="pet",
     )