From 4edb6b362e943e4a18070d33126a35c06dceddaf Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Fri, 20 Dec 2024 11:43:36 +0000 Subject: [PATCH] Bugfix when LLM doesn't return all schema fields during field mapping --- src/adtl/autoparser/mapping.py | 37 +++++++++++++++++----------- tests/test_autoparser/test_mapper.py | 7 ++---- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/adtl/autoparser/mapping.py b/src/adtl/autoparser/mapping.py index cb04c47..7269bc6 100644 --- a/src/adtl/autoparser/mapping.py +++ b/src/adtl/autoparser/mapping.py @@ -213,7 +213,7 @@ def match_values_to_schema(self) -> pd.DataFrame: values_tuples = [] for f in self.target_fields: - s = self.common_values_mapped[f] + s = self.common_values_mapped.get(f) t = self.target_values[f] if s and t: values_tuples.append((f, s, t)) @@ -260,6 +260,10 @@ def create_mapping(self, save=True, file_name="mapping_file") -> pd.DataFrame: mapped_vals = self.match_values_to_schema() mapping_dict.drop(columns=["source_type"], inplace=True) + + # reindex to add in any schema fields that weren't returned by the LLM + mapping_dict = mapping_dict.reindex(self.target_fields) + mapping_dict["target_values"] = mapping_dict.index.map(self.target_values) mapping_dict["value_mapping"] = mapping_dict.index.map(mapped_vals) @@ -269,23 +273,22 @@ def create_mapping(self, save=True, file_name="mapping_file") -> pd.DataFrame: f"The following schema fields have not been mapped: {list(unmapped)}", UserWarning, ) + + # turn lists & dicts into strings for consistancy with saved CSV + mapping_dict["target_values"] = mapping_dict["target_values"].apply( + lambda x: (", ".join(str(item) for item in x) if isinstance(x, list) else x) + ) + mapping_dict["value_mapping"] = mapping_dict["value_mapping"].apply( + lambda x: ( + ", ".join(f"{k}={v}" for k, v in x.items()) + if isinstance(x, dict) + else x + ) + ) + if save is False: return mapping_dict else: - # turn lists & dicts into strings to save to CSV - mapping_dict["target_values"] = mapping_dict["target_values"].apply( - lambda x: ( - ", ".join(str(item) for item in x) if isinstance(x, list) else x - ) - ) - mapping_dict["value_mapping"] = mapping_dict["value_mapping"].apply( - lambda x: ( - ", ".join(f"{k}={v}" for k, v in x.items()) - if isinstance(x, dict) - else x - ) - ) - # Write to CSV if not file_name.endswith(".csv"): file_name += ".csv" @@ -324,6 +327,10 @@ def create_mapping( Which LLM to use, currently only 'openai' is supported. config Path to a JSON file containing the configuration for autoparser. + save + Whether to save the mapping to a CSV file. + file_name + Name of the file to save the mapping to. Returns ------- diff --git a/tests/test_autoparser/test_mapper.py b/tests/test_autoparser/test_mapper.py index d4fd93b..0fcee02 100644 --- a/tests/test_autoparser/test_mapper.py +++ b/tests/test_autoparser/test_mapper.py @@ -344,11 +344,8 @@ def test_class_create_mapping_no_save(): "source_description": "Pet Animal", "source_field": "AnimalDeCompagnie", "common_values": "Oui, Non, non", - "target_values": ["True", "False", "None"], - "value_mapping": { - "oui": "True", - "non": "False", - }, + "target_values": "True, False, None", + "value_mapping": "oui=True, non=False", }, name="pet", )