-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Neel/spatial map answer extraction #57
base: main
Are you sure you want to change the base?
Changes from 5 commits
51cc1db
bb2dd08
c4b6bb0
46e3573
09ca286
b890e1a
74107ad
c7b848e
ae456f1
c4784b3
82a4eb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -157,25 +157,34 @@ def extract_answer_from_text_grid(text, question_type): | |
return None # Return None if no numbers are found | ||
|
||
|
||
def extract_answer_from_text_map(text, question_type, model_name): | ||
def extract_answer_from_text_map_and_maze(model_output_raw, options): | ||
""" | ||
Extracts the answer from the text based on specific patterns, | ||
and as a fallback, extracts the first number if no patterns match. | ||
The code is from: https://github.com/alvinmingwisc/spatial_reason_vlm/tree/main/eval, | ||
and included with minimal modifications. | ||
Extracts the answer from the text based on known model output patterns. | ||
Searches for both a letter and whole word answer and returns both as they are not | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, is this why you have added the OR metrics? If so, I don't think this justifies adding new metric classes; instead the answer extractor should return some combination of these two, maybe simply "x or y". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Happy to do this another way, especially if that means less new code. However, if, let's say, I return "x or y", I still need a metric that knows how to check this. The current ones just look for a case-sensitive or insensitive exact match. One thought was to make one metric that can take a single value or a list (instead of having two as now) and an optional parameter for how to combine, i.e. any ("or") or all ("and"). What do you think? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now returning the "x or y" string |
||
always consistent. | ||
|
||
Args: | ||
- text (str): The text containing the model's answer. | ||
- question_type (str): The text containing the question type. | ||
- model_name (str): The model name. | ||
- model_output_raw (str): The text containing the model's answer. | ||
- options (str): The list of options. | ||
|
||
Returns: | ||
- str or None: The extracted answer, or None if no answer could be extracted. | ||
- str or None: The extracted answers, or empty strings if no answer could be extracted. | ||
""" | ||
# Mapping of textual numbers to their numeric equivalents | ||
|
||
# replace common substitutions in model outputs | ||
|
||
model_output_parsed_letter = "" | ||
model_output_parsed = "" | ||
|
||
if not model_output_raw: | ||
return [model_output_parsed, model_output_parsed_letter] | ||
|
||
model_output_raw = re.sub(r"\bno objects\b", "0 objects", model_output_raw, re.IGNORECASE) | ||
model_output_raw = re.sub(r"\bnot\b", "no", model_output_raw, re.IGNORECASE) | ||
model_output_raw = re.sub(r"\bshould be\b", "is", model_output_raw, re.IGNORECASE) | ||
|
||
number_mapping = { | ||
"zero": 0, | ||
"no": 0, | ||
"zero": 0, | ||
"one": 1, | ||
"two": 2, | ||
"three": 3, | ||
|
@@ -187,127 +196,71 @@ def extract_answer_from_text_map(text, question_type, model_name): | |
"nine": 9, | ||
} | ||
|
||
dirs = ["southeast", "northeast", "northwest", "southwest"] | ||
dir_pattern = rf"\b({'|'.join(dirs)})\b" | ||
|
||
if text is None: | ||
return None | ||
|
||
question_id = int(re.search("[0-9]", re.search("Q[0-9]", question_type).group()).group()) | ||
|
||
if question_id == 0: | ||
direction_match = re.search(r"\b[A-D]\.\s*(" + "|".join(dirs) + r")\b", text, re.IGNORECASE) | ||
if direction_match: | ||
return direction_match.group(1).lower() | ||
|
||
match = re.search(dir_pattern, text, re.IGNORECASE) | ||
if match: | ||
return match.group(1) | ||
return None | ||
|
||
elif question_id == 1: | ||
match = re.search( | ||
rf"^([\w\s\'\']+?)\s+is\s+(?:located\s+|in\s+the\s+|located\s+to\s+the\s+)({dir_pattern})", | ||
text, | ||
re.IGNORECASE, | ||
) | ||
|
||
if match: | ||
string = match.group(1) | ||
return string | ||
|
||
match = re.search(r"\b[A-D]\.\s*(.*)", text) # problem with extracting . | ||
|
||
if match: | ||
string = match.group(1) | ||
string = remove_redundancy(string) | ||
string = extract_before_is(string) | ||
return string | ||
|
||
match = re.search(r"\b([ABCD][.,]|[(][abcdABCD][)])\s*(.*?)(?=\sis\b|\.|,|<|$)", text) | ||
if match: | ||
answer = match.group(1).strip() | ||
# Remove trailing punctuation if any | ||
answer = re.sub(r"[\.,\?!<]+$", "", answer) | ||
return answer | ||
|
||
match = re.search( | ||
rf"Therefore, the object in the {dir_pattern} of [\w\s\'\']+ is ([\w\s\'\']+)", text, re.IGNORECASE | ||
) | ||
if match: | ||
string = match.group(2) | ||
return string | ||
|
||
if "claude" in model_name.lower(): | ||
match = re.search(rf"^([\w\s\'\']+?)\s+is\s+(to\s+the\s+)({dir_pattern})", text, re.IGNORECASE) | ||
if match: | ||
string = match.group(1) | ||
return string | ||
|
||
if "gemini" in model_name.lower(): | ||
patterns = [ | ||
rf"\*\*Concise Answer:\*\*\n([\w\s\'\']+?)\s+is\s+(?:located\s+|in\s+the\s+|in\s+|located\s+to\s+the\s+)({dir_pattern})", | ||
rf"\*\*Answer:\*\*\s+([\w\s\'\']+?)\s+is\s+in\s+the\s+({dir_pattern})\s+of\s+([\w\s\'\']+)", | ||
r"\*\*Answer:\*\*\n([\w\s\'\']+)", | ||
r"\*\*Answer\*\*:\s+([\w\s\'\']+)", | ||
r"\*\*Answer:\*\*\s+([\w\s\'\']+)", | ||
] | ||
|
||
for pattern in patterns: | ||
match = re.search(pattern, text, re.IGNORECASE) | ||
if match: | ||
return match.group(1) | ||
|
||
if "gpt-4o" in model_name.lower() or "gpt4o" in model_name.lower(): | ||
match = re.search( | ||
rf"Concise Answer:\s+([\w\s\'\']+?)\s+is\s+(?:located\s+|in\s+the\s+|in\s+|located\s+to\s+the\s+)({dir_pattern})", | ||
text, | ||
re.IGNORECASE, | ||
) | ||
if match: | ||
string = match.group(1) | ||
return string | ||
|
||
# If no match, check for an answer following "is", with specific end markers defined | ||
match = re.search(r"\bis\b\s+(.*?)(?=\.|,|<|$)", text) | ||
if match: | ||
answer = match.group(1).strip() | ||
# Remove trailing punctuation if any | ||
answer = re.sub(r"[\.,\?!<]+$", "", answer) | ||
return answer | ||
|
||
return None # Return None if no match is found | ||
|
||
elif question_id == 2: | ||
match = re.search(r"\b[A-D]\.\s*(\d+)", text) # match number only | ||
if match: | ||
return match.group(1) | ||
# Create a list to store all found numbers along with their positions | ||
found_numbers = [] | ||
|
||
# Check for textual numbers and their positions | ||
for text_num, num in number_mapping.items(): | ||
for match in re.finditer(rf"\b{text_num}\b", text, re.IGNORECASE): | ||
found_numbers.append((match.start(), num)) | ||
|
||
# Check for digit sequences and their positions, specifically ignoring list markers at the start | ||
# Exclude numbers following "\n\n" and directly followed by ". " | ||
text = re.sub(r"^\n\n\d+\.\s", "", text) # Remove the leading list marker if it exists | ||
|
||
for match in re.finditer(r"\d+", text): | ||
found_numbers.append((match.start(), int(match.group(0)))) | ||
|
||
# Sort found numbers by their positions (smallest position first) | ||
if found_numbers: | ||
found_numbers.sort(key=lambda x: x[0]) | ||
# Return the number associated with the earliest position | ||
return str(found_numbers[0][1]) | ||
return None | ||
|
||
else: | ||
raise ValueError(f"Question ID {question_id} is not supported.") | ||
|
||
return None # Return None if no numbers are found | ||
for k, v in number_mapping.items(): | ||
model_output_raw = re.sub(rf"\b{k}\b", str(v), model_output_raw, re.IGNORECASE) | ||
|
||
# get dict of options from options string | ||
options_dict = {x.split(".")[0].strip().lower():x.split(".")[1].strip().lower() for x in options} | ||
|
||
|
||
model_output_parsed_letter = "" | ||
model_output_parsed = "" | ||
|
||
answers = [v for k, v in options_dict.items()] | ||
answers_pattern = rf"\b({'|'.join(answers)})\b" | ||
|
||
if "Answer:".lower() in model_output_raw.lower(): | ||
pattern_letter = r"^\**Answer:\**\s+(\w)\. (\w+)" | ||
matches = re.search(pattern_letter, model_output_raw, re.IGNORECASE) | ||
if matches: | ||
match_option = matches.group(1).lower() | ||
neelsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if match_option in options_dict: | ||
model_output_parsed_letter = options_dict[match_option] | ||
else: | ||
model_output_parsed_letter = match_option | ||
|
||
pattern_phrase = r"Answer:\**\s+([^\n]+)" | ||
matches = re.search(pattern_phrase, model_output_raw, re.IGNORECASE) | ||
if matches: | ||
model_output_answer_line = matches.group(1) | ||
|
||
answers_match = re.search(answers_pattern, model_output_answer_line, re.IGNORECASE) | ||
|
||
if answers_match: | ||
model_output_parsed = answers_match.group(1) | ||
else: | ||
letters = [k for k, v in options_dict.items()] | ||
letters_pattern = rf"\b({'|'.join(letters)})\b" | ||
letters_pattern_match = re.search(letters_pattern, model_output_answer_line, re.IGNORECASE) | ||
|
||
if letters_pattern_match: | ||
match_option = letters_pattern_match.group(1).lower() | ||
model_output_parsed_letter = options_dict[match_option] | ||
|
||
elif "answer is".lower() in model_output_raw.lower(): | ||
pattern_letter = r'answer is:*\s*\**([\w\d]+)[\s:.]*\**' | ||
|
||
# first look for a single letter answer | ||
matches = re.search(pattern_letter, model_output_raw, re.IGNORECASE) | ||
if matches: | ||
match_option = matches.group(1).lower() | ||
if match_option in options_dict: | ||
model_output_parsed_letter = options_dict[match_option] | ||
else: | ||
model_output_parsed_letter = match_option | ||
|
||
# next look if any of the options names are present in the first line | ||
|
||
model_output_answer_line = model_output_raw.splitlines()[0] | ||
|
||
answers = [v for k, v in options_dict.items()] | ||
answers_pattern = rf"\b({'|'.join(answers)})\b" | ||
answers_match = re.search(answers_pattern, model_output_answer_line, re.IGNORECASE) | ||
|
||
if answers_match: | ||
model_output_parsed = answers_match.group(1) | ||
|
||
return [model_output_parsed, model_output_parsed_letter] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you mind adding a test for the extractor to showcase/check all these cases that you have covered? regex is hard to read and review, a test would give me peace of mind. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added in a new file under tests/data_utils_tests |
||
|
||
|
||
def extract_answer_from_text_maze(text, question_type): | ||
|
@@ -443,43 +396,59 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame: | |
) | ||
return df | ||
|
||
|
||
@dataclass | ||
class ExtractAnswerGrid(ExtractAnswer): | ||
"""This class is an answer extractor for the GRID benchmark.""" | ||
class ExtractQuestionOptions(DFTransformBase): | ||
"""This class is for extracting the option list from a prompt.""" | ||
|
||
answer_column_name: str | ||
extracted_answer_column_name: str | ||
question_type_column_name: str | ||
mode: str | ||
prompt_column_name: str | ||
extracted_options_column_name: str | ||
|
||
@abstractmethod | ||
def _parse_answer_function(self, answer_text, question_type): | ||
return extract_answer_from_text_grid(answer_text, question_type) | ||
def _extract_options_from_text_map(self, prompt): | ||
""" | ||
Extracts the options list from the text. | ||
neelsj marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
Args: | ||
- text (str): The text containing the prompt. | ||
|
||
Returns: | ||
- str or None: The extracted list of options. | ||
""" | ||
|
||
# get list of options from prompt | ||
prompt_lines = prompt.splitlines() | ||
matches = [i for i, x in enumerate(prompt_lines) if "Available options:" in x] | ||
options = prompt_lines[matches[0]+1:matches[0]+5] | ||
|
||
return options | ||
|
||
def transform(self, df: pd.DataFrame) -> pd.DataFrame: | ||
df[self.extracted_options_column_name] = df[self.prompt_column_name].apply(self._extract_options_from_text_map) | ||
return df | ||
|
||
@dataclass | ||
class ExtractAnswerSpatialMap(ExtractAnswer): | ||
"""This class is an answer extractor for the SPATIAL_MAP benchmark.""" | ||
class ExtractAnswerGrid(ExtractAnswer): | ||
"""This class is an answer extractor for the GRID benchmark.""" | ||
|
||
answer_column_name: str | ||
extracted_answer_column_name: str | ||
question_type_column_name: str | ||
model_name: str | ||
mode: str | ||
|
||
@abstractmethod | ||
def _parse_answer_function(self, answer_text, question_type): | ||
return extract_answer_from_text_map(answer_text, question_type, self.model_name) | ||
return extract_answer_from_text_grid(answer_text, question_type) | ||
|
||
|
||
@dataclass | ||
class ExtractAnswerMaze(ExtractAnswer): | ||
"""This class is an answer extractor for the MAZE benchmark.""" | ||
class ExtractAnswerSpatialMapAndMaze(DFTransformBase): | ||
"""This class is an answer extractor for the SPATIAL_MAP and MAZE benchmark.""" | ||
|
||
answer_column_name: str | ||
extracted_answer_column_name: str | ||
question_type_column_name: str | ||
extracted_options_column_name: str | ||
|
||
@abstractmethod | ||
def _parse_answer_function(self, answer_text, question_type): | ||
return extract_answer_from_text_maze(answer_text, question_type) | ||
def transform(self, df: pd.DataFrame) -> pd.DataFrame: | ||
df[self.extracted_answer_column_name] = df.apply( | ||
lambda x: extract_answer_from_text_map_and_maze(x[self.answer_column_name], x[self.extracted_options_column_name]), axis=1 | ||
) | ||
return df |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add "ExtractQuestionOptions" so that when we run the formatters this import does not get removed (considered unused).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ahh, yes will do
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done and checked in