Integrating GPQA Dataset (#47)
vshrivas authored Dec 3, 2024
1 parent 55c42c6 commit 7a27e6c
Showing 8 changed files with 289 additions and 2 deletions.
2 changes: 2 additions & 0 deletions eureka_ml_insights/configs/__init__.py
@@ -16,6 +16,7 @@
from .experiment_config import ExperimentConfig, create_logdir
from .flenqa import FlenQA_Experiment_Pipeline
from .geometer import GEOMETER_PIPELINE
from .gpqa import GPQA_Experiment_Pipeline
from .ifeval import IFEval_PIPELINE
from .image_understanding.object_detection import (
OBJECT_DETECTION_PAIRS_LOCAL_PIPELINE,
@@ -115,6 +116,7 @@
MAZE_REPORTING_PIPELINE,
IFEval_PIPELINE,
FlenQA_Experiment_Pipeline,
GPQA_Experiment_Pipeline,
Drop_Experiment_Pipeline,
GEOMETER_PIPELINE,
MMMU_BASELINE_PIPELINE,
117 changes: 117 additions & 0 deletions eureka_ml_insights/configs/gpqa.py
@@ -0,0 +1,117 @@
import os
from typing import Any

from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
ColumnMatchMapTransform,
CopyColumn,
DataReader,
HFDataReader,
ImputeNA,
MMDataLoader,
RegexTransform,
SequenceTransform,
ShuffleColumnsTransform,
)
from eureka_ml_insights.metrics import CountAggregator, ExactMatch

from .config import (
AggregatorConfig,
DataSetConfig,
EvalReportingConfig,
InferenceConfig,
MetricConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
)
from .experiment_config import ExperimentConfig

"""This file contains user defined configuration classes for the geometric reasoning task on the GPQA dataset.
"""


class GPQA_Experiment_Pipeline(ExperimentConfig):
def configure_pipeline(
self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
) -> PipelineConfig:
# Configure the data processing component.
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
HFDataReader,
{
"path": "Idavidrein/gpqa",
"tasks": "gpqa_diamond",
"split": "train",
"transform": SequenceTransform(
[
CopyColumn(column_name_src="Correct Answer", column_name_dst="A"),
CopyColumn(column_name_src="Incorrect Answer 1", column_name_dst="B"),
CopyColumn(column_name_src="Incorrect Answer 2", column_name_dst="C"),
CopyColumn(column_name_src="Incorrect Answer 3", column_name_dst="D"),
ShuffleColumnsTransform(columns=["A", "B", "C", "D"]),
# finds answer choice that "Correct Answer" is mapped to, and stores it in "ground_truth"
ColumnMatchMapTransform(
new_col="ground_truth", key_col="Correct Answer", columns=["A", "B", "C", "D"]
),
]
),
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__),
"../prompt_templates/gpqa_templates/basic.jinja",
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
)
# Configure the inference component.
self.inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
)
# Configure the evaluation and reporting component.
self.evalreporting_comp = EvalReportingConfig(
component_type=EvalReporting,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": SequenceTransform(
[
CopyColumn(
column_name_src="model_output",
column_name_dst="raw_model_output",
),
RegexTransform(
columns="model_output",
prompt_pattern=r"My answer is (\w)(?=\s|\W|$)",
case=True,
),
ImputeNA(columns="model_output", value=""),
]
),
},
),
metric_config=MetricConfig(ExactMatch),
aggregator_configs=[
AggregatorConfig(CountAggregator, {"column_names": ["ExactMatch_result"], "normalize": True})
],
output_dir=os.path.join(self.log_dir, "eval_report"),
)
# Configure the pipeline.
return PipelineConfig(
[
self.data_processing_comp,
self.inference_comp,
self.evalreporting_comp,
],
self.log_dir,
)
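A quick sanity check of the answer-extraction pattern configured in the eval component above, run against a few invented model outputs with plain `re` (a sketch only; in the pipeline the pattern is applied by `RegexTransform`, whose exact semantics may differ):

import re

# Pattern copied from the RegexTransform config above.
pattern = r"My answer is (\w)(?=\s|\W|$)"

samples = [
    "My answer is B.",             # extracts "B"
    "My answer is C",              # extracts "C"; end of string satisfies the lookahead
    "I believe the answer is A.",  # no match; ImputeNA later fills "" for such rows
]
for text in samples:
    match = re.search(pattern, text)
    print(match.group(1) if match else "<no match>")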
2 changes: 1 addition & 1 deletion eureka_ml_insights/configs/model_configs.py
@@ -193,4 +193,4 @@
},
"model_name": "Mistral-large-2407",
},
-)
+)
4 changes: 4 additions & 0 deletions eureka_ml_insights/data_utils/__init__.py
@@ -22,6 +22,7 @@
AddColumn,
AddColumnAndData,
ASTEvalTransform,
ColumnMatchMapTransform,
ColumnRename,
CopyColumn,
DFTransformBase,
@@ -34,6 +35,7 @@
RunPythonTransform,
SamplerTransform,
SequenceTransform,
ShuffleColumnsTransform,
TokenCounterTransform,
)

@@ -69,5 +71,7 @@
ExtractAnswerGrid,
ExtractAnswerSpatialMap,
ExtractAnswerMaze,
ShuffleColumnsTransform,
ColumnMatchMapTransform,
TokenCounterTransform,
]
58 changes: 58 additions & 0 deletions eureka_ml_insights/data_utils/transform.py
@@ -4,6 +4,7 @@
from dataclasses import dataclass, field
from typing import Dict, List

import numpy as np
import pandas as pd
import tiktoken

@@ -180,6 +181,63 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
return df


@dataclass
class ShuffleColumnsTransform(MultiColumnTransform):
"""
For a set of columns, shuffles the values across each row of these columns.
Values will be shuffled differently for each row.
This class is meant to be used in MCQ benchmarks to shuffle answer choices
across different letter options (e.g. shuffle what choice maps to 'A' vs 'B' vs 'C').
"""

columns: List[str]

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""For each row in df, shuffle values across these columns."""
self.validate(df)

def shuffle_row(row):
row[self.columns] = np.random.permutation(row[self.columns].values)
return row

df = df.apply(shuffle_row, axis=1)
return df
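# Illustrative usage (not part of this commit; the DataFrame below is invented):
#
#   np.random.seed(0)  # seed only to make the example reproducible
#   df = pd.DataFrame({"A": ["w"] * 3, "B": ["x"] * 3, "C": ["y"] * 3, "D": ["z"] * 3})
#   ShuffleColumnsTransform(columns=["A", "B", "C", "D"]).transform(df)
#   # each row still contains {"w", "x", "y", "z"}, but in an independent random order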


@dataclass
class ColumnMatchMapTransform(DFTransformBase):
"""
Creates a new column indicating the name of the column that matches the value in the key column for each row.
E.g. for a row, if value of key_col matches value of 'A' column, new_col will contain the value 'A'.
Used to store the letter of the correct answer choice in MCQ benchmarks.
"""

key_col: str
new_col: str
columns: List[str]

# Function to find matching column
def _find_matching_column(self, row):
for col in self.columns:
if row[col] == row[self.key_col]:
return col
return None  # no matching column found for this row

def validate(self, df: pd.DataFrame):
"""Check that all columns to be transformed are actually present in the data frame."""
missing_columns = set(self.columns + [self.key_col]) - set(df.columns)
if missing_columns:
msg = ", ".join(sorted(missing_columns))
raise ValueError(f"The following columns are not present in the data frame: {msg}")

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""For each row in df, shuffle values across these columns."""
self.validate(df)
df[self.new_col] = df.apply(self._find_matching_column, axis=1)
return df
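# Illustrative usage (not part of this commit; the DataFrame below is invented):
#
#   df = pd.DataFrame({"key": ["x", "z"], "A": ["x", "y"], "B": ["y", "z"]})
#   t = ColumnMatchMapTransform(key_col="key", new_col="match", columns=["A", "B"])
#   t.transform(df)["match"].tolist()  # -> ["A", "B"]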


@dataclass
class ImputeNA(MultiColumnTransform):
"""Impute missing values in selected columns with a specified value."""
Expand Down
6 changes: 6 additions & 0 deletions eureka_ml_insights/prompt_templates/gpqa_templates/basic.jinja
@@ -0,0 +1,6 @@
Answer the following question by saying 'My answer is <letter of your answer choice>.' Don't provide any explanations or other information.
Question: {{Question}}
A) {{A}}
B) {{B}}
C) {{C}}
D) {{D}}
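For reference, a minimal sketch of how this template renders once the transforms above have filled the answer columns. It assumes the `jinja2` package and inlines a copy of the template; the question and choices are invented:

from jinja2 import Template

# Inline copy of basic.jinja, for illustration only.
template = Template(
    "Answer the following question by saying 'My answer is <letter of your answer choice>.' "
    "Don't provide any explanations or other information.\n"
    "Question: {{Question}}\n"
    "A) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}"
)
print(
    template.render(
        Question="Which particle mediates the electromagnetic force?",
        A="Photon",
        B="Gluon",
        C="W boson",
        D="Graviton",
    )
)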
82 changes: 82 additions & 0 deletions tests/data_utils_tests/data_tests.py
@@ -19,6 +19,8 @@
RunPythonTransform,
SequenceTransform,
TokenCounterTransform,
ShuffleColumnsTransform,
ColumnMatchMapTransform,
)


@@ -146,6 +148,86 @@ def test_mm_loader(self):
for _, model_inputs in self.data_loader:
self.assertTrue(isinstance(model_inputs[1][0], Image.Image))

class TestShuffleColumns(unittest.TestCase):
"""Testing the ShuffleColumnsTransform used in MCQ benchmarks to shuffle answer choices."""
def setUp(self):
self.df = pd.DataFrame(
{
"A": [1, 2, 3, 4, 5],
"B": ["a", "b", "c", "d", "e"],
"C": [-10, -20, -30, -40, -50],
"D": ["hi", "how", "are", "you", "?"],
}
)
self.shuffle_transform = ShuffleColumnsTransform(columns=["A", "B", "C"])

def test_shuffle_columns_values(self):
# Apply the transformation twice
np.random.seed(42)
transformed_df_1 = self.shuffle_transform.transform(self.df.copy())
np.random.seed(0)
transformed_df_2 = self.shuffle_transform.transform(self.df.copy())

# Columns that should remain unchanged
unshuffled_columns = [col for col in self.df.columns if col not in self.shuffle_transform.columns]

# Ensure each row has the same set of values in the shuffled columns after both transformations
for _, row in self.df.iterrows():
original_values = set(row[self.shuffle_transform.columns])

# Get the transformed row values for both shuffles
transformed_values_1 = set(transformed_df_1.loc[row.name, self.shuffle_transform.columns])
transformed_values_2 = set(transformed_df_2.loc[row.name, self.shuffle_transform.columns])

# Check that each transformed row has the same set of values as the original
self.assertEqual(original_values, transformed_values_1)
self.assertEqual(original_values, transformed_values_2)

# Verify that the order is different between the two shuffles
self.assertNotEqual(
tuple(transformed_df_1.loc[row.name, self.shuffle_transform.columns]),
tuple(transformed_df_2.loc[row.name, self.shuffle_transform.columns]),
)

# Ensure unshuffled columns remain the same in both transformations
for col in unshuffled_columns:
pd.testing.assert_series_equal(self.df[col], transformed_df_1[col], check_exact=True)
pd.testing.assert_series_equal(self.df[col], transformed_df_2[col], check_exact=True)

class TestColMatchMap(unittest.TestCase):
"""
Testing the ColumnMatchMapTransform used in MCQ benchmarks to store the letter of the correct
answer choice.
"""
def setUp(self):
# Seed the random number generator for reproducibility
np.random.seed(42)

# Sample DataFrame
self.values = [
{
"df": pd.DataFrame(
{
"A": [1, 2, 3, 4, "e"],
"B": ["a", "b", "c", "d", "e"],
"C": ["a", -20, -30, "d", -50],
"D": ["hi", "b", "c", "you", "?"],
}
),
"cols": ["A", "C", "D"],
"key_col": "B",
"ground_truth": ["C", "D", "D", "C", "A"],
}
]

def test_col_match_map(self):
for val in self.values:
self.col_match_map_transform = ColumnMatchMapTransform(
key_col=val["key_col"], new_col="ground_truth", columns=val["cols"]
)
df = val["df"]
df = self.col_match_map_transform.transform(df)
self.assertEqual(list(df["ground_truth"]), val["ground_truth"])

if __name__ == "__main__":
unittest.main()
20 changes: 19 additions & 1 deletion tests/pipeline_tests.py
@@ -28,6 +28,7 @@
SPATIAL_MAP_TEXTONLY_PIPELINE,
SPATIAL_REASONING_SINGLE_PIPELINE,
VISUAL_PROMPTING_SINGLE_PIPELINE,
GPQA_Experiment_Pipeline,
Drop_Experiment_Pipeline,
IFEval_PIPELINE,
MetricConfig,
@@ -281,12 +282,24 @@ def configure_pipeline(self, resume_from=None):
self.data_processing_comp.data_reader_config.init_args["split"] = "dev"
self.data_processing_comp.data_reader_config.init_args["tasks"] = ["Math"]

- self.inference_comp.data_loader_config.class_name = TestMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config


class TEST_GPQA_PIPELINE(GPQA_Experiment_Pipeline):
# Test config for the GPQA benchmark with TestModel and TestDataLoader
def configure_pipeline(self):
config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
self.inference_comp.data_loader_config.class_name = TestDataLoader
self.inference_comp.data_loader_config.init_args = {
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"n_iter": N_ITER,
}
return config

class TEST_DROP_PIPELINE(Drop_Experiment_Pipeline):
# Test config for the Drop benchmark with TestModel and TestDataLoader
def configure_pipeline(self):
config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
self.inference_comp.data_loader_config.class_name = TestDataLoader
@@ -457,6 +470,11 @@ class KITAB_ONE_BOOK_CONSTRAINT_PIPELINE_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_KITAB_ONE_BOOK_CONSTRAINT_PIPELINE().pipeline_config

@unittest.skipIf("skip_tests_with_missing_ds" in os.environ, "Missing public dataset. TODO: revert")
class GPQA_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_GPQA_PIPELINE().pipeline_config


class DROP_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
