Integrating GPQA Dataset (#47)
vshrivas authored Dec 3, 2024
1 parent 55c42c6 commit 7a27e6c
Showing 8 changed files with 289 additions and 2 deletions.
2 changes: 2 additions & 0 deletions eureka_ml_insights/configs/__init__.py
@@ -16,6 +16,7 @@
from .experiment_config import ExperimentConfig, create_logdir
from .flenqa import FlenQA_Experiment_Pipeline
from .geometer import GEOMETER_PIPELINE
from .gpqa import GPQA_Experiment_Pipeline
from .ifeval import IFEval_PIPELINE
from .image_understanding.object_detection import (
OBJECT_DETECTION_PAIRS_LOCAL_PIPELINE,
@@ -115,6 +116,7 @@
MAZE_REPORTING_PIPELINE,
IFEval_PIPELINE,
FlenQA_Experiment_Pipeline,
GPQA_Experiment_Pipeline,
Drop_Experiment_Pipeline,
GEOMETER_PIPELINE,
MMMU_BASELINE_PIPELINE,
117 changes: 117 additions & 0 deletions eureka_ml_insights/configs/gpqa.py
@@ -0,0 +1,117 @@
import os
from typing import Any

from eureka_ml_insights.core import EvalReporting, Inference, PromptProcessing
from eureka_ml_insights.data_utils import (
ColumnMatchMapTransform,
CopyColumn,
DataReader,
HFDataReader,
ImputeNA,
MMDataLoader,
RegexTransform,
SequenceTransform,
ShuffleColumnsTransform,
)
from eureka_ml_insights.metrics import CountAggregator, ExactMatch

from .config import (
AggregatorConfig,
DataSetConfig,
EvalReportingConfig,
InferenceConfig,
MetricConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
)
from .experiment_config import ExperimentConfig

"""This file contains user defined configuration classes for the geometric reasoning task on the GPQA dataset.
"""


class GPQA_Experiment_Pipeline(ExperimentConfig):
def configure_pipeline(
self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
) -> PipelineConfig:
# Configure the data processing component.
self.data_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
HFDataReader,
{
"path": "Idavidrein/gpqa",
"tasks": "gpqa_diamond",
"split": "train",
"transform": SequenceTransform(
[
CopyColumn(column_name_src="Correct Answer", column_name_dst="A"),
CopyColumn(column_name_src="Incorrect Answer 1", column_name_dst="B"),
CopyColumn(column_name_src="Incorrect Answer 2", column_name_dst="C"),
CopyColumn(column_name_src="Incorrect Answer 3", column_name_dst="D"),
ShuffleColumnsTransform(columns=["A", "B", "C", "D"]),
# finds answer choice that "Correct Answer" is mapped to, and stores it in "ground_truth"
ColumnMatchMapTransform(
new_col="ground_truth", key_col="Correct Answer", columns=["A", "B", "C", "D"]
),
]
),
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__),
"../prompt_templates/gpqa_templates/basic.jinja",
),
output_dir=os.path.join(self.log_dir, "data_processing_output"),
)
# Configure the inference component.
self.inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl")},
),
output_dir=os.path.join(self.log_dir, "inference_result"),
resume_from=resume_from,
)
# Configure the evaluation and reporting component.
self.evalreporting_comp = EvalReportingConfig(
component_type=EvalReporting,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": SequenceTransform(
[
CopyColumn(
column_name_src="model_output",
column_name_dst="raw_model_output",
),
RegexTransform(
columns="model_output",
prompt_pattern=r"My answer is (\w)(?=\s|\W|$)",
case=True,
),
ImputeNA(columns="model_output", value=""),
]
),
},
),
metric_config=MetricConfig(ExactMatch),
aggregator_configs=[
AggregatorConfig(CountAggregator, {"column_names": ["ExactMatch_result"], "normalize": True})
],
output_dir=os.path.join(self.log_dir, "eval_report"),
)
# Configure the pipeline.
return PipelineConfig(
[
self.data_processing_comp,
self.inference_comp,
self.evalreporting_comp,
],
self.log_dir,
)
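A quick sanity check of the answer-extraction pattern configured in the eval component above, run against a few invented model outputs with plain `re` (a sketch only; in the pipeline the pattern is applied by `RegexTransform`, whose exact semantics may differ):

import re

# Pattern copied from the RegexTransform config above.
pattern = r"My answer is (\w)(?=\s|\W|$)"

samples = [
    "My answer is B.",             # extracts "B"
    "My answer is C",              # extracts "C"; end of string satisfies the lookahead
    "I believe the answer is A.",  # no match; ImputeNA later fills "" for such rows
]
for text in samples:
    match = re.search(pattern, text)
    print(match.group(1) if match else "<no match>")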
2 changes: 1 addition & 1 deletion eureka_ml_insights/configs/model_configs.py
@@ -193,4 +193,4 @@
},
"model_name": "Mistral-large-2407",
},
-)
+)
4 changes: 4 additions & 0 deletions eureka_ml_insights/data_utils/__init__.py
@@ -22,6 +22,7 @@
AddColumn,
AddColumnAndData,
ASTEvalTransform,
ColumnMatchMapTransform,
ColumnRename,
CopyColumn,
DFTransformBase,
@@ -34,6 +35,7 @@
RunPythonTransform,
SamplerTransform,
SequenceTransform,
ShuffleColumnsTransform,
TokenCounterTransform,
)

@@ -69,5 +71,7 @@
ExtractAnswerGrid,
ExtractAnswerSpatialMap,
ExtractAnswerMaze,
ShuffleColumnsTransform,
ColumnMatchMapTransform,
TokenCounterTransform,
]
58 changes: 58 additions & 0 deletions eureka_ml_insights/data_utils/transform.py
@@ -4,6 +4,7 @@
from dataclasses import dataclass, field
from typing import Dict, List

import numpy as np
import pandas as pd
import tiktoken

@@ -180,6 +181,63 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
return df


@dataclass
class ShuffleColumnsTransform(MultiColumnTransform):
"""
For a set of columns, shuffles the values across each row of these columns.
Values will be shuffled differently for each row.
This class is meant to be used in MCQ benchmarks to shuffle answer choices
across different letter options (e.g. shuffle what choice maps to 'A' vs 'B' vs 'C').
"""

columns: List[str]

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""For each row in df, shuffle values across these columns."""
self.validate(df)

def shuffle_row(row):
row[self.columns] = np.random.permutation(row[self.columns].values)
return row

df = df.apply(shuffle_row, axis=1)
return df
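# Illustrative usage (not part of this commit; the DataFrame below is invented):
#
#   np.random.seed(0)  # seed only to make the example reproducible
#   df = pd.DataFrame({"A": ["w"] * 3, "B": ["x"] * 3, "C": ["y"] * 3, "D": ["z"] * 3})
#   ShuffleColumnsTransform(columns=["A", "B", "C", "D"]).transform(df)
#   # each row still contains {"w", "x", "y", "z"}, but in an independent random order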


@dataclass
class ColumnMatchMapTransform(DFTransformBase):
"""
Creates a new column indicating the name of the column that matches the value in the key column for each row.
E.g. for a row, if value of key_col matches value of 'A' column, new_col will contain the value 'A'.
Used to store the letter of the correct answer choice in MCQ benchmarks.
"""

key_col: str
new_col: str
columns: List[str]

# Function to find matching column
def _find_matching_column(self, row):
for col in self.columns:
if row[col] == row[self.key_col]:
return col
return None  # no matching column found for this row

def validate(self, df: pd.DataFrame):
"""Check that all columns to be transformed are actually present in the data frame."""
missing_columns = set(self.columns + [self.key_col]) - set(df.columns)
if missing_columns:
msg = ", ".join(sorted(missing_columns))
raise ValueError(f"The following columns are not present in the data frame: {msg}")

def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""For each row in df, shuffle values across these columns."""
self.validate(df)
df[self.new_col] = df.apply(self._find_matching_column, axis=1)
return df
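# Illustrative usage (not part of this commit; the DataFrame below is invented):
#
#   df = pd.DataFrame({"key": ["x", "z"], "A": ["x", "y"], "B": ["y", "z"]})
#   t = ColumnMatchMapTransform(key_col="key", new_col="match", columns=["A", "B"])
#   t.transform(df)["match"].tolist()  # -> ["A", "B"]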


@dataclass
class ImputeNA(MultiColumnTransform):
"""Impute missing values in selected columns with a specified value."""
Expand Down
6 changes: 6 additions & 0 deletions eureka_ml_insights/prompt_templates/gpqa_templates/basic.jinja
@@ -0,0 +1,6 @@
Answer the following question by saying 'My answer is <letter of your answer choice>.' Don't provide any explanations or other information.
Question: {{Question}}
A) {{A}}
B) {{B}}
C) {{C}}
D) {{D}}
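For reference, a minimal sketch of how this template renders once the transforms above have filled the answer columns. It assumes the `jinja2` package and inlines a copy of the template; the question and choices are invented:

from jinja2 import Template

# Inline copy of basic.jinja, for illustration only.
template = Template(
    "Answer the following question by saying 'My answer is <letter of your answer choice>.' "
    "Don't provide any explanations or other information.\n"
    "Question: {{Question}}\n"
    "A) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}"
)
print(
    template.render(
        Question="Which particle mediates the electromagnetic force?",
        A="Photon",
        B="Gluon",
        C="W boson",
        D="Graviton",
    )
)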
82 changes: 82 additions & 0 deletions tests/data_utils_tests/data_tests.py
@@ -19,6 +19,8 @@
RunPythonTransform,
SequenceTransform,
TokenCounterTransform,
ShuffleColumnsTransform,
ColumnMatchMapTransform,
)


@@ -146,6 +148,86 @@ def test_mm_loader(self):
for _, model_inputs in self.data_loader:
self.assertTrue(isinstance(model_inputs[1][0], Image.Image))

class TestShuffleColumns(unittest.TestCase):
"""Testing the ShuffleColumnsTransform used in MCQ benchmarks to shuffle answer choices."""
def setUp(self):
self.df = pd.DataFrame(
{
"A": [1, 2, 3, 4, 5],
"B": ["a", "b", "c", "d", "e"],
"C": [-10, -20, -30, -40, -50],
"D": ["hi", "how", "are", "you", "?"],
}
)
self.shuffle_transform = ShuffleColumnsTransform(columns=["A", "B", "C"])

def test_shuffle_columns_values(self):
# Apply the transformation twice
np.random.seed(42)
transformed_df_1 = self.shuffle_transform.transform(self.df.copy())
np.random.seed(0)
transformed_df_2 = self.shuffle_transform.transform(self.df.copy())

# Columns that should remain unchanged
unshuffled_columns = [col for col in self.df.columns if col not in self.shuffle_transform.columns]

# Ensure each row has the same set of values in the shuffled columns after both transformations
for _, row in self.df.iterrows():
original_values = set(row[self.shuffle_transform.columns])

# Get the transformed row values for both shuffles
transformed_values_1 = set(transformed_df_1.loc[row.name, self.shuffle_transform.columns])
transformed_values_2 = set(transformed_df_2.loc[row.name, self.shuffle_transform.columns])

# Check that each transformed row has the same set of values as the original
self.assertEqual(original_values, transformed_values_1)
self.assertEqual(original_values, transformed_values_2)

# Verify that the order is different between the two shuffles
self.assertNotEqual(
tuple(transformed_df_1.loc[row.name, self.shuffle_transform.columns]),
tuple(transformed_df_2.loc[row.name, self.shuffle_transform.columns]),
)

# Ensure unshuffled columns remain the same in both transformations
for col in unshuffled_columns:
pd.testing.assert_series_equal(self.df[col], transformed_df_1[col], check_exact=True)
pd.testing.assert_series_equal(self.df[col], transformed_df_2[col], check_exact=True)

class TestColMatchMap(unittest.TestCase):
"""
Testing the ColumnMatchMapTransform used in MCQ benchmarks to store the letter of the correct
answer choice.
"""
def setUp(self):
# Seed the random number generator for reproducibility
np.random.seed(42)

# Sample DataFrame
self.values = [
{
"df": pd.DataFrame(
{
"A": [1, 2, 3, 4, "e"],
"B": ["a", "b", "c", "d", "e"],
"C": ["a", -20, -30, "d", -50],
"D": ["hi", "b", "c", "you", "?"],
}
),
"cols": ["A", "C", "D"],
"key_col": "B",
"ground_truth": ["C", "D", "D", "C", "A"],
}
]

def test_col_match_map(self):
for val in self.values:
self.col_match_map_transform = ColumnMatchMapTransform(
key_col=val["key_col"], new_col="ground_truth", columns=val["cols"]
)
df = val["df"]
df = self.col_match_map_transform.transform(df)
self.assertEqual(list(df["ground_truth"]), val["ground_truth"])

if __name__ == "__main__":
unittest.main()
20 changes: 19 additions & 1 deletion tests/pipeline_tests.py
@@ -28,6 +28,7 @@
SPATIAL_MAP_TEXTONLY_PIPELINE,
SPATIAL_REASONING_SINGLE_PIPELINE,
VISUAL_PROMPTING_SINGLE_PIPELINE,
GPQA_Experiment_Pipeline,
Drop_Experiment_Pipeline,
IFEval_PIPELINE,
MetricConfig,
@@ -281,12 +282,24 @@ def configure_pipeline(self, resume_from=None):
self.data_processing_comp.data_reader_config.init_args["split"] = "dev"
self.data_processing_comp.data_reader_config.init_args["tasks"] = ["Math"]

- self.inference_comp.data_loader_config.class_name = TestMMDataLoader
+ self.inference_comp.data_loader_config.class_name = TestDataLoader
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config


class TEST_GPQA_PIPELINE(GPQA_Experiment_Pipeline):
# Test config for the GPQA benchmark with TestModel and TestDataLoader
def configure_pipeline(self):
config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
self.inference_comp.data_loader_config.class_name = TestDataLoader
self.inference_comp.data_loader_config.init_args = {
"path": os.path.join(self.data_processing_comp.output_dir, "transformed_data.jsonl"),
"n_iter": N_ITER,
}
return config

class TEST_DROP_PIPELINE(Drop_Experiment_Pipeline):
# Test config for the Drop benchmark with TestModel and TestDataLoader
def configure_pipeline(self):
config = super().configure_pipeline(model_config=ModelConfig(GenericTestModel, {}))
self.inference_comp.data_loader_config.class_name = TestDataLoader
@@ -457,6 +470,11 @@ class KITAB_ONE_BOOK_CONSTRAINT_PIPELINE_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_KITAB_ONE_BOOK_CONSTRAINT_PIPELINE().pipeline_config

@unittest.skipIf("skip_tests_with_missing_ds" in os.environ, "Missing public dataset. TODO: revert")
class GPQA_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_GPQA_PIPELINE().pipeline_config


class DROP_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
