From abf6c03adc1516611f40c995a15e8d959117d681 Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Tue, 17 Dec 2024 11:39:26 -0800
Subject: [PATCH 1/7] add tagging

---
 .../aime_templates/Template_tag1.jinja        | 19 ++++++++++++++++++
 eureka_ml_insights/user_configs/__init__.py   |  1 +
 eureka_ml_insights/user_configs/aime.py       | 20 +++++++++++++++++++
 3 files changed, 40 insertions(+)
 create mode 100644 eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja

diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
new file mode 100644
index 0000000..056c1ac
--- /dev/null
+++ b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
@@ -0,0 +1,19 @@
+You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math category it falls into.
+
+Your judgment should be one of the following:
+
+arithmetic
+algebra
+counting
+geometry
+number theory
+probability
+other topics
+
+Do not generate any other texts except one of the above topics.
+
+----------
+Original question:
+{{prompt}}
+----------
+Your judgment:
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
index 4061a27..e1a48f9 100644
--- a/eureka_ml_insights/user_configs/__init__.py
+++ b/eureka_ml_insights/user_configs/__init__.py
@@ -7,6 +7,7 @@
     AIME_PIPELINE256Run,
     AIME_PIPELINE512Run,
     AIME_PIPELINE1024Run,
+    AIME_PIPELINETag,
 )
 from .dna import DNA_PIPELINE
 from .drop import Drop_Experiment_Pipeline
diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 5b85b73..40669e1 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -23,6 +23,7 @@
     MajorityVoteTransform,
     MultiplyTransform,
     SequenceTransform,
+    SamplerTransform,
 )
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
@@ -312,3 +313,22 @@ def configure_pipeline(
             MultiplyTransform(n_repeats=1024)
         )
         return pipeline
+
+
+class AIME_PIPELINETag(AIME_PIPELINE):
+    """This class specifies the config for tagging each AIME question with the math category it falls into"""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
+            SamplerTransform(random_seed=0,
+                             sample_count=10,
+                              )
+        )
+        # data preprocessing
+        self.data_processing_comp.prompt_template_path=os.path.join(
+                os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"
+            )
+        return pipeline
\ No newline at end of file

From 77d1103a318761f04eeb2886310fcae72ab3d0ec Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Wed, 18 Dec 2024 08:27:07 -0800
Subject: [PATCH 2/7] add new metric

---
 eureka_ml_insights/configs/model_configs.py | 236 +++++++++++++++++++-
 eureka_ml_insights/metrics/__init__.py      |   2 +
 eureka_ml_insights/metrics/aime_metrics.py  |  20 ++
 eureka_ml_insights/user_configs/aime.py     |   8 +-
 4 files changed, 259 insertions(+), 7 deletions(-)
 create mode 100644 eureka_ml_insights/metrics/aime_metrics.py

diff --git a/eureka_ml_insights/configs/model_configs.py b/eureka_ml_insights/configs/model_configs.py
index 1b83a7b..8f1e023 100644
--- a/eureka_ml_insights/configs/model_configs.py
+++ b/eureka_ml_insights/configs/model_configs.py
@@ -4,6 +4,7 @@
 
 from eureka_ml_insights.models import (
     AzureOpenAIO1Model,
+    AzureOpenAIModel,
     ClaudeModel,
     DirectOpenAIModel,
     DirectOpenAIO1Model,
@@ -13,7 +14,7 @@
     LLaVAModel,
     MistralServerlessAzureRestEndpointModel,
     RestEndpointModel,
-    TestModel,
+    #TnRModels,
 )
 
 from .config import ModelConfig
@@ -22,17 +23,22 @@
 # in the secret_key_params dictionary. OR you can provide the key name and key vault URL to fetch the key from Azure Key Vault.
 # You don't need to provide both the key_vault_url and local_keys_path. You can provide one of them based on your setup.
 
-
-# Test model
-TEST_MODEL_CONFIG = ModelConfig(TestModel, {})
-
 # OpenAI models
 
+'''
 OPENAI_SECRET_KEY_PARAMS = {
     "key_name": "your_openai_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
+'''
+
+OPENAI_SECRET_KEY_PARAMS = {
+    "key_name": "openai",
+    "local_keys_path": "keys/aifeval-vault-azure-net.json",
+    "key_vault_url": "https://aifeval.vault.azure.net",
+}
+
 
 OAI_O1_PREVIEW_CONFIG = ModelConfig(
     DirectOpenAIO1Model,
@@ -42,6 +48,14 @@
     },
 )
 
+OAI_O1_MINI_CONFIG = ModelConfig(
+    DirectOpenAIO1Model,
+    {
+        "model_name": "o1-mini-2024-09-12",
+        "secret_key_params": OPENAI_SECRET_KEY_PARAMS,
+    },
+)
+
 OAI_O1_PREVIEW_AUZRE_CONFIG = ModelConfig(
     AzureOpenAIO1Model,
     {
@@ -91,12 +105,110 @@
     },
 )
 
+# Azure OAI models
+## Azure OAI models -- TNR Models
+
+TNR_SECRET_KEY_PARAMS = {
+    "key_name": "tnrllmproxy",
+    "local_keys_path": "keys/aifeval-vault-azure-net.json",
+    "key_vault_url": "https://aifeval.vault.azure.net",
+}
+
+GCRAOAI8SW1_AZURE_OAI_O1_PREVIEW_CONFIG = ModelConfig(
+    AzureOpenAIO1Model,
+    {
+        "url": "https://gcraoai8sw1.openai.azure.com/",
+        "model_name": "o1-preview",
+        "api_version": "2024-08-01-preview",
+    }
+)
+
+GCRAOAI8SW1_AZURE_OAI_O1_MINI_CONFIG = ModelConfig(
+    AzureOpenAIO1Model,
+    {
+        "url": "https://gcraoai8sw1.openai.azure.com/",
+        "model_name": "o1-mini",
+        "api_version": "2024-08-01-preview",
+    }
+)
+
+GCRAOAI8SW1_AZURE_OAI_GPT4O_CONFIG = ModelConfig(
+    AzureOpenAIO1Model,
+    {
+        "url": "https://gcraoai8sw1.openai.azure.com/",
+        "model_name": "gpt-4o",
+        "api_version": "2024-08-01-preview",
+        "temperature": 1.0,
+
+    }
+)
+
+
+GCRAOAI8SW1_AZURE_OAI_GPT4_T1_CONFIG = ModelConfig(
+    AzureOpenAIO1Model,
+    {
+        "url": "https://gcraoai8sw1.openai.azure.com/",
+        "model_name": "gpt-4",
+        "api_version": "2024-08-01-preview",
+        "temperature": 1.0,
+
+    }
+)
+
+AzureOpenAIModel
+
+"""
+TNR_GPT4_1106_PREVIEW_CONFIG = ModelConfig(
+    TnRModels,
+    {
+        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
+        "secret_key_params": TNR_SECRET_KEY_PARAMS,
+        "model_name": "gpt-4",
+    },
+)
+
+TNR_GPT4_VISION_PREVIEW_CONFIG = ModelConfig(
+    TnRModels,
+    {
+        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
+        "secret_key_params": TNR_SECRET_KEY_PARAMS,
+        "model_name": "gpt-4-turbo-v",
+    },
+)
+
+TNR_GPT4V_TURBO_2024_04_09_CONFIG = ModelConfig(
+    TnRModels,
+    {
+        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
+        "secret_key_params": TNR_SECRET_KEY_PARAMS,
+        "model_name": "gpt-4-turbo",
+    },
+)
+
+TNR_GPT4O_2024_05_13_CONFIG = ModelConfig(
+    TnRModels,
+    {
+        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
+        "secret_key_params": TNR_SECRET_KEY_PARAMS,
+        "model_name": "gpt-4o",
+    },
+)
+"""
+
 # Gemini models
+'''
 GEMINI_SECRET_KEY_PARAMS = {
     "key_name": "your_gemini_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
+'''
+
+GEMINI_SECRET_KEY_PARAMS = {
+    "key_name": "aif-eval-gemini-firstproject",
+    "local_keys_path": "keys/aifeval-vault-azure-net.json",
+    "key_vault_url": "https://aifeval.vault.azure.net",
+}
 
 GEMINI_V15_PRO_CONFIG = ModelConfig(
     GeminiModel,
@@ -106,6 +218,37 @@
     },
 )
 
+
+GEMINI_V15_PRO_T1_CONFIG = ModelConfig(
+    GeminiModel,
+    {
+        "model_name": "gemini-1.5-pro",
+        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
+        "temperature":1.0,
+    },
+)
+
+GEMINI_EXP_1206_T1_CONFIG = ModelConfig(
+    GeminiModel,
+    {
+        "model_name": "gemini-exp-1206",
+        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
+        "temperature":1.0,
+    },
+)
+
+
+GEMINI_EXP_1121_T1_CONFIG = ModelConfig(
+    GeminiModel,
+    {
+        "model_name": "gemini-exp-1121",
+        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
+        "temperature":1.0,
+    },
+)
+
+
+
 GEMINI_V1_PRO_CONFIG = ModelConfig(
     GeminiModel,
     {
@@ -115,11 +258,20 @@
 )
 
 # Claude models
+'''
 CLAUDE_SECRET_KEY_PARAMS = {
     "key_name": "your_claude_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
+'''
+
+CLAUDE_SECRET_KEY_PARAMS = {
+    "key_name": "aif-eval-claude",
+    "local_keys_path": "keys/aifeval-vault-azure-net.json",
+    "key_vault_url": "https://aifeval.vault.azure.net",
+}
+
 
 CLAUDE_3_OPUS_CONFIG = ModelConfig(
     ClaudeModel,
@@ -137,6 +289,32 @@
     },
 )
 
+CLAUDE_3_5_SONNET_T1_CONFIG = ModelConfig(
+    ClaudeModel,
+    {
+        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
+        "model_name": "claude-3-5-sonnet-20240620",
+        "temperature":1.0,
+    },
+)
+
+CLAUDE_3_5_SONNET_SEARCH_T1_CONFIG = ModelConfig(
+    ClaudeModel,
+    {
+        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
+        "model_name": "claude-3-5-sonnet-20241022",
+        "temperature": 1.0,
+    },
+)
+
+CLAUDE_3_5_SONNET_SEARCH_CONFIG = ModelConfig(
+    ClaudeModel,
+    {
+        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
+        "model_name": "claude-3-5-sonnet-20241022",
+    },
+)
+
 # LLAVA models
 LLAVAHF_V16_34B_CONFIG = ModelConfig(
     LLaVAHuggingFaceModel,
@@ -199,3 +377,51 @@
         "model_name": "Mistral-large-2407",
     },
 )
+
+
+
+AIF_NT_MISTRAL_LARGE_2_2407_T1_CONFIG = ModelConfig(
+    MistralServerlessAzureRestEndpointModel,
+    {
+        "url": "https://Mistral-large-2407-aifeval.eastus.models.ai.azure.com/v1/chat/completions",
+        "secret_key_params": {
+            "key_name": "aif-nt-mistral-large-2-2407",
+            "local_keys_path": "keys/aifeval-vault-azure-net.json",
+            "key_vault_url": "https://aifeval.vault.azure.net",
+        },
+        "model_name": "Mistral-large-2407-aifeval",
+                "temperature": 1.0,
+
+    },
+)
+
+
+GCR_LLAMA3_1_70B_INSTRUCT_CONFIG = ModelConfig(
+    RestEndpointModel,
+    {
+        "url": "https://gcr-llama31-70b-instruct.westus3.inference.ml.azure.com/score",
+        "secret_key_params": {
+            "key_name": "meta-llama-3-1-70b-instruct-1",
+            "local_keys_path": "keys/aifeval-vault-azure-net.json",
+            "key_vault_url": "https://aifeval.vault.azure.net",
+        },
+        "model_name": "meta-llama-3-1-70b-instruct-1",
+                        "temperature": 1.0,
+
+    },
+)
+
+AIF_NT_LLAMA3_1_405B_INSTRUCT_CONFIG = ModelConfig(
+    LlamaServerlessAzureRestEndpointModel,
+    {
+        "url": "https://Meta-Llama-3-1-405B-Instruct-aif.eastus.models.ai.azure.com/v1/chat/completions",
+        "secret_key_params": {
+            "key_name": "aif-nt-meta-llama-3-1-405b-instruct-1",
+            "local_keys_path": "keys/aifeval-vault-azure-net.json",
+            "key_vault_url": "https://aifeval.vault.azure.net",
+        },
+        "model_name": "Meta-Llama-3-1-405B-Instruct-aif",
+                        "temperature": 1.0,
+
+    },
+)
diff --git a/eureka_ml_insights/metrics/__init__.py b/eureka_ml_insights/metrics/__init__.py
index 2b19ec1..e31b08a 100644
--- a/eureka_ml_insights/metrics/__init__.py
+++ b/eureka_ml_insights/metrics/__init__.py
@@ -28,6 +28,7 @@
     SpatialAndLayoutReasoningMetric,
 )
 
+from .aime_metrics import NumericMatch
 __all__ = [
     Metric,
     ClassicMetric,
@@ -52,4 +53,5 @@
     SumAggregator,
     MMMUMetric,
     MaxTokenF1ScoreMetric,
+    NumericMatch,
 ]
diff --git a/eureka_ml_insights/metrics/aime_metrics.py b/eureka_ml_insights/metrics/aime_metrics.py
new file mode 100644
index 0000000..8106bb6
--- /dev/null
+++ b/eureka_ml_insights/metrics/aime_metrics.py
@@ -0,0 +1,20 @@
+from tqdm.auto import tqdm
+
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+import numpy as np
+
+class NumericMatch(ClassicMetric):
+    """Score an answer as correct/incorrect by numeric closeness to the target; "none" when it cannot be scored."""
+    eps = 1e-6  # absolute tolerance: |target - answer| must be strictly below this to count as a match
+
+    def __evaluate__(self, answer_text, target_text, is_valid):
+        # answer_text and target_text are parsed with float(); a falsy is_valid short-circuits to "none".
+        if not is_valid:
+            return "none"
+        try:
+            diff = np.abs(float(target_text) - float(answer_text))
+        except (TypeError, ValueError, OverflowError):  # unparsable/missing value; don't swallow KeyboardInterrupt
+            return "none"
+        # A NaN difference compares False against eps and is therefore reported as "incorrect".
+        return "correct" if diff < self.eps else "incorrect"
\ No newline at end of file
diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 40669e1..23366e8 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -28,6 +28,8 @@
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
 from eureka_ml_insights.metrics.metrics_base import ExactMatch
+from eureka_ml_insights.metrics.aime_metrics import NumericMatch
+
 from eureka_ml_insights.metrics.reports import (
     BiLevelCountAggregator,
     CountAggregator,
@@ -115,7 +117,7 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     CountAggregator,
@@ -172,7 +174,7 @@ def configure_pipeline(
                     "format": ".jsonl",
                 },
             ),
-            metric_config=MetricConfig(ExactMatch),
+            metric_config=MetricConfig(NumericMatch),
             aggregator_configs=[
                 AggregatorConfig(
                     BiLevelCountAggregator,
@@ -322,11 +324,13 @@ def configure_pipeline(
         self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
     ) -> PipelineConfig:
         pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        '''
         self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
             SamplerTransform(random_seed=0,
                              sample_count=10,
                               )
         )
+        '''
         # data preprocessing
         self.data_processing_comp.prompt_template_path=os.path.join(
                 os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"

From 2fe709537bfd85cf365ab76b91adb6d5ee30448f Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Wed, 18 Dec 2024 09:10:37 -0800
Subject: [PATCH 3/7] remove sampling

---
 eureka_ml_insights/user_configs/aime.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 23366e8..cd7b442 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -123,10 +123,10 @@ def configure_pipeline(
                     CountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "group_by": "Year",
-                        "filename_base": "ExactMatch_GroupBy",
+                        "filename_base": "NumericMatch_GroupBy",
                     },
                 ),
             ],
@@ -180,7 +180,7 @@ def configure_pipeline(
                     BiLevelCountAggregator,
                     {
                         "column_names": [
-                            "ExactMatch_result",
+                            "NumericMatch_result",
                         ],
                         "first_groupby": "ID",
                         "filename_base": "MajorityVote",

From bd721690d5d07ccd691b0e5c6192ab0c05140f42 Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Wed, 18 Dec 2024 09:16:03 -0800
Subject: [PATCH 4/7] back to original model configs

---
 eureka_ml_insights/configs/model_configs.py | 236 +-------------------
 eureka_ml_insights/user_configs/aime.py     |   1 -
 2 files changed, 5 insertions(+), 232 deletions(-)

diff --git a/eureka_ml_insights/configs/model_configs.py b/eureka_ml_insights/configs/model_configs.py
index 8f1e023..1b83a7b 100644
--- a/eureka_ml_insights/configs/model_configs.py
+++ b/eureka_ml_insights/configs/model_configs.py
@@ -4,7 +4,6 @@
 
 from eureka_ml_insights.models import (
     AzureOpenAIO1Model,
-    AzureOpenAIModel,
     ClaudeModel,
     DirectOpenAIModel,
     DirectOpenAIO1Model,
@@ -14,7 +13,7 @@
     LLaVAModel,
     MistralServerlessAzureRestEndpointModel,
     RestEndpointModel,
-    #TnRModels,
+    TestModel,
 )
 
 from .config import ModelConfig
@@ -23,22 +22,17 @@
 # in the secret_key_params dictionary. OR you can provide the key name and key vault URL to fetch the key from Azure Key Vault.
 # You don't need to provide both the key_vault_url and local_keys_path. You can provide one of them based on your setup.
 
+
+# Test model
+TEST_MODEL_CONFIG = ModelConfig(TestModel, {})
+
 # OpenAI models
 
-'''
 OPENAI_SECRET_KEY_PARAMS = {
     "key_name": "your_openai_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
-'''
-
-OPENAI_SECRET_KEY_PARAMS = {
-    "key_name": "openai",
-    "local_keys_path": "keys/aifeval-vault-azure-net.json",
-    "key_vault_url": "https://aifeval.vault.azure.net",
-}
-
 
 OAI_O1_PREVIEW_CONFIG = ModelConfig(
     DirectOpenAIO1Model,
@@ -48,14 +42,6 @@
     },
 )
 
-OAI_O1_MINI_CONFIG = ModelConfig(
-    DirectOpenAIO1Model,
-    {
-        "model_name": "o1-mini-2024-09-12",
-        "secret_key_params": OPENAI_SECRET_KEY_PARAMS,
-    },
-)
-
 OAI_O1_PREVIEW_AUZRE_CONFIG = ModelConfig(
     AzureOpenAIO1Model,
     {
@@ -105,110 +91,12 @@
     },
 )
 
-# Azure OAI models
-## Azure OAI models -- TNR Models
-
-TNR_SECRET_KEY_PARAMS = {
-    "key_name": "tnrllmproxy",
-    "local_keys_path": "keys/aifeval-vault-azure-net.json",
-    "key_vault_url": "https://aifeval.vault.azure.net",
-}
-
-GCRAOAI8SW1_AZURE_OAI_O1_PREVIEW_CONFIG = ModelConfig(
-    AzureOpenAIO1Model,
-    {
-        "url": "https://gcraoai8sw1.openai.azure.com/",
-        "model_name": "o1-preview",
-        "api_version": "2024-08-01-preview",
-    }
-)
-
-GCRAOAI8SW1_AZURE_OAI_O1_MINI_CONFIG = ModelConfig(
-    AzureOpenAIO1Model,
-    {
-        "url": "https://gcraoai8sw1.openai.azure.com/",
-        "model_name": "o1-mini",
-        "api_version": "2024-08-01-preview",
-    }
-)
-
-GCRAOAI8SW1_AZURE_OAI_GPT4O_CONFIG = ModelConfig(
-    AzureOpenAIO1Model,
-    {
-        "url": "https://gcraoai8sw1.openai.azure.com/",
-        "model_name": "gpt-4o",
-        "api_version": "2024-08-01-preview",
-        "temperature": 1.0,
-
-    }
-)
-
-
-GCRAOAI8SW1_AZURE_OAI_GPT4_T1_CONFIG = ModelConfig(
-    AzureOpenAIO1Model,
-    {
-        "url": "https://gcraoai8sw1.openai.azure.com/",
-        "model_name": "gpt-4",
-        "api_version": "2024-08-01-preview",
-        "temperature": 1.0,
-
-    }
-)
-
-AzureOpenAIModel
-
-"""
-TNR_GPT4_1106_PREVIEW_CONFIG = ModelConfig(
-    TnRModels,
-    {
-        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
-        "secret_key_params": TNR_SECRET_KEY_PARAMS,
-        "model_name": "gpt-4",
-    },
-)
-
-TNR_GPT4_VISION_PREVIEW_CONFIG = ModelConfig(
-    TnRModels,
-    {
-        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
-        "secret_key_params": TNR_SECRET_KEY_PARAMS,
-        "model_name": "gpt-4-turbo-v",
-    },
-)
-
-TNR_GPT4V_TURBO_2024_04_09_CONFIG = ModelConfig(
-    TnRModels,
-    {
-        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
-        "secret_key_params": TNR_SECRET_KEY_PARAMS,
-        "model_name": "gpt-4-turbo",
-    },
-)
-
-TNR_GPT4O_2024_05_13_CONFIG = ModelConfig(
-    TnRModels,
-    {
-        "url": "https://trapi.research.microsoft.com/gcr/shared/nj/",
-        "secret_key_params": TNR_SECRET_KEY_PARAMS,
-        "model_name": "gpt-4o",
-    },
-)
-"""
-
 # Gemini models
-'''
 GEMINI_SECRET_KEY_PARAMS = {
     "key_name": "your_gemini_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
-'''
-
-GEMINI_SECRET_KEY_PARAMS = {
-    "key_name": "aif-eval-gemini-firstproject",
-    "local_keys_path": "keys/aifeval-vault-azure-net.json",
-    "key_vault_url": "https://aifeval.vault.azure.net",
-}
 
 GEMINI_V15_PRO_CONFIG = ModelConfig(
     GeminiModel,
@@ -218,37 +106,6 @@
     },
 )
 
-
-GEMINI_V15_PRO_T1_CONFIG = ModelConfig(
-    GeminiModel,
-    {
-        "model_name": "gemini-1.5-pro",
-        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
-        "temperature":1.0,
-    },
-)
-
-GEMINI_EXP_1206_T1_CONFIG = ModelConfig(
-    GeminiModel,
-    {
-        "model_name": "gemini-exp-1206",
-        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
-        "temperature":1.0,
-    },
-)
-
-
-GEMINI_EXP_1121_T1_CONFIG = ModelConfig(
-    GeminiModel,
-    {
-        "model_name": "gemini-exp-1121",
-        "secret_key_params": GEMINI_SECRET_KEY_PARAMS,
-        "temperature":1.0,
-    },
-)
-
-
-
 GEMINI_V1_PRO_CONFIG = ModelConfig(
     GeminiModel,
     {
@@ -258,20 +115,11 @@
 )
 
 # Claude models
-'''
 CLAUDE_SECRET_KEY_PARAMS = {
     "key_name": "your_claude_secret_key_name",
     "local_keys_path": "keys/keys.json",
     "key_vault_url": None,
 }
-'''
-
-CLAUDE_SECRET_KEY_PARAMS = {
-    "key_name": "aif-eval-claude",
-    "local_keys_path": "keys/aifeval-vault-azure-net.json",
-    "key_vault_url": "https://aifeval.vault.azure.net",
-}
-
 
 CLAUDE_3_OPUS_CONFIG = ModelConfig(
     ClaudeModel,
@@ -289,32 +137,6 @@
     },
 )
 
-CLAUDE_3_5_SONNET_T1_CONFIG = ModelConfig(
-    ClaudeModel,
-    {
-        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
-        "model_name": "claude-3-5-sonnet-20240620",
-        "temperature":1.0,
-    },
-)
-
-CLAUDE_3_5_SONNET_SEARCH_T1_CONFIG = ModelConfig(
-    ClaudeModel,
-    {
-        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
-        "model_name": "claude-3-5-sonnet-20241022",
-        "temperature": 1.0,
-    },
-)
-
-CLAUDE_3_5_SONNET_SEARCH_CONFIG = ModelConfig(
-    ClaudeModel,
-    {
-        "secret_key_params": CLAUDE_SECRET_KEY_PARAMS,
-        "model_name": "claude-3-5-sonnet-20241022",
-    },
-)
-
 # LLAVA models
 LLAVAHF_V16_34B_CONFIG = ModelConfig(
     LLaVAHuggingFaceModel,
@@ -377,51 +199,3 @@
         "model_name": "Mistral-large-2407",
     },
 )
-
-
-
-AIF_NT_MISTRAL_LARGE_2_2407_T1_CONFIG = ModelConfig(
-    MistralServerlessAzureRestEndpointModel,
-    {
-        "url": "https://Mistral-large-2407-aifeval.eastus.models.ai.azure.com/v1/chat/completions",
-        "secret_key_params": {
-            "key_name": "aif-nt-mistral-large-2-2407",
-            "local_keys_path": "keys/aifeval-vault-azure-net.json",
-            "key_vault_url": "https://aifeval.vault.azure.net",
-        },
-        "model_name": "Mistral-large-2407-aifeval",
-                "temperature": 1.0,
-
-    },
-)
-
-
-GCR_LLAMA3_1_70B_INSTRUCT_CONFIG = ModelConfig(
-    RestEndpointModel,
-    {
-        "url": "https://gcr-llama31-70b-instruct.westus3.inference.ml.azure.com/score",
-        "secret_key_params": {
-            "key_name": "meta-llama-3-1-70b-instruct-1",
-            "local_keys_path": "keys/aifeval-vault-azure-net.json",
-            "key_vault_url": "https://aifeval.vault.azure.net",
-        },
-        "model_name": "meta-llama-3-1-70b-instruct-1",
-                        "temperature": 1.0,
-
-    },
-)
-
-AIF_NT_LLAMA3_1_405B_INSTRUCT_CONFIG = ModelConfig(
-    LlamaServerlessAzureRestEndpointModel,
-    {
-        "url": "https://Meta-Llama-3-1-405B-Instruct-aif.eastus.models.ai.azure.com/v1/chat/completions",
-        "secret_key_params": {
-            "key_name": "aif-nt-meta-llama-3-1-405b-instruct-1",
-            "local_keys_path": "keys/aifeval-vault-azure-net.json",
-            "key_vault_url": "https://aifeval.vault.azure.net",
-        },
-        "model_name": "Meta-Llama-3-1-405B-Instruct-aif",
-                        "temperature": 1.0,
-
-    },
-)
diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index cd7b442..2d7eefb 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -23,7 +23,6 @@
     MajorityVoteTransform,
     MultiplyTransform,
     SequenceTransform,
-    SamplerTransform,
 )
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader

From e4cb82025f2e623f80afb0f75472ffa7363d32ca Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Wed, 18 Dec 2024 17:07:47 -0800
Subject: [PATCH 5/7] update the tagging prompt

---
 .../aime_templates/Template_tag1.jinja        |  6 +++---
 eureka_ml_insights/user_configs/aime.py       | 20 +++++++------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
index 056c1ac..325e8c1 100644
--- a/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
+++ b/eureka_ml_insights/prompt_templates/aime_templates/Template_tag1.jinja
@@ -1,6 +1,6 @@
-You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math category it falls into.
+You are a genius math expert in understanding math questions. Please read the following math question, and then decide which math categories it falls into.
 
-Your judgment should be one of the following:
+Your judgment should be one or more of the following:
 
 arithmetic
 algebra
@@ -10,7 +10,7 @@ number theory
 probability
 other topics
 
-Do not generate any other texts except one of the above topics.
+Do not generate any other texts except one or more of the above topics. For multiple topics, separate them by commas.
 
 ----------
 Original question:
diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 2d7eefb..4f314e4 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -26,9 +26,7 @@
 )
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
-from eureka_ml_insights.metrics.metrics_base import ExactMatch
 from eureka_ml_insights.metrics.aime_metrics import NumericMatch
-
 from eureka_ml_insights.metrics.reports import (
     BiLevelCountAggregator,
     CountAggregator,
@@ -323,15 +321,11 @@ def configure_pipeline(
         self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
     ) -> PipelineConfig:
         pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
-        '''
-        self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
-            SamplerTransform(random_seed=0,
-                             sample_count=10,
-                              )
-        )
-        '''
         # data preprocessing
-        self.data_processing_comp.prompt_template_path=os.path.join(
-                os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"
-            )
-        return pipeline
\ No newline at end of file
+        self.data_processing_comp.prompt_template_path = os.path.join(
+            os.path.dirname(__file__), "../prompt_templates/aime_templates/Template_tag1.jinja"
+        )
+        # Each query is tagged with one or more topics: arithmetic, algebra, counting, geometry, number theory, probability, or other topics.
+        # These topics follow the description on the AoPS wiki: https://artofproblemsolving.com/wiki/index.php/American_Invitational_Mathematics_Examination
+        # In their own words, "The AIME tests mathematical problem solving with arithmetic, algebra, counting, geometry, number theory, and probability and other secondary school math topics"
+        return pipeline

From 551341ab4902ccfcf5c9c4e293b62654c17c1533 Mon Sep 17 00:00:00 2001
From: lchen001 <lingjiaochen001@hotmail.com>
Date: Fri, 17 Jan 2025 11:02:52 -0800
Subject: [PATCH 6/7] add direct run prompt

---
 .../aime_templates/Template_1direct.jinja        |  5 +++++
 eureka_ml_insights/user_configs/__init__.py      |  3 +++
 eureka_ml_insights/user_configs/aime.py          | 16 +++++++++++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja

diff --git a/eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja b/eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja
new file mode 100644
index 0000000..7804716
--- /dev/null
+++ b/eureka_ml_insights/prompt_templates/aime_templates/Template_1direct.jinja
@@ -0,0 +1,5 @@
+You are a genius math graduate student solving math problems from the AIME competition. 
+
+Provide your final answer in the format: 'Final Answer: [numeric value]'. Don't box it, just provide the answer directly at the end.
+
+{{prompt}}
diff --git a/eureka_ml_insights/user_configs/__init__.py b/eureka_ml_insights/user_configs/__init__.py
index e1a48f9..36545ac 100644
--- a/eureka_ml_insights/user_configs/__init__.py
+++ b/eureka_ml_insights/user_configs/__init__.py
@@ -4,9 +4,12 @@
     AIME_PIPELINE16Run,
     AIME_PIPELINE32Run,
     AIME_PIPELINE64Run,
+    AIME_PIPELINE128Run,
     AIME_PIPELINE256Run,
     AIME_PIPELINE512Run,
     AIME_PIPELINE1024Run,
+    AIME_PIPELINE5Run,
+    AIME_PIPELINEDirect5Run,
     AIME_PIPELINETag,
 )
 from .dna import DNA_PIPELINE
diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 4f314e4..3fdecc8 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -23,6 +23,7 @@
     MajorityVoteTransform,
     MultiplyTransform,
     SequenceTransform,
+    SamplerTransform
 )
 from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
 from eureka_ml_insights.data_utils.data import DataLoader
@@ -78,7 +79,7 @@ def configure_pipeline(
             ),
             output_dir=os.path.join(self.log_dir, "inference_result"),
             resume_from=resume_from,
-            max_concurrent=10,
+            max_concurrent=1,
         )
         # post process the response to extract the answer
         self.data_post_processing = DataProcessingConfig(
@@ -215,6 +216,19 @@ def configure_pipeline(
         )
         return pipeline
 
+class AIME_PIPELINEDirect5Run(AIME_PIPELINE5Run):
+    """Variant of AIME_PIPELINE5Run whose prompts are rendered from Template_1direct.jinja."""
+
+    def configure_pipeline(
+        self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
+    ) -> PipelineConfig:
+        # Let the parent build the pipeline first, then repoint prompt processing at the direct template.
+        base_pipeline = super().configure_pipeline(model_config=model_config, resume_from=resume_from)
+        template_rel_path = "../prompt_templates/aime_templates/Template_1direct.jinja"
+        config_dir = os.path.dirname(__file__)
+        self.data_processing_comp.prompt_template_path = os.path.join(config_dir, template_rel_path)
+        return base_pipeline
+
 
 class AIME_PIPELINE16Run(AIME_PIPELINE):
     """This class specifies the config for running AIME benchmark 5 repeated times"""

From 392b55909db18d344bbd553188fb4af1c5570439 Mon Sep 17 00:00:00 2001
From: Lingjiao Chen <lingjiaochen@microsoft.com>
Date: Wed, 12 Feb 2025 15:28:23 -0800
Subject: [PATCH 7/7] split majority vote performance by year

---
 eureka_ml_insights/user_configs/aime.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/eureka_ml_insights/user_configs/aime.py b/eureka_ml_insights/user_configs/aime.py
index 3fdecc8..029f84d 100644
--- a/eureka_ml_insights/user_configs/aime.py
+++ b/eureka_ml_insights/user_configs/aime.py
@@ -59,6 +59,7 @@ def configure_pipeline(
                                     "Answer": "ground_truth",
                                 }
                             ),
+                            #SamplerTransform(sample_count=10,random_seed=0),
                         ],
                     ),
                 },
@@ -185,6 +186,18 @@ def configure_pipeline(
                         "normalize": True,
                     },
                 ),
+                AggregatorConfig(
+                    BiLevelCountAggregator,
+                    {
+                        "column_names": [
+                            "NumericMatch_result",
+                        ],
+                        "first_groupby": "ID",
+                        "second_groupby": "Year",
+                        "filename_base": "MajorityVote_byyear",
+                        "normalize": True,
+                    },
+                ),
             ],
             output_dir=os.path.join(self.log_dir, "eval_report_majorityVote"),
         )