Commit

Added two new experiment bots

CodexVeritas committed Dec 21, 2024
1 parent 374fa4c commit 8013a75

Showing 10 changed files with 189 additions and 64 deletions.

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,7 @@ async def test_response_from_a_direct_call_is_same_ask_mock_value(
"As of Aug 18 2024 Exasearcher doesn't depend on exact mock return values to validate other steps"
)

if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip("GptO1 has inconsistent reasoning token count.")

if issubclass(subclass, Claude35Sonnet):
@@ -90,7 +90,7 @@ def test_ai_model_async_is_not_blocking(subclass: type[AiModel]) -> None:
number_of_coroutines_to_run = 5
list_should_run_under_x_times_first_coroutine = 3

if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip(
"GptO1 is around 2c per call, so this test would be too expensive"
)

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.tokens_incur_cost import (
TokensIncurCost,
)
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.model_archetypes.traditional_online_llm import (
TraditionalOnlineLlm,
)
@@ -23,7 +23,7 @@
def test_predicted_cost_and_tokens_correct_for_cheap_input(
subclass: type[TokensIncurCost],
) -> None:
if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip(
"GptO1 has a weird tokenization scheme that doesn't allow for predicting tokens, and is sometimes 1 token off on small prompts. This is after following official instructions"
)
@@ -36,7 +36,7 @@ def test_system_prompt_cost_and_tokens_correct_for_cheap_input(
subclass: type[TokensIncurCost],
) -> None:
if not issubclass(subclass, TraditionalOnlineLlm) or issubclass(
subclass, GptO1
subclass, GptO1Preview
):
pytest.skip("Model doesn't have a system prompt")


1 change: 1 addition & 0 deletions code_tests/unit_tests/test_ai_models/models_to_test.py
@@ -34,6 +34,7 @@ class ModelsToTest:
Gpt4o,
Gpt4oMetaculusProxy,
Gpt4oVision,
# GptO1Preview,
GptO1,
Claude35Sonnet,
Perplexity,

44 changes: 3 additions & 41 deletions forecasting_tools/ai_models/gpto1.py
@@ -1,43 +1,5 @@
from typing import Any, Final
from forecasting_tools.ai_models.gpto1preview import GptO1Preview

from forecasting_tools.ai_models.ai_utils.response_types import (
TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
OpenAiTextToTextModel,
)


class GptO1(OpenAiTextToTextModel):
# See OpenAI Limit on the account dashboard for most up-to-date limit
MODEL_NAME: Final[str] = "o1-preview"
REQUESTS_PER_PERIOD_LIMIT: Final[int] = 8_000
REQUEST_PERIOD_IN_SECONDS: Final[int] = 60
TIMEOUT_TIME: Final[int] = 120
TOKENS_PER_PERIOD_LIMIT: Final[int] = 2_000_000
TOKEN_PERIOD_IN_SECONDS: Final[int] = 60

def __init__(
self,
*args: Any,
temperature: float = 1,
system_prompt: str | None = None,
**kwargs: Any,
):
assert (
system_prompt is None
), "GptO1Preview does not support system prompts"
assert (
temperature == 1
), f"GptO1Preview must have temperature 1, but {temperature} was given."
super().__init__(*args, temperature=temperature, **kwargs)

@classmethod
def _get_mock_return_for_direct_call_to_model_using_cheap_input(
cls,
) -> TextTokenCostResponse:
response = (
super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
)
response.total_tokens_used += 269 # Add reasoning tokens
return response
class GptO1(GptO1Preview):
MODEL_NAME: str = "o1"
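
With this change, GptO1 collapses to a thin subclass of GptO1Preview that only overrides MODEL_NAME to "o1", so both classes share the same rate limits, the temperature/system-prompt assertions, and the mock reasoning-token accounting. A minimal usage sketch under that assumption (the invoke call mirrors how FINAL_DECISION_LLM is used elsewhere in this commit; the comparison helper itself is illustrative and not part of the repository):

```python
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview


async def compare_o1_variants(prompt: str) -> None:
    # Both constructors enforce temperature=1 and no system prompt,
    # since those assertions now live on the shared GptO1Preview base.
    preview_model = GptO1Preview()
    full_model = GptO1()  # same behaviour, but MODEL_NAME is "o1"
    preview_answer = await preview_model.invoke(prompt)
    full_answer = await full_model.invoke(prompt)
    print(preview_answer, full_answer)


# Run with: asyncio.run(compare_o1_variants("Will it rain in London tomorrow?"))
```
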
43 changes: 43 additions & 0 deletions forecasting_tools/ai_models/gpto1preview.py
@@ -0,0 +1,43 @@
from typing import Any

from forecasting_tools.ai_models.ai_utils.response_types import (
TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
OpenAiTextToTextModel,
)


class GptO1Preview(OpenAiTextToTextModel):
# See OpenAI Limit on the account dashboard for most up-to-date limit
MODEL_NAME: str = "o1-preview"
REQUESTS_PER_PERIOD_LIMIT: int = 8_000
REQUEST_PERIOD_IN_SECONDS: int = 60
TIMEOUT_TIME: int = 120
TOKENS_PER_PERIOD_LIMIT: int = 2_000_000
TOKEN_PERIOD_IN_SECONDS: int = 60

def __init__(
self,
*args: Any,
temperature: float = 1,
system_prompt: str | None = None,
**kwargs: Any,
):
assert (
system_prompt is None
), "GptO1Preview does not support system prompts"
assert (
temperature == 1
), f"GptO1Preview must have temperature 1, but {temperature} was given."
super().__init__(*args, temperature=temperature, **kwargs)

@classmethod
def _get_mock_return_for_direct_call_to_model_using_cheap_input(
cls,
) -> TextTokenCostResponse:
response = (
super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
)
response.total_tokens_used += 269 # Add reasoning tokens
return response

@@ -0,0 +1,85 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.forecasting.forecast_bots.template_bot import (
TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
BinaryQuestion,
MetaculusQuestion,
)


class ExaQ4BinaryBot(TemplateBot):
FINAL_DECISION_LLM = Gpt4o(temperature=0.1)

async def run_research(self, question: MetaculusQuestion) -> str:
prompt = clean_indents(
f"""
You are an assistant to a superforecaster.
The superforecaster will give you a question they intend to forecast on.
To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
You do not produce forecasts yourself.
Question:
{question.question_text}
"""
)

response = await SmartSearcher(temperature=0.1).invoke(prompt)
return response

async def _run_forecast_on_binary(
self, question: BinaryQuestion, research: str
) -> ReasonedPrediction[float]:
assert isinstance(
question, BinaryQuestion
), "Question must be a BinaryQuestion"
prompt = clean_indents(
f"""
You are a professional forecaster interviewing for a job.
Your interview question is:
{question.question_text}
Background information:
{question.background_info if question.background_info else "No background information provided."}
Resolution criteria:
{question.resolution_criteria if question.resolution_criteria else "No resolution criteria provided."}
Fine print:
{question.fine_print if question.fine_print else "No fine print provided."}
Your research assistant says:
```
{research}
```
Today is {datetime.now().strftime("%Y-%m-%d")}.
Before answering you write:
(a) The time left until the outcome to the question is known.
(b) What the outcome would be if nothing changed.
(c) The most important factors that will influence a successful/unsuccessful resolution.
(d) What do you not know that should give you pause and lower confidence? Remember people are statistically overconfident.
(e) What you would forecast if you were to only use historical precedent (i.e. how often this happens in the past) without any current information.
(f) What you would forecast if there was only a quarter of the time left.
(g) What you would forecast if there was 4x the time left.
You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
"""
)
gpt_forecast = await self.FINAL_DECISION_LLM.invoke(prompt)
prediction = self._extract_forecast_from_binary_rationale(
gpt_forecast, max_prediction=0.99, min_prediction=0.01
)
return ReasonedPrediction(
prediction_value=prediction, reasoning=gpt_forecast
)
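
The bot delegates parsing to the inherited _extract_forecast_from_binary_rationale helper, whose implementation is not part of this diff. A rough, hypothetical sketch of what such an extractor has to do with the "Probability: ZZ%" convention used in the prompt above (function name and regex are illustrative, not the library's actual code):

```python
import re


def extract_probability_from_rationale(
    rationale: str, max_prediction: float = 0.99, min_prediction: float = 0.01
) -> float:
    # Hypothetical stand-in for _extract_forecast_from_binary_rationale:
    # take the last "Probability: ZZ%" occurrence in the rationale and
    # clamp it into [min_prediction, max_prediction].
    matches = re.findall(r"Probability:\s*(\d{1,3}(?:\.\d+)?)\s*%", rationale)
    if not matches:
        raise ValueError("No 'Probability: ZZ%' line found in rationale")
    probability = float(matches[-1]) / 100
    return min(max(probability, min_prediction), max_prediction)


# extract_probability_from_rationale("...rationale... Probability: 7%") -> 0.07
```
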

@@ -0,0 +1,8 @@
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
ExaQ4BinaryBot,
)


class ExaQ4BinaryO1Bot(ExaQ4BinaryBot):
FINAL_DECISION_LLM = GptO1()
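
These experiment bots swap the final-decision model purely by subclassing and overriding FINAL_DECISION_LLM, which keeps the research step and the prompt identical across variants. A hypothetical sketch of a further variant in the same style (the Claude-backed class below is illustrative only and not part of this commit):

```python
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryClaudeBot(ExaQ4BinaryBot):
    # Illustrative only: reuse ExaQ4BinaryBot's research and prompting
    # pipeline, but let Claude 3.5 Sonnet make the final probability call.
    FINAL_DECISION_LLM = Claude35Sonnet()
```
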
28 changes: 21 additions & 7 deletions forecasting_tools/forecasting/helpers/benchmarker.py
@@ -1,3 +1,5 @@
import inspect
import logging
import subprocess
import time
from datetime import datetime
@@ -27,6 +29,8 @@
MetaculusQuestion,
)

logger = logging.getLogger(__name__)


class Benchmarker:
"""
@@ -46,6 +50,7 @@ def __init__(
forecast_bots: list[ForecastBot],
number_of_questions_to_use: int,
file_path_to_save_reports: str | None = None,
concurrent_question_batch_size: int = 10,
) -> None:
self.forecast_bots = forecast_bots
self.number_of_questions_to_use = number_of_questions_to_use
@@ -56,35 +61,44 @@ def __init__(
file_path_to_save_reports += "/"
self.file_path_to_save_reports = file_path_to_save_reports
self.initialization_timestamp = datetime.now()
self.concurrent_question_batch_size = concurrent_question_batch_size

async def run_benchmark(self) -> list[BenchmarkForBot]:

questions = MetaculusApi.get_benchmark_questions(
self.number_of_questions_to_use,
random_seed=42,
# Choose a random seed so all benchmarks in a similar time period use the same questions
)

questions = typeguard.check_type(questions, list[MetaculusQuestion])
assert len(questions) == self.number_of_questions_to_use
benchmarks = [
BenchmarkForBot(

benchmarks = []
for bot in self.forecast_bots:
try:
source_code = inspect.getsource(bot.__class__)
except Exception:
logger.warning(
f"Could not get source code for {bot.__class__.__name__}"
)
source_code = None
benchmark = BenchmarkForBot(
forecast_reports=[],
forecast_bot_config=bot.get_config(),
description=f"This benchmark ran the {bot.__class__.__name__} bot on {self.number_of_questions_to_use} questions.",
name=f"Benchmark for {bot.__class__.__name__}",
time_taken_in_minutes=None,
total_cost=None,
git_commit_hash=self._get_git_commit_hash(),
code=source_code,
)
for bot in self.forecast_bots
]
benchmarks.append(benchmark)

question_batch_size = 10
for bot, benchmark in zip(self.forecast_bots, benchmarks):
with MonetaryCostManager() as cost_manager:
start_time = time.time()
for batch in self._batch_questions(
questions, question_batch_size
questions, self.concurrent_question_batch_size
):
reports = await bot.forecast_questions(batch)
reports = typeguard.check_type(

@@ -26,6 +26,7 @@ class BenchmarkForBot(BaseModel, Jsonable):
total_cost: float | None
git_commit_hash: str
forecast_bot_config: dict[str, str]
code: str | None = None
forecast_reports: list[BinaryReport | NumericReport | MultipleChoiceReport]

@property

31 changes: 21 additions & 10 deletions scripts/benchmark_forecast_bot.py
@@ -11,45 +11,56 @@
from forecasting_tools.forecasting.forecast_bots.experiments.exa_bot import (
ExaBot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.q3_template_bot import (
Q3TemplateBot,
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
ExaQ4BinaryBot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary_o1 import (
ExaQ4BinaryO1Bot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.q4_main_binary_bot import (
Q4MainBinaryBot,
)
from forecasting_tools.forecasting.forecast_bots.forecast_bot import (
ForecastBot,
)
from forecasting_tools.forecasting.forecast_bots.template_bot import (
TemplateBot,
)
from forecasting_tools.forecasting.helpers.benchmarker import Benchmarker
from forecasting_tools.util.custom_logger import CustomLogger

logger = logging.getLogger(__name__)


async def benchmark_forecast_bot() -> None:
questions_to_use = 120
questions_to_use = 2
with MonetaryCostManager() as cost_manager:
bots = [
ExaBot(),
Q4MainBinaryBot(),
Q3TemplateBot(),
TemplateBot(),
Q3TemplateBot(
ExaBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
Q4MainBinaryBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
ExaQ4BinaryBot(),
ExaQ4BinaryBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
ExaQ4BinaryO1Bot(),
]
bots = typeguard.check_type(bots, list[ForecastBot])
benchmarks = await Benchmarker(
number_of_questions_to_use=questions_to_use,
forecast_bots=bots,
file_path_to_save_reports="logs/forecasts/benchmarks/",
concurrent_question_batch_size=50,
).run_benchmark()
for i, benchmark in enumerate(benchmarks):
logger.info(f"Benchmark {i+1} of {len(benchmarks)}")
logger.info(
f"Benchmark {i+1} of {len(benchmarks)}: {benchmark.name}"
)
logger.info(
f"- Final Score: {benchmark.average_inverse_expected_log_score}"
)
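
The concurrent_question_batch_size=50 passed above feeds the batching loop in Benchmarker (the self._batch_questions(...) call in the earlier diff), whose body is not shown in this commit. A minimal sketch of what such a helper presumably does (the standalone function name is hypothetical):

```python
from typing import Iterator, TypeVar

T = TypeVar("T")


def batch_items(items: list[T], batch_size: int) -> Iterator[list[T]]:
    # Yield consecutive slices of at most batch_size items, so each slice
    # of questions can be forecast concurrently before the next one starts.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


# list(batch_items(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]
```
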