Added a few new bots in experiments and updated benchmarking (#7)
* Added new experimental bots

* Fixed the 'publish reports to metaculus' ForecastBot parameter so it actually works

* Added two new experiment bots

* Updated o1 and o1 preview in bot

* Removed o1 model from testing due to dependencies not supporting it yet
CodexVeritas authored Dec 22, 2024
1 parent 2438958 commit c3678ed
Showing 18 changed files with 519 additions and 133 deletions.

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,7 @@ async def test_response_from_a_direct_call_is_same_ask_mock_value(
"As of Aug 18 2024 Exasearcher doesn't depend on exact mock return values to validate other steps"
)

-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip("GptO1 has inconsistent reasoning token count.")

    if issubclass(subclass, Claude35Sonnet):
@@ -90,7 +90,7 @@ def test_ai_model_async_is_not_blocking(subclass: type[AiModel]) -> None:
    number_of_coroutines_to_run = 5
    list_should_run_under_x_times_first_coroutine = 3

-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip(
            "GptO1 is around 2c per call, so this test would be too expensive"
        )

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.tokens_incur_cost import (
    TokensIncurCost,
)
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.model_archetypes.traditional_online_llm import (
    TraditionalOnlineLlm,
)
@@ -23,7 +23,7 @@
def test_predicted_cost_and_tokens_correct_for_cheap_input(
    subclass: type[TokensIncurCost],
) -> None:
-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip(
            "GptO1 has a weird tokenization scheme that doesn't allow for predicting tokens, and is sometimes 1 token off on small prompts. This is after following official instructions"
        )
@@ -36,7 +36,7 @@ def test_system_prompt_cost_and_tokens_correct_for_cheap_input(
    subclass: type[TokensIncurCost],
) -> None:
    if not issubclass(subclass, TraditionalOnlineLlm) or issubclass(
-        subclass, GptO1
+        subclass, GptO1Preview
    ):
        pytest.skip("Model doesn't have a system prompt")

5 changes: 3 additions & 2 deletions code_tests/unit_tests/test_ai_models/models_to_test.py
@@ -24,7 +24,7 @@
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.ai_models.gpt4ovision import Gpt4oVision
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,8 @@ class ModelsToTest:
        Gpt4o,
        Gpt4oMetaculusProxy,
        Gpt4oVision,
-        GptO1,
+        GptO1Preview,
+        # GptO1, # TODO: dependencies do not yet support this
        Claude35Sonnet,
        Perplexity,
        ExaSearcher,
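For context on how a registry like this is typically consumed: a class-level list of model types feeds pytest parametrization, so commenting out GptO1 above removes it from every test in the suite at once. A minimal sketch of that pattern, with illustrative names rather than the repository's actual test wiring:

import pytest

# Hypothetical stand-ins for the repository's model classes.
class FakeModelA: ...
class FakeModelB: ...

ALL_MODELS = [FakeModelA, FakeModelB]  # analogous to the list in ModelsToTest

@pytest.mark.parametrize("subclass", ALL_MODELS)
def test_each_model_class_is_instantiable(subclass: type) -> None:
    # Every class in the list runs through the same test body.
    assert subclass() is not None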
6 changes: 3 additions & 3 deletions forecasting_tools/ai_models/ai_utils/openai_utils.py
@@ -1,7 +1,5 @@
-import logging
-
-logger = logging.getLogger(__name__)
import base64
+import logging
import math
import re
from io import BytesIO
@@ -21,6 +19,8 @@
from pydantic import BaseModel
from tiktoken import Encoding

+logger = logging.getLogger(__name__)
+

class VisionMessageData(BaseModel):
    prompt: str
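The openai_utils.py change is the standard fix for a module logger declared before the rest of the imports: logging.getLogger is cheap and side-effect free, so the logger belongs after the import block, where it no longer breaks PEP 8 import ordering. A minimal sketch of the resulting layout (the helper function below is illustrative only, not from the repository):

import base64
import logging
import math

logger = logging.getLogger(__name__)  # defined once all imports are done


def base64_blocks(data: bytes) -> int:
    # Any function defined below module scope can now use the logger.
    logger.debug("encoding %d bytes", len(data))
    return math.ceil(len(base64.b64encode(data)) / 4)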
44 changes: 3 additions & 41 deletions forecasting_tools/ai_models/gpto1.py
@@ -1,43 +1,5 @@
-from typing import Any, Final
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview

-from forecasting_tools.ai_models.ai_utils.response_types import (
-    TextTokenCostResponse,
-)
-from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
-    OpenAiTextToTextModel,
-)
-
-
-class GptO1(OpenAiTextToTextModel):
-    # See OpenAI Limit on the account dashboard for most up-to-date limit
-    MODEL_NAME: Final[str] = "o1-preview"
-    REQUESTS_PER_PERIOD_LIMIT: Final[int] = 8_000
-    REQUEST_PERIOD_IN_SECONDS: Final[int] = 60
-    TIMEOUT_TIME: Final[int] = 120
-    TOKENS_PER_PERIOD_LIMIT: Final[int] = 2_000_000
-    TOKEN_PERIOD_IN_SECONDS: Final[int] = 60
-
-    def __init__(
-        self,
-        *args: Any,
-        temperature: float = 1,
-        system_prompt: str | None = None,
-        **kwargs: Any,
-    ):
-        assert (
-            system_prompt is None
-        ), "GptO1Preview does not support system prompts"
-        assert (
-            temperature == 1
-        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
-        super().__init__(*args, temperature=temperature, **kwargs)
-
-    @classmethod
-    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
-        cls,
-    ) -> TextTokenCostResponse:
-        response = (
-            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
-        )
-        response.total_tokens_used += 269  # Add reasoning tokens
-        return response
+
+class GptO1(GptO1Preview):
+    MODEL_NAME: str = "o1"
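With the shared behavior moved into GptO1Preview, gpto1.py shrinks to a name-only override: the rate limits, the temperature and system-prompt assertions, and the mock-token accounting are all inherited. Pointing at another OpenAI model then becomes a one-attribute subclass; a sketch of that pattern (GptO1Mini is hypothetical and not part of this commit):

from forecasting_tools.ai_models.gpto1preview import GptO1Preview


class GptO1Mini(GptO1Preview):  # hypothetical example only
    MODEL_NAME: str = "o1-mini"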
43 changes: 43 additions & 0 deletions forecasting_tools/ai_models/gpto1preview.py
@@ -0,0 +1,43 @@
from typing import Any

from forecasting_tools.ai_models.ai_utils.response_types import (
    TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
    OpenAiTextToTextModel,
)


class GptO1Preview(OpenAiTextToTextModel):
    # See OpenAI Limit on the account dashboard for most up-to-date limit
    MODEL_NAME: str = "o1-preview"
    REQUESTS_PER_PERIOD_LIMIT: int = 8_000
    REQUEST_PERIOD_IN_SECONDS: int = 60
    TIMEOUT_TIME: int = 120
    TOKENS_PER_PERIOD_LIMIT: int = 2_000_000
    TOKEN_PERIOD_IN_SECONDS: int = 60

    def __init__(
        self,
        *args: Any,
        temperature: float = 1,
        system_prompt: str | None = None,
        **kwargs: Any,
    ):
        assert (
            system_prompt is None
        ), "GptO1Preview does not support system prompts"
        assert (
            temperature == 1
        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
        super().__init__(*args, temperature=temperature, **kwargs)

    @classmethod
    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
        cls,
    ) -> TextTokenCostResponse:
        response = (
            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
        )
        response.total_tokens_used += 269  # Add reasoning tokens
        return response
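The two assertions in __init__ encode the o1-preview API's constraints at construction time: no system prompt, and temperature fixed at 1. Given the code above, a misconfigured instance fails fast:

from forecasting_tools.ai_models.gpto1preview import GptO1Preview

model = GptO1Preview()  # fine: temperature defaults to 1, no system prompt

try:
    GptO1Preview(temperature=0.5)
except AssertionError as error:
    # "GptO1Preview must have temperature 1, but 0.5 was given."
    print(error)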
27 changes: 27 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/exa_bot.py
@@ -0,0 +1,27 @@
from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.forecasting.forecast_bots.experiments.q3_template_bot import (
    Q3TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.questions import (
    MetaculusQuestion,
)


class ExaBot(Q3TemplateBot):

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response
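ExaBot inherits Q3TemplateBot's forecasting step and swaps only the research step from Perplexity to SmartSearcher. A hedged usage sketch, assuming MetaculusQuestion can be constructed from a bare question_text (the diff only shows that question.question_text is read; the real constructor signature is not shown here):

import asyncio

from forecasting_tools.forecasting.forecast_bots.experiments.exa_bot import ExaBot
from forecasting_tools.forecasting.questions_and_reports.questions import MetaculusQuestion

async def main() -> None:
    bot = ExaBot()
    # Assumed constructor, for illustration only.
    question = MetaculusQuestion(question_text="Will X happen by 2026?")
    print(await bot.run_research(question))

asyncio.run(main())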
85 changes: 85 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/exa_q4_binary.py
@@ -0,0 +1,85 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class ExaQ4BinaryBot(TemplateBot):
    FINAL_DECISION_LLM = Gpt4o(temperature=0.1)

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        assert isinstance(
            question, BinaryQuestion
        ), "Question must be a BinaryQuestion"
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.
            Your interview question is:
            {question.question_text}
            Background information:
            {question.background_info if question.background_info else "No background information provided."}
            Resolution criteria:
            {question.resolution_criteria if question.resolution_criteria else "No resolution criteria provided."}
            Fine print:
            {question.fine_print if question.fine_print else "No fine print provided."}
            Your research assistant says:
            ```
            {research}
            ```
            Today is {datetime.now().strftime("%Y-%m-%d")}.
            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) The most important factors that will influence a successful/unsuccessful resolution.
            (d) What do you not know that should give you pause and lower confidence? Remember people are statistically overconfident.
            (e) What you would forecast if you were to only use historical precedent (i.e. how often this happens in the past) without any current information.
            (f) What you would forecast if there was only a quarter of the time left.
            (g) What you would forecast if there was 4x the time left.
            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        gpt_forecast = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            gpt_forecast, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=gpt_forecast
        )
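The prompt pins the output contract to a final "Probability: ZZ%" line, which is what makes _extract_forecast_from_binary_rationale tractable; its implementation is not part of this diff, but the max_prediction/min_prediction arguments imply clamping. A plausible sketch of such an extractor (an assumption, not the repository's actual helper):

import re

def extract_binary_forecast(
    rationale: str, max_prediction: float = 0.99, min_prediction: float = 0.01
) -> float:
    # Take the last "Probability: NN%" match so the what-if numbers from
    # steps (e)-(g) earlier in the rationale are ignored.
    matches = re.findall(r"Probability:\s*(\d+(?:\.\d+)?)\s*%", rationale)
    if not matches:
        raise ValueError("No 'Probability: ZZ%' line found in rationale")
    probability = float(matches[-1]) / 100
    return min(max(probability, min_prediction), max_prediction)

print(extract_binary_forecast("... reasoning ... Probability: 67%"))  # 0.67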
@@ -0,0 +1,8 @@
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryO1PreviewBot(ExaQ4BinaryBot):
    FINAL_DECISION_LLM = GptO1Preview()
81 changes: 81 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/q3_template_bot.py
@@ -0,0 +1,81 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.ai_models.perplexity import Perplexity
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class Q3TemplateBot(TemplateBot):
    """
    Find the q3 bot here: https://github.com/Metaculus/metac-bot/commit/e459f2958f66658783057da46e257896b49607be
    """

    FINAL_DECISION_LLM = Gpt4o(
        temperature=0.1
    )  # Q3 Bot used the default llama index temperature which as of Dec 21 2024 is 0.1

    async def run_research(self, question: MetaculusQuestion) -> str:
        system_prompt = clean_indents(
            """
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            """
        )

        # Note: The original q3 bot did not set temperature, and I could not find the default temperature of perplexity
        response = await Perplexity(
            temperature=0.1, system_prompt=system_prompt
        ).invoke(question.question_text)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.
            Your interview question is:
            {question.question_text}
            background:
            {question.background_info}
            {question.resolution_criteria}
            {question.fine_print}
            Your research assistant says:
            {research}
            Today is {datetime.now().strftime("%Y-%m-%d")}.
            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) What you would forecast if there was only a quarter of the time left.
            (d) What you would forecast if there was 4x the time left.
            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        reasoning = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            reasoning, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=reasoning
        )
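Q3TemplateBot splits forecasting into two LLM calls: Perplexity gathers research under a fixed system prompt, then Gpt4o reasons to a probability at temperature 0.1. A hedged sketch of wiring the two steps by hand; in the repository the TemplateBot base class presumably orchestrates this, and the private _run_forecast_on_binary is called directly here only for illustration:

import asyncio

async def forecast_once(bot: Q3TemplateBot, question: BinaryQuestion) -> float:
    research = await bot.run_research(question)  # step 1: Perplexity research
    prediction = await bot._run_forecast_on_binary(question, research)  # step 2: Gpt4o
    return prediction.prediction_value

# Example (assumes a BinaryQuestion instance is available):
# asyncio.run(forecast_once(Q3TemplateBot(), some_binary_question))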
