Added a few new bots in experiments and updated benchmarking (#7)
* Added new experimental bots

* Fixed the 'publish reports to metaculus' ForecastBot parameter so it actually works

* Added two new experiment bots

* Updated o1 and o1 preview in bot

* Removed o1 model from testing due to dependencies not supporting it yet
CodexVeritas authored Dec 22, 2024
1 parent 2438958 commit c3678ed
Showing 18 changed files with 519 additions and 133 deletions.

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,7 @@ async def test_response_from_a_direct_call_is_same_ask_mock_value(
"As of Aug 18 2024 Exasearcher doesn't depend on exact mock return values to validate other steps"
)

-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip("GptO1 has inconsistent reasoning token count.")

    if issubclass(subclass, Claude35Sonnet):
@@ -90,7 +90,7 @@ def test_ai_model_async_is_not_blocking(subclass: type[AiModel]) -> None:
    number_of_coroutines_to_run = 5
    list_should_run_under_x_times_first_coroutine = 3

-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip(
            "GptO1 is around 2c per call, so this test would be too expensive"
        )

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.tokens_incur_cost import (
    TokensIncurCost,
)
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.model_archetypes.traditional_online_llm import (
    TraditionalOnlineLlm,
)
@@ -23,7 +23,7 @@
def test_predicted_cost_and_tokens_correct_for_cheap_input(
    subclass: type[TokensIncurCost],
) -> None:
-    if issubclass(subclass, GptO1):
+    if issubclass(subclass, GptO1Preview):
        pytest.skip(
            "GptO1 has a weird tokenization scheme that doesn't allow for predicting tokens, and is sometimes 1 token off on small prompts. This is after following official instructions"
        )
@@ -36,7 +36,7 @@ def test_system_prompt_cost_and_tokens_correct_for_cheap_input(
    subclass: type[TokensIncurCost],
) -> None:
    if not issubclass(subclass, TraditionalOnlineLlm) or issubclass(
-        subclass, GptO1
+        subclass, GptO1Preview
    ):
        pytest.skip("Model doesn't have a system prompt")

5 changes: 3 additions & 2 deletions code_tests/unit_tests/test_ai_models/models_to_test.py
@@ -24,7 +24,7 @@
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.ai_models.gpt4ovision import Gpt4oVision
-from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,8 @@ class ModelsToTest:
        Gpt4o,
        Gpt4oMetaculusProxy,
        Gpt4oVision,
-        GptO1,
+        GptO1Preview,
+        # GptO1, # TODO: dependencies do not yet support this
        Claude35Sonnet,
        Perplexity,
        ExaSearcher,
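For context on how a registry like this is typically consumed: a class-level list of model types feeds pytest parametrization, so commenting out GptO1 above removes it from every test in the suite at once. A minimal sketch of that pattern, with illustrative names rather than the repository's actual test wiring:

import pytest

# Hypothetical stand-ins for the repository's model classes.
class FakeModelA: ...
class FakeModelB: ...

ALL_MODELS = [FakeModelA, FakeModelB]  # analogous to the list in ModelsToTest

@pytest.mark.parametrize("subclass", ALL_MODELS)
def test_each_model_class_is_instantiable(subclass: type) -> None:
    # Every class in the list runs through the same test body.
    assert subclass() is not None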
6 changes: 3 additions & 3 deletions forecasting_tools/ai_models/ai_utils/openai_utils.py
@@ -1,7 +1,5 @@
-import logging
-
-logger = logging.getLogger(__name__)
import base64
+import logging
import math
import re
from io import BytesIO
@@ -21,6 +19,8 @@
from pydantic import BaseModel
from tiktoken import Encoding

+logger = logging.getLogger(__name__)
+

class VisionMessageData(BaseModel):
    prompt: str
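The openai_utils.py change is the standard fix for a module logger declared before the rest of the imports: logging.getLogger is cheap and side-effect free, so the logger belongs after the import block, where it no longer breaks PEP 8 import ordering. A minimal sketch of the resulting layout (the helper function below is illustrative only, not from the repository):

import base64
import logging
import math

logger = logging.getLogger(__name__)  # defined once all imports are done


def base64_blocks(data: bytes) -> int:
    # Any function defined below module scope can now use the logger.
    logger.debug("encoding %d bytes", len(data))
    return math.ceil(len(base64.b64encode(data)) / 4)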
44 changes: 3 additions & 41 deletions forecasting_tools/ai_models/gpto1.py
@@ -1,43 +1,5 @@
-from typing import Any, Final
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview

-from forecasting_tools.ai_models.ai_utils.response_types import (
-    TextTokenCostResponse,
-)
-from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
-    OpenAiTextToTextModel,
-)
-
-
-class GptO1(OpenAiTextToTextModel):
-    # See OpenAI Limit on the account dashboard for most up-to-date limit
-    MODEL_NAME: Final[str] = "o1-preview"
-    REQUESTS_PER_PERIOD_LIMIT: Final[int] = 8_000
-    REQUEST_PERIOD_IN_SECONDS: Final[int] = 60
-    TIMEOUT_TIME: Final[int] = 120
-    TOKENS_PER_PERIOD_LIMIT: Final[int] = 2_000_000
-    TOKEN_PERIOD_IN_SECONDS: Final[int] = 60
-
-    def __init__(
-        self,
-        *args: Any,
-        temperature: float = 1,
-        system_prompt: str | None = None,
-        **kwargs: Any,
-    ):
-        assert (
-            system_prompt is None
-        ), "GptO1Preview does not support system prompts"
-        assert (
-            temperature == 1
-        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
-        super().__init__(*args, temperature=temperature, **kwargs)
-
-    @classmethod
-    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
-        cls,
-    ) -> TextTokenCostResponse:
-        response = (
-            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
-        )
-        response.total_tokens_used += 269  # Add reasoning tokens
-        return response
+
+class GptO1(GptO1Preview):
+    MODEL_NAME: str = "o1"
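With the shared behavior moved into GptO1Preview, gpto1.py shrinks to a name-only override: the rate limits, the temperature and system-prompt assertions, and the mock-token accounting are all inherited. Pointing at another OpenAI model then becomes a one-attribute subclass; a sketch of that pattern (GptO1Mini is hypothetical and not part of this commit):

from forecasting_tools.ai_models.gpto1preview import GptO1Preview


class GptO1Mini(GptO1Preview):  # hypothetical example only
    MODEL_NAME: str = "o1-mini"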
43 changes: 43 additions & 0 deletions forecasting_tools/ai_models/gpto1preview.py
@@ -0,0 +1,43 @@
from typing import Any

from forecasting_tools.ai_models.ai_utils.response_types import (
    TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
    OpenAiTextToTextModel,
)


class GptO1Preview(OpenAiTextToTextModel):
    # See OpenAI Limit on the account dashboard for most up-to-date limit
    MODEL_NAME: str = "o1-preview"
    REQUESTS_PER_PERIOD_LIMIT: int = 8_000
    REQUEST_PERIOD_IN_SECONDS: int = 60
    TIMEOUT_TIME: int = 120
    TOKENS_PER_PERIOD_LIMIT: int = 2_000_000
    TOKEN_PERIOD_IN_SECONDS: int = 60

    def __init__(
        self,
        *args: Any,
        temperature: float = 1,
        system_prompt: str | None = None,
        **kwargs: Any,
    ):
        assert (
            system_prompt is None
        ), "GptO1Preview does not support system prompts"
        assert (
            temperature == 1
        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
        super().__init__(*args, temperature=temperature, **kwargs)

    @classmethod
    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
        cls,
    ) -> TextTokenCostResponse:
        response = (
            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
        )
        response.total_tokens_used += 269  # Add reasoning tokens
        return response
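The two assertions in __init__ encode the o1-preview API's constraints at construction time: no system prompt, and temperature fixed at 1. Given the code above, a misconfigured instance fails fast:

from forecasting_tools.ai_models.gpto1preview import GptO1Preview

model = GptO1Preview()  # fine: temperature defaults to 1, no system prompt

try:
    GptO1Preview(temperature=0.5)
except AssertionError as error:
    # "GptO1Preview must have temperature 1, but 0.5 was given."
    print(error)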
27 changes: 27 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/exa_bot.py
@@ -0,0 +1,27 @@
from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.forecasting.forecast_bots.experiments.q3_template_bot import (
    Q3TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.questions import (
    MetaculusQuestion,
)


class ExaBot(Q3TemplateBot):

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response
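ExaBot inherits Q3TemplateBot's forecasting step and swaps only the research step from Perplexity to SmartSearcher. A hedged usage sketch, assuming MetaculusQuestion can be constructed from a bare question_text (the diff only shows that question.question_text is read; the real constructor signature is not shown here):

import asyncio

from forecasting_tools.forecasting.forecast_bots.experiments.exa_bot import ExaBot
from forecasting_tools.forecasting.questions_and_reports.questions import MetaculusQuestion

async def main() -> None:
    bot = ExaBot()
    # Assumed constructor, for illustration only.
    question = MetaculusQuestion(question_text="Will X happen by 2026?")
    print(await bot.run_research(question))

asyncio.run(main())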
85 changes: 85 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/exa_q4_binary.py
@@ -0,0 +1,85 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class ExaQ4BinaryBot(TemplateBot):
    FINAL_DECISION_LLM = Gpt4o(temperature=0.1)

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        assert isinstance(
            question, BinaryQuestion
        ), "Question must be a BinaryQuestion"
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.
            Your interview question is:
            {question.question_text}
            Background information:
            {question.background_info if question.background_info else "No background information provided."}
            Resolution criteria:
            {question.resolution_criteria if question.resolution_criteria else "No resolution criteria provided."}
            Fine print:
            {question.fine_print if question.fine_print else "No fine print provided."}
            Your research assistant says:
            ```
            {research}
            ```
            Today is {datetime.now().strftime("%Y-%m-%d")}.
            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) The most important factors that will influence a successful/unsuccessful resolution.
            (d) What do you not know that should give you pause and lower confidence? Remember people are statistically overconfident.
            (e) What you would forecast if you were to only use historical precedent (i.e. how often this happens in the past) without any current information.
            (f) What you would forecast if there was only a quarter of the time left.
            (g) What you would forecast if there was 4x the time left.
            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        gpt_forecast = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            gpt_forecast, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=gpt_forecast
        )
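The prompt pins the output contract to a final "Probability: ZZ%" line, which is what makes _extract_forecast_from_binary_rationale tractable; its implementation is not part of this diff, but the max_prediction/min_prediction arguments imply clamping. A plausible sketch of such an extractor (an assumption, not the repository's actual helper):

import re

def extract_binary_forecast(
    rationale: str, max_prediction: float = 0.99, min_prediction: float = 0.01
) -> float:
    # Take the last "Probability: NN%" match so the what-if numbers from
    # steps (e)-(g) earlier in the rationale are ignored.
    matches = re.findall(r"Probability:\s*(\d+(?:\.\d+)?)\s*%", rationale)
    if not matches:
        raise ValueError("No 'Probability: ZZ%' line found in rationale")
    probability = float(matches[-1]) / 100
    return min(max(probability, min_prediction), max_prediction)

print(extract_binary_forecast("... reasoning ... Probability: 67%"))  # 0.67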
@@ -0,0 +1,8 @@
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryO1PreviewBot(ExaQ4BinaryBot):
    FINAL_DECISION_LLM = GptO1Preview()
81 changes: 81 additions & 0 deletions forecasting_tools/forecasting/forecast_bots/experiments/q3_template_bot.py
@@ -0,0 +1,81 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.ai_models.perplexity import Perplexity
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class Q3TemplateBot(TemplateBot):
    """
    Find the q3 bot here: https://github.com/Metaculus/metac-bot/commit/e459f2958f66658783057da46e257896b49607be
    """

    FINAL_DECISION_LLM = Gpt4o(
        temperature=0.1
    )  # Q3 Bot used the default llama index temperature which as of Dec 21 2024 is 0.1

    async def run_research(self, question: MetaculusQuestion) -> str:
        system_prompt = clean_indents(
            """
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            """
        )

        # Note: The original q3 bot did not set temperature, and I could not find the default temperature of perplexity
        response = await Perplexity(
            temperature=0.1, system_prompt=system_prompt
        ).invoke(question.question_text)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.
            Your interview question is:
            {question.question_text}
            background:
            {question.background_info}
            {question.resolution_criteria}
            {question.fine_print}
            Your research assistant says:
            {research}
            Today is {datetime.now().strftime("%Y-%m-%d")}.
            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) What you would forecast if there was only a quarter of the time left.
            (d) What you would forecast if there was 4x the time left.
            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        reasoning = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            reasoning, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=reasoning
        )
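Q3TemplateBot splits forecasting into two LLM calls: Perplexity gathers research under a fixed system prompt, then Gpt4o reasons to a probability at temperature 0.1. A hedged sketch of wiring the two steps by hand; in the repository the TemplateBot base class presumably orchestrates this, and the private _run_forecast_on_binary is called directly here only for illustration:

import asyncio

async def forecast_once(bot: Q3TemplateBot, question: BinaryQuestion) -> float:
    research = await bot.run_research(question)  # step 1: Perplexity research
    prediction = await bot._run_forecast_on_binary(question, research)  # step 2: Gpt4o
    return prediction.prediction_value

# Example (assumes a BinaryQuestion instance is available):
# asyncio.run(forecast_once(Q3TemplateBot(), some_binary_question))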
