Commit

Added two new experiment bots

CodexVeritas committed Dec 21, 2024
1 parent 374fa4c commit 8013a75

Showing 10 changed files with 189 additions and 64 deletions.

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.ai_models.exa_searcher import ExaSearcher
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
from forecasting_tools.ai_models.perplexity import Perplexity

@@ -34,7 +34,7 @@ async def test_response_from_a_direct_call_is_same_ask_mock_value(
"As of Aug 18 2024 Exasearcher doesn't depend on exact mock return values to validate other steps"
)

if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip("GptO1 has inconsistent reasoning token count.")

if issubclass(subclass, Claude35Sonnet):
@@ -90,7 +90,7 @@ def test_ai_model_async_is_not_blocking(subclass: type[AiModel]) -> None:
number_of_coroutines_to_run = 5
list_should_run_under_x_times_first_coroutine = 3

if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip(
"GptO1 is around 2c per call, so this test would be too expensive"
)

@@ -13,7 +13,7 @@
from forecasting_tools.ai_models.basic_model_interfaces.tokens_incur_cost import (
TokensIncurCost,
)
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.ai_models.model_archetypes.traditional_online_llm import (
TraditionalOnlineLlm,
)
@@ -23,7 +23,7 @@
def test_predicted_cost_and_tokens_correct_for_cheap_input(
subclass: type[TokensIncurCost],
) -> None:
if issubclass(subclass, GptO1):
if issubclass(subclass, GptO1Preview):
pytest.skip(
"GptO1 has a weird tokenization scheme that doesn't allow for predicting tokens, and is sometimes 1 token off on small prompts. This is after following official instructions"
)
@@ -36,7 +36,7 @@ def test_system_prompt_cost_and_tokens_correct_for_cheap_input(
subclass: type[TokensIncurCost],
) -> None:
if not issubclass(subclass, TraditionalOnlineLlm) or issubclass(
subclass, GptO1
subclass, GptO1Preview
):
pytest.skip("Model doesn't have a system prompt")


1 change: 1 addition & 0 deletions code_tests/unit_tests/test_ai_models/models_to_test.py
@@ -34,6 +34,7 @@ class ModelsToTest:
Gpt4o,
Gpt4oMetaculusProxy,
Gpt4oVision,
# GptO1Preview,
GptO1,
Claude35Sonnet,
Perplexity,

44 changes: 3 additions & 41 deletions forecasting_tools/ai_models/gpto1.py
@@ -1,43 +1,5 @@
from typing import Any, Final
from forecasting_tools.ai_models.gpto1preview import GptO1Preview

from forecasting_tools.ai_models.ai_utils.response_types import (
TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
OpenAiTextToTextModel,
)


class GptO1(OpenAiTextToTextModel):
# See OpenAI Limit on the account dashboard for most up-to-date limit
MODEL_NAME: Final[str] = "o1-preview"
REQUESTS_PER_PERIOD_LIMIT: Final[int] = 8_000
REQUEST_PERIOD_IN_SECONDS: Final[int] = 60
TIMEOUT_TIME: Final[int] = 120
TOKENS_PER_PERIOD_LIMIT: Final[int] = 2_000_000
TOKEN_PERIOD_IN_SECONDS: Final[int] = 60

def __init__(
self,
*args: Any,
temperature: float = 1,
system_prompt: str | None = None,
**kwargs: Any,
):
assert (
system_prompt is None
), "GptO1Preview does not support system prompts"
assert (
temperature == 1
), f"GptO1Preview must have temperature 1, but {temperature} was given."
super().__init__(*args, temperature=temperature, **kwargs)

@classmethod
def _get_mock_return_for_direct_call_to_model_using_cheap_input(
cls,
) -> TextTokenCostResponse:
response = (
super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
)
response.total_tokens_used += 269 # Add reasoning tokens
return response
class GptO1(GptO1Preview):
MODEL_NAME: str = "o1"
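
With this change, GptO1 collapses to a thin subclass of GptO1Preview that only overrides MODEL_NAME to "o1", so both classes share the same rate limits, the temperature/system-prompt assertions, and the mock reasoning-token accounting. A minimal usage sketch under that assumption (the invoke call mirrors how FINAL_DECISION_LLM is used elsewhere in this commit; the comparison helper itself is illustrative and not part of the repository):

```python
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.ai_models.gpto1preview import GptO1Preview


async def compare_o1_variants(prompt: str) -> None:
    # Both constructors enforce temperature=1 and no system prompt,
    # since those assertions now live on the shared GptO1Preview base.
    preview_model = GptO1Preview()
    full_model = GptO1()  # same behaviour, but MODEL_NAME is "o1"
    preview_answer = await preview_model.invoke(prompt)
    full_answer = await full_model.invoke(prompt)
    print(preview_answer, full_answer)


# Run with: asyncio.run(compare_o1_variants("Will it rain in London tomorrow?"))
```
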
43 changes: 43 additions & 0 deletions forecasting_tools/ai_models/gpto1preview.py
@@ -0,0 +1,43 @@
from typing import Any

from forecasting_tools.ai_models.ai_utils.response_types import (
TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
OpenAiTextToTextModel,
)


class GptO1Preview(OpenAiTextToTextModel):
# See OpenAI Limit on the account dashboard for most up-to-date limit
MODEL_NAME: str = "o1-preview"
REQUESTS_PER_PERIOD_LIMIT: int = 8_000
REQUEST_PERIOD_IN_SECONDS: int = 60
TIMEOUT_TIME: int = 120
TOKENS_PER_PERIOD_LIMIT: int = 2_000_000
TOKEN_PERIOD_IN_SECONDS: int = 60

def __init__(
self,
*args: Any,
temperature: float = 1,
system_prompt: str | None = None,
**kwargs: Any,
):
assert (
system_prompt is None
), "GptO1Preview does not support system prompts"
assert (
temperature == 1
), f"GptO1Preview must have temperature 1, but {temperature} was given."
super().__init__(*args, temperature=temperature, **kwargs)

@classmethod
def _get_mock_return_for_direct_call_to_model_using_cheap_input(
cls,
) -> TextTokenCostResponse:
response = (
super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
)
response.total_tokens_used += 269 # Add reasoning tokens
return response

@@ -0,0 +1,85 @@
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.forecasting.forecast_bots.template_bot import (
TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
BinaryQuestion,
MetaculusQuestion,
)


class ExaQ4BinaryBot(TemplateBot):
FINAL_DECISION_LLM = Gpt4o(temperature=0.1)

async def run_research(self, question: MetaculusQuestion) -> str:
prompt = clean_indents(
f"""
You are an assistant to a superforecaster.
The superforecaster will give you a question they intend to forecast on.
To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
You do not produce forecasts yourself.
Question:
{question.question_text}
"""
)

response = await SmartSearcher(temperature=0.1).invoke(prompt)
return response

async def _run_forecast_on_binary(
self, question: BinaryQuestion, research: str
) -> ReasonedPrediction[float]:
assert isinstance(
question, BinaryQuestion
), "Question must be a BinaryQuestion"
prompt = clean_indents(
f"""
You are a professional forecaster interviewing for a job.
Your interview question is:
{question.question_text}
Background information:
{question.background_info if question.background_info else "No background information provided."}
Resolution criteria:
{question.resolution_criteria if question.resolution_criteria else "No resolution criteria provided."}
Fine print:
{question.fine_print if question.fine_print else "No fine print provided."}
Your research assistant says:
```
{research}
```
Today is {datetime.now().strftime("%Y-%m-%d")}.
Before answering you write:
(a) The time left until the outcome to the question is known.
(b) What the outcome would be if nothing changed.
(c) The most important factors that will influence a successful/unsuccessful resolution.
(d) What do you not know that should give you pause and lower confidence? Remember people are statistically overconfident.
(e) What you would forecast if you were to only use historical precedent (i.e. how often this happens in the past) without any current information.
(f) What you would forecast if there was only a quarter of the time left.
(g) What you would forecast if there was 4x the time left.
You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
"""
)
gpt_forecast = await self.FINAL_DECISION_LLM.invoke(prompt)
prediction = self._extract_forecast_from_binary_rationale(
gpt_forecast, max_prediction=0.99, min_prediction=0.01
)
return ReasonedPrediction(
prediction_value=prediction, reasoning=gpt_forecast
)
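
The bot delegates parsing to the inherited _extract_forecast_from_binary_rationale helper, whose implementation is not part of this diff. A rough, hypothetical sketch of what such an extractor has to do with the "Probability: ZZ%" convention used in the prompt above (function name and regex are illustrative, not the library's actual code):

```python
import re


def extract_probability_from_rationale(
    rationale: str, max_prediction: float = 0.99, min_prediction: float = 0.01
) -> float:
    # Hypothetical stand-in for _extract_forecast_from_binary_rationale:
    # take the last "Probability: ZZ%" occurrence in the rationale and
    # clamp it into [min_prediction, max_prediction].
    matches = re.findall(r"Probability:\s*(\d{1,3}(?:\.\d+)?)\s*%", rationale)
    if not matches:
        raise ValueError("No 'Probability: ZZ%' line found in rationale")
    probability = float(matches[-1]) / 100
    return min(max(probability, min_prediction), max_prediction)


# extract_probability_from_rationale("...rationale... Probability: 7%") -> 0.07
```
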

@@ -0,0 +1,8 @@
from forecasting_tools.ai_models.gpto1 import GptO1
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
ExaQ4BinaryBot,
)


class ExaQ4BinaryO1Bot(ExaQ4BinaryBot):
FINAL_DECISION_LLM = GptO1()
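
These experiment bots swap the final-decision model purely by subclassing and overriding FINAL_DECISION_LLM, which keeps the research step and the prompt identical across variants. A hypothetical sketch of a further variant in the same style (the Claude-backed class below is illustrative only and not part of this commit):

```python
from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryClaudeBot(ExaQ4BinaryBot):
    # Illustrative only: reuse ExaQ4BinaryBot's research and prompting
    # pipeline, but let Claude 3.5 Sonnet make the final probability call.
    FINAL_DECISION_LLM = Claude35Sonnet()
```
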
28 changes: 21 additions & 7 deletions forecasting_tools/forecasting/helpers/benchmarker.py
@@ -1,3 +1,5 @@
import inspect
import logging
import subprocess
import time
from datetime import datetime
@@ -27,6 +29,8 @@
MetaculusQuestion,
)

logger = logging.getLogger(__name__)


class Benchmarker:
"""
@@ -46,6 +50,7 @@ def __init__(
forecast_bots: list[ForecastBot],
number_of_questions_to_use: int,
file_path_to_save_reports: str | None = None,
concurrent_question_batch_size: int = 10,
) -> None:
self.forecast_bots = forecast_bots
self.number_of_questions_to_use = number_of_questions_to_use
@@ -56,35 +61,44 @@ def __init__(
file_path_to_save_reports += "/"
self.file_path_to_save_reports = file_path_to_save_reports
self.initialization_timestamp = datetime.now()
self.concurrent_question_batch_size = concurrent_question_batch_size

async def run_benchmark(self) -> list[BenchmarkForBot]:

questions = MetaculusApi.get_benchmark_questions(
self.number_of_questions_to_use,
random_seed=42,
# Choose a random seed so all benchmarks in a similar time period use the same questions
)

questions = typeguard.check_type(questions, list[MetaculusQuestion])
assert len(questions) == self.number_of_questions_to_use
benchmarks = [
BenchmarkForBot(

benchmarks = []
for bot in self.forecast_bots:
try:
source_code = inspect.getsource(bot.__class__)
except Exception:
logger.warning(
f"Could not get source code for {bot.__class__.__name__}"
)
source_code = None
benchmark = BenchmarkForBot(
forecast_reports=[],
forecast_bot_config=bot.get_config(),
description=f"This benchmark ran the {bot.__class__.__name__} bot on {self.number_of_questions_to_use} questions.",
name=f"Benchmark for {bot.__class__.__name__}",
time_taken_in_minutes=None,
total_cost=None,
git_commit_hash=self._get_git_commit_hash(),
code=source_code,
)
for bot in self.forecast_bots
]
benchmarks.append(benchmark)

question_batch_size = 10
for bot, benchmark in zip(self.forecast_bots, benchmarks):
with MonetaryCostManager() as cost_manager:
start_time = time.time()
for batch in self._batch_questions(
questions, question_batch_size
questions, self.concurrent_question_batch_size
):
reports = await bot.forecast_questions(batch)
reports = typeguard.check_type(

@@ -26,6 +26,7 @@ class BenchmarkForBot(BaseModel, Jsonable):
total_cost: float | None
git_commit_hash: str
forecast_bot_config: dict[str, str]
code: str | None = None
forecast_reports: list[BinaryReport | NumericReport | MultipleChoiceReport]

@property

31 changes: 21 additions & 10 deletions scripts/benchmark_forecast_bot.py
@@ -11,45 +11,56 @@
from forecasting_tools.forecasting.forecast_bots.experiments.exa_bot import (
ExaBot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.q3_template_bot import (
Q3TemplateBot,
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
ExaQ4BinaryBot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary_o1 import (
ExaQ4BinaryO1Bot,
)
from forecasting_tools.forecasting.forecast_bots.experiments.q4_main_binary_bot import (
Q4MainBinaryBot,
)
from forecasting_tools.forecasting.forecast_bots.forecast_bot import (
ForecastBot,
)
from forecasting_tools.forecasting.forecast_bots.template_bot import (
TemplateBot,
)
from forecasting_tools.forecasting.helpers.benchmarker import Benchmarker
from forecasting_tools.util.custom_logger import CustomLogger

logger = logging.getLogger(__name__)


async def benchmark_forecast_bot() -> None:
questions_to_use = 120
questions_to_use = 2
with MonetaryCostManager() as cost_manager:
bots = [
ExaBot(),
Q4MainBinaryBot(),
Q3TemplateBot(),
TemplateBot(),
Q3TemplateBot(
ExaBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
Q4MainBinaryBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
ExaQ4BinaryBot(),
ExaQ4BinaryBot(
research_reports_per_question=3,
predictions_per_research_report=3,
),
ExaQ4BinaryO1Bot(),
]
bots = typeguard.check_type(bots, list[ForecastBot])
benchmarks = await Benchmarker(
number_of_questions_to_use=questions_to_use,
forecast_bots=bots,
file_path_to_save_reports="logs/forecasts/benchmarks/",
concurrent_question_batch_size=50,
).run_benchmark()
for i, benchmark in enumerate(benchmarks):
logger.info(f"Benchmark {i+1} of {len(benchmarks)}")
logger.info(
f"Benchmark {i+1} of {len(benchmarks)}: {benchmark.name}"
)
logger.info(
f"- Final Score: {benchmark.average_inverse_expected_log_score}"
)
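
The concurrent_question_batch_size=50 passed above feeds the batching loop in Benchmarker (the self._batch_questions(...) call in the earlier diff), whose body is not shown in this commit. A minimal sketch of what such a helper presumably does (the standalone function name is hypothetical):

```python
from typing import Iterator, TypeVar

T = TypeVar("T")


def batch_items(items: list[T], batch_size: int) -> Iterator[list[T]]:
    # Yield consecutive slices of at most batch_size items, so each slice
    # of questions can be forecast concurrently before the next one starts.
    for start in range(0, len(items), batch_size):
        yield items[start : start + batch_size]


# list(batch_items(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]
```
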