Added a few new bots in experiments and updated benchmarking (#7)
* Added new experimental bots
* Fixed the "publish reports to Metaculus" ForecastBot parameter so it actually works
* Added two new experiment bots
* Updated o1 and o1-preview in bot
* Removed the o1 model from testing because dependencies do not support it yet
1 parent 2438958 · commit c3678ed
Showing 18 changed files with 519 additions and 133 deletions.
GptO1 is reduced to a thin subclass of the new GptO1Preview (43 lines replaced with 5):

```diff
@@ -1,43 +1,5 @@
-from typing import Any, Final
-
-from forecasting_tools.ai_models.ai_utils.response_types import (
-    TextTokenCostResponse,
-)
-from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
-    OpenAiTextToTextModel,
-)
-
-
-class GptO1(OpenAiTextToTextModel):
-    # See OpenAI Limit on the account dashboard for most up-to-date limit
-    MODEL_NAME: Final[str] = "o1-preview"
-    REQUESTS_PER_PERIOD_LIMIT: Final[int] = 8_000
-    REQUEST_PERIOD_IN_SECONDS: Final[int] = 60
-    TIMEOUT_TIME: Final[int] = 120
-    TOKENS_PER_PERIOD_LIMIT: Final[int] = 2_000_000
-    TOKEN_PERIOD_IN_SECONDS: Final[int] = 60
-
-    def __init__(
-        self,
-        *args: Any,
-        temperature: float = 1,
-        system_prompt: str | None = None,
-        **kwargs: Any,
-    ):
-        assert (
-            system_prompt is None
-        ), "GptO1Preview does not support system prompts"
-        assert (
-            temperature == 1
-        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
-        super().__init__(*args, temperature=temperature, **kwargs)
-
-    @classmethod
-    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
-        cls,
-    ) -> TextTokenCostResponse:
-        response = (
-            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
-        )
-        response.total_tokens_used += 269  # Add reasoning tokens
-        return response
+from forecasting_tools.ai_models.gpto1preview import GptO1Preview
+
+
+class GptO1(GptO1Preview):
+    MODEL_NAME: str = "o1"
```
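Because GptO1 now inherits GptO1Preview's constructor, it keeps the same guard rails: no system prompt, temperature fixed at 1. A minimal sketch of what that means for callers, assuming the module path `forecasting_tools.ai_models.gpto1` matches the class name (the path is not shown in this diff):

```python
# Minimal sketch; the module path is an assumption inferred from the class name.
from forecasting_tools.ai_models.gpto1 import GptO1

model = GptO1()  # OK: the defaults satisfy the inherited assertions

try:
    GptO1(temperature=0.5)  # inherited __init__ asserts temperature == 1
except AssertionError as error:
    print(error)  # "GptO1Preview must have temperature 1, but 0.5 was given."
```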
forecasting_tools/ai_models/gpto1preview.py (new file, 43 additions):

```python
from typing import Any

from forecasting_tools.ai_models.ai_utils.response_types import (
    TextTokenCostResponse,
)
from forecasting_tools.ai_models.model_archetypes.openai_text_model import (
    OpenAiTextToTextModel,
)


class GptO1Preview(OpenAiTextToTextModel):
    # See OpenAI Limit on the account dashboard for most up-to-date limit
    MODEL_NAME: str = "o1-preview"
    REQUESTS_PER_PERIOD_LIMIT: int = 8_000
    REQUEST_PERIOD_IN_SECONDS: int = 60
    TIMEOUT_TIME: int = 120
    TOKENS_PER_PERIOD_LIMIT: int = 2_000_000
    TOKEN_PERIOD_IN_SECONDS: int = 60

    def __init__(
        self,
        *args: Any,
        temperature: float = 1,
        system_prompt: str | None = None,
        **kwargs: Any,
    ):
        assert (
            system_prompt is None
        ), "GptO1Preview does not support system prompts"
        assert (
            temperature == 1
        ), f"GptO1Preview must have temperature 1, but {temperature} was given."
        super().__init__(*args, temperature=temperature, **kwargs)

    @classmethod
    def _get_mock_return_for_direct_call_to_model_using_cheap_input(
        cls,
    ) -> TextTokenCostResponse:
        response = (
            super()._get_mock_return_for_direct_call_to_model_using_cheap_input()
        )
        response.total_tokens_used += 269  # Add reasoning tokens
        return response
```
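The overridden mock helper pads the parent's mock usage with a flat 269 "reasoning tokens", so cost estimates account for o1-style hidden reasoning. A hedged sketch of how a test might observe this (the base mock count comes from the parent class and is not shown in this diff):

```python
# Sketch only; exercises the overridden classmethod from the file above.
from forecasting_tools.ai_models.gpto1preview import GptO1Preview

response = (
    GptO1Preview._get_mock_return_for_direct_call_to_model_using_cheap_input()
)
# The total is the parent's mock token count plus 269 reasoning tokens.
print(response.total_tokens_used)
```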
forecasting_tools/forecasting/forecast_bots/experiments/exa_bot.py (new file, 27 additions):

```python
from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.forecasting.forecast_bots.experiments.q3_template_bot import (
    Q3TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.questions import (
    MetaculusQuestion,
)


class ExaBot(Q3TemplateBot):

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.

            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response
```
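A usage sketch under assumptions: ExaBot is constructed with no arguments (TemplateBot's constructor is not shown in this diff) and `question` is a MetaculusQuestion fetched elsewhere:

```python
# Illustrative sketch; assumes ExaBot() takes no required constructor
# arguments and that `question` is an existing MetaculusQuestion.
import asyncio

from forecasting_tools.forecasting.forecast_bots.experiments.exa_bot import ExaBot

async def demo(question):
    report = await ExaBot().run_research(question)
    print(report)

# asyncio.run(demo(question))  # run with a real MetaculusQuestion
```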
forecasting_tools/forecasting/forecast_bots/experiments/exa_q4_binary.py (new file, 85 additions):

````python
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.helpers.smart_searcher import SmartSearcher
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class ExaQ4BinaryBot(TemplateBot):
    FINAL_DECISION_LLM = Gpt4o(temperature=0.1)

    async def run_research(self, question: MetaculusQuestion) -> str:
        prompt = clean_indents(
            f"""
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.

            Question:
            {question.question_text}
            """
        )

        response = await SmartSearcher(temperature=0.1).invoke(prompt)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        assert isinstance(
            question, BinaryQuestion
        ), "Question must be a BinaryQuestion"
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.

            Your interview question is:
            {question.question_text}

            Background information:
            {question.background_info if question.background_info else "No background information provided."}

            Resolution criteria:
            {question.resolution_criteria if question.resolution_criteria else "No resolution criteria provided."}

            Fine print:
            {question.fine_print if question.fine_print else "No fine print provided."}

            Your research assistant says:
            ```
            {research}
            ```

            Today is {datetime.now().strftime("%Y-%m-%d")}.

            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) The most important factors that will influence a successful/unsuccessful resolution.
            (d) What do you not know that should give you pause and lower confidence? Remember people are statistically overconfident.
            (e) What you would forecast if you were to only use historical precedent (i.e. how often this happens in the past) without any current information.
            (f) What you would forecast if there was only a quarter of the time left.
            (g) What you would forecast if there was 4x the time left.

            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        gpt_forecast = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            gpt_forecast, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=gpt_forecast
        )
````
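End to end, the bot researches first and then forecasts. A sketch under the same assumptions as above (no-argument constructor, an existing BinaryQuestion `q`):

```python
# Sketch of the two-step pipeline defined above; assumes ExaQ4BinaryBot()
# needs no constructor arguments and `q` is a BinaryQuestion from elsewhere.
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)

async def forecast_binary(q):
    bot = ExaQ4BinaryBot()
    research = await bot.run_research(q)
    prediction = await bot._run_forecast_on_binary(q, research)
    # prediction_value is clamped to [0.01, 0.99] by the extraction step
    print(prediction.prediction_value)
    print(prediction.reasoning[:300])
```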
forecasting_tools/forecasting/forecast_bots/experiments/exa_q4_binary_o1_preview.py (new file, 8 additions):

```python
from forecasting_tools.ai_models.gpto1preview import GptO1Preview
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryO1PreviewBot(ExaQ4BinaryBot):
    FINAL_DECISION_LLM = GptO1Preview()
```
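The commit message notes the full o1 model was pulled from testing because dependencies do not yet support it; once they do, the same one-line override pattern would apply. A hypothetical sketch, not part of this commit (module path assumed):

```python
from forecasting_tools.ai_models.gpto1 import GptO1  # module path assumed
from forecasting_tools.forecasting.forecast_bots.experiments.exa_q4_binary import (
    ExaQ4BinaryBot,
)


class ExaQ4BinaryO1Bot(ExaQ4BinaryBot):  # hypothetical variant, not in the diff
    FINAL_DECISION_LLM = GptO1()
```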
forecasting_tools/forecasting/forecast_bots/experiments/q3_template_bot.py (new file, 81 additions):

```python
from datetime import datetime

from forecasting_tools.ai_models.ai_utils.ai_misc import clean_indents
from forecasting_tools.ai_models.gpt4o import Gpt4o
from forecasting_tools.ai_models.perplexity import Perplexity
from forecasting_tools.forecasting.forecast_bots.template_bot import (
    TemplateBot,
)
from forecasting_tools.forecasting.questions_and_reports.forecast_report import (
    ReasonedPrediction,
)
from forecasting_tools.forecasting.questions_and_reports.questions import (
    BinaryQuestion,
    MetaculusQuestion,
)


class Q3TemplateBot(TemplateBot):
    """
    Find the q3 bot here: https://github.com/Metaculus/metac-bot/commit/e459f2958f66658783057da46e257896b49607be
    """

    FINAL_DECISION_LLM = Gpt4o(
        temperature=0.1
    )  # Q3 Bot used the default llama index temperature which as of Dec 21 2024 is 0.1

    async def run_research(self, question: MetaculusQuestion) -> str:
        system_prompt = clean_indents(
            """
            You are an assistant to a superforecaster.
            The superforecaster will give you a question they intend to forecast on.
            To be a great assistant, you generate a concise but detailed rundown of the most relevant news, including if the question would resolve Yes or No based on current information.
            You do not produce forecasts yourself.
            """
        )

        # Note: The original q3 bot did not set temperature, and I could not find the default temperature of perplexity
        response = await Perplexity(
            temperature=0.1, system_prompt=system_prompt
        ).invoke(question.question_text)
        return response

    async def _run_forecast_on_binary(
        self, question: BinaryQuestion, research: str
    ) -> ReasonedPrediction[float]:
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.

            Your interview question is:
            {question.question_text}

            background:
            {question.background_info}
            {question.resolution_criteria}
            {question.fine_print}

            Your research assistant says:
            {research}

            Today is {datetime.now().strftime("%Y-%m-%d")}.

            Before answering you write:
            (a) The time left until the outcome to the question is known.
            (b) What the outcome would be if nothing changed.
            (c) What you would forecast if there was only a quarter of the time left.
            (d) What you would forecast if there was 4x the time left.

            You write your rationale and then the last thing you write is your final answer as: "Probability: ZZ%", 0-100
            """
        )
        reasoning = await self.FINAL_DECISION_LLM.invoke(prompt)
        prediction = self._extract_forecast_from_binary_rationale(
            reasoning, max_prediction=0.99, min_prediction=0.01
        )
        return ReasonedPrediction(
            prediction_value=prediction, reasoning=reasoning
        )
```
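All of these prompts end with the sentinel line "Probability: ZZ%", which `_extract_forecast_from_binary_rationale` parses and clamps to [0.01, 0.99]. That helper's implementation is not part of this commit; a plausible sketch of the parsing contract it appears to follow:

```python
import re

# Plausible sketch of the parsing contract; the real
# _extract_forecast_from_binary_rationale is defined elsewhere in the repo.
def extract_binary_forecast(
    rationale: str, min_prediction: float = 0.01, max_prediction: float = 0.99
) -> float:
    matches = re.findall(r"Probability:\s*(\d{1,3}(?:\.\d+)?)\s*%", rationale)
    assert matches, "No 'Probability: ZZ%' line found in rationale"
    probability = float(matches[-1]) / 100  # use the last stated probability
    return min(max(probability, min_prediction), max_prediction)
```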