gaming_coach.py
import time
import json
from dataclasses import dataclass, field
from dotenv import load_dotenv
from fancy_dataclass import ArgparseDataclass
from langchain.chat_models import init_chat_model
from modules.ai_coach import AICoach
from modules.image_proc.monitor_cam import MonitorCam
from modules.chatbot.llm_chatbot import LLMChatbot
from modules.chatbot.messages import HumanTextMessage, HumanImageMessage
from modules.text_to_speech.offline.windows_tts import WindowsTTS
from modules.overlay_ui.snapshot_overlay import SnapshotOverlay
from utils.config_types import CoachConfig
from utils.perf_timer import PerfTimer
from utils import image_proc
@dataclass
class CoachProgramArgs(ArgparseDataclass, default_help=True):
'''The main coaching program.'''
config_path: str = field(
metadata={
'args': '--config',
'required': True,
'help': 'The path to the config file (ends in .toml)'
}
)
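# Example invocation (the config path is illustrative):
#   python gaming_coach.py --config configs/coach.toml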
if __name__ == '__main__':
# Fetch CLI arguments
cli_args = CoachProgramArgs.from_cli_args()
config_filepath = cli_args.config_path
# Load environment variables
load_dotenv('.env')
# Load config file from specified file path
config = CoachConfig.load_config(config_filepath)
# Screenshot capture options
GPU_IDX = config.screenshot.GPU_IDX
MONITOR_IDX = config.screenshot.MONITOR_IDX
LOW_RESOLUTION_MODE = config.screenshot.LOW_RESOLUTION_MODE
# Chatbot AI options
MODEL_PROVIDER = config.chatbot.MODEL_PROVIDER
MODEL_NAME = config.chatbot.MODEL_NAME
KEEP_BACK_FORTH_HISTORY = config.chatbot.KEEP_BACK_FORTH_HISTORY
# Prompt options
PROMPTS_LIST_PATH = config.prompts.PROMPTS_LIST_PATH
PROMPT_CONFIG_NAME = config.prompts.PROMPT_CONFIG_NAME
# TTS options
MAIN_TTS_SERVICE = config.text_to_speech.MAIN_TTS_SERVICE
TTS_COQUI_VOICE_SAMPLES_PATH = config.text_to_speech.offline_ai.VOICE_SAMPLES_PATH
TTS_ELEVENLABS_VOICE_MODEL = config.text_to_speech.online_ai.ONLINE_VOICE_MODEL
# Start a Text-to-speech service based on either ElevenLabs, Coqui TTS, or the Windows API
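# NOTE: The ElevenLabs and Coqui imports are deferred into their match arms below so that
# the heavier optional dependencies are only loaded when that service is actually selected.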
match MAIN_TTS_SERVICE:
case 'online-ai':
from modules.text_to_speech.online.elevenlabs_tts import ElevenLabsTTS
from elevenlabs.types import VoiceSettings
main_tts = ElevenLabsTTS(
voice=TTS_ELEVENLABS_VOICE_MODEL,
voice_settings=VoiceSettings(
stability=0.5,
similarity_boost=0.75,
style=0.0,
use_speaker_boost=False,
),
rate=1.0,
)
case 'offline-ai':
from modules.text_to_speech.offline.coqui_ai_tts import CoquiTTS
main_tts = CoquiTTS(
voice_samples_path=TTS_COQUI_VOICE_SAMPLES_PATH,
rate=1.25,
)
case 'windows':
main_tts = WindowsTTS(
voice_idx=0,
rate=1.5,
volume=1.00
)
case _:
main_tts = None
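# With an unrecognized service name, main_tts stays None; the coach below is still
# constructed with a Windows TTS fallback.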
# Start the performance timer
perf_timer = PerfTimer(
default_precision=3
)
# Set up the AI-based coach with the necessary services
coach = AICoach(
# Start a virtual camera for taking screenshots of the game
monitor_cam=MonitorCam(
device_idx=GPU_IDX,
output_idx=MONITOR_IDX,
),
# Initialize the LLM model interface
chatbot=LLMChatbot(
chat_model=init_chat_model(
model_provider=MODEL_PROVIDER,
model=MODEL_NAME,
),
keep_back_forth_cycles=KEEP_BACK_FORTH_HISTORY,
),
# Set up the TTS services, always using Windows TTS as the fallback
main_tts=main_tts,
fallback_tts=WindowsTTS(
voice_idx=0,
rate=1.5,
volume=1.00
)
)
# Load the list of prompts and fetch the system and user prompts for the given config name
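# The prompts file is expected to map config names to objects with a 'system-prompt'
# string and a 'user-prompts' list of strings, e.g. (illustrative structure only):
# {
#     "default-coach": {
#         "system-prompt": "You are a concise gameplay coach...",
#         "user-prompts": ["Point out one mistake and one concrete improvement."]
#     }
# }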
with open(PROMPTS_LIST_PATH, 'r') as fp:
PROMPTS_LIST = json.load(fp)
SELECTED_PROMPT_CONFIG = PROMPTS_LIST[PROMPT_CONFIG_NAME]
coach.setup_prompts(
system_prompt=SELECTED_PROMPT_CONFIG['system-prompt'],
input_prompts=[
*[HumanTextMessage(message) for message in SELECTED_PROMPT_CONFIG['user-prompts']],
HumanImageMessage('{image_input}')
],
)
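# NOTE: '{image_input}' above is a template placeholder; it is filled with the base64
# screenshot on every cycle via coach.ask_chatbot(image_input=...).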
# Create the snapshot overlay to display the screenshot and bot response
snap_overlay = SnapshotOverlay()
snap_overlay.hide_ui()
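# Keep the overlay hidden until the first response arrives so it never shows up in a capture.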
print('Starting coach with the following configuration...')
CHATBOT_SETTINGS = {
'model': {
'provider': MODEL_PROVIDER,
'name': MODEL_NAME,
'params': coach.fetch_chatbot_params(),
},
'prompt': {
'list_path': PROMPTS_LIST_PATH,
'config_name': PROMPT_CONFIG_NAME,
'config': SELECTED_PROMPT_CONFIG,
}
}
print(json.dumps(CHATBOT_SETTINGS, indent=2))
print()
coach.speak_text('Ready to coach!')
try:
while True:
print('Starting new cycle...')
# Start a new snapshot in the session
perf_timer.reset()
# Make sure to hide the overlay before taking a screenshot
snap_overlay.hide_ui()
# NOTE: Add a short delay so the overlay (still showing the previous snapshot) is fully hidden before the next screenshot is taken
time.sleep(0.200)
# Grab a screencap of the current gameplay
monitor_frame = coach.capture_monitor_screenshot()
perf_timer.print_elapsed_time_and_reset('Desktop capture from GPU')
if not LOW_RESOLUTION_MODE:
# Reduce the smallest side length to 768 (otherwise OpenAI would do it anyway) and convert the screencap to base 64
# - See https://platform.openai.com/docs/guides/vision#calculating-costs for more info
resize_kwargs = {'height': 768}
else:
# Reduce the largest side length to 512 to ensure that the image isn't split into tiles internally by the LLM
# - See https://platform.openai.com/docs/guides/vision#calculating-costs for more info
resize_kwargs = {'width': 512}
resized_frame = image_proc.resize_image_min_length(monitor_frame, **resize_kwargs)
monitor_frame_b64 = image_proc.numpy_to_base64(resized_frame)
perf_timer.print_elapsed_time_and_reset('Convert screencap to base 64')
# Ask the chatbot to process the image with the given prompts
bot_response = coach.ask_chatbot(image_input=monitor_frame_b64)
perf_timer.print_elapsed_time_and_reset('AI Gameplay Analysis')
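# bot_response is a LangChain chat message: .content holds the reply text and
# .response_metadata carries provider details such as token usage (printed below).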
# Set the snapshot data and show the snapshot overlay
snap_overlay.set_data(
screencap_img=resized_frame,
response_text=bot_response.content, # type: ignore
)
print('LLM response metadata (for token usage):')
print(json.dumps(bot_response.response_metadata, indent=2))
snap_overlay.show_ui()
snap_overlay.update_ui()
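# The overlay is refreshed before speaking so the new snapshot is visible during TTS playback.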
# Speak the bot response via Text-to-Speech
coach.speak_text(bot_response.content) # type: ignore
perf_timer.print_elapsed_time_and_reset('TTS voice synthesis')
print()
except KeyboardInterrupt:
print('Exiting due to user request...')
# Close the snapshot overlay
snap_overlay.root.destroy()
# Stop the coach and cleanup associated resources
coach.cleanup()