# whisper_main.py
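"""AI2AI: two AI personas (OpenAI GPT and Google Gemini) hold a spoken,
turn-taking conversation, with a Tkinter GUI transcript, webcam-based wave
detection for human interjections, and OpenAI text-to-speech playback."""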
import argparse
import os
import time
import speech_recognition as sr
import threading
import collections
import pygame
import google.generativeai as genai
from PIL import Image
import openai
import re
from tempfile import NamedTemporaryFile
import tempfile
import random
from queue import Queue
import cv2
from PIL import PngImagePlugin
import tkinter as tk
from tkinter import scrolledtext
import signal
import sys
from playsound import playsound
from tkVideoPlayer import TkinterVideo
from elevenlabs.client import ElevenLabs
from elevenlabs import stream
# Configuration constants
ENERGY_THRESHOLD = 400 # minimum audio energy to consider for recording
PAUSE_THRESHOLD = 1.5 # seconds of non-speaking audio before a phrase is considered complete
SAVE_HISTORY_LAST_N = 6 # Number of last messages to save in the conversation history
PLAYBACK_DELAY = random.uniform(0.75, 2.5) # Delay in seconds between playing back pre-generated audio files; the random range makes pacing more human-like. Reduce it to speed up the conversation.
FIRST_SPEAKER = 'GPT' # The first speaker in the conversation
HUMAN_INTERACTION_LIMIT = random.uniform(2, 3) # Number of interactions with a human before resuming the AI conversation
TOPIC_SWITCH_THRESHOLD = random.uniform(10, 15) # Number of messages before switching the topic of conversation
MAX_AUDIO_QUEUE_SIZE = 2 # Maximum number of audio files to keep in the queue for playback
MAX_RESPONSE_QUEUE_SIZE = 2 # Maximum number of responses to keep in the queue for speech synthesis
CAMERA_PORT = 1 # Port number for the webcam
# Configure APIs
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or input("Enter OpenAI API Key: ")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or input("Enter Google API Key: ")
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY") or input("Enter Eleven Labs API Key: ")
openai.api_key = OPENAI_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
# Wake word patterns: match greetings like "hey GPT" / "hi, chatgpt" and "hey Gemini" / "hi, google"
wake_word_pattern_gpt = re.compile(r'h(ey|i)?,?\s+(gpt|chatgpt)', re.IGNORECASE)
wake_word_pattern_gemini = re.compile(r'h(ey|i)?,?\s+(gemini|google)', re.IGNORECASE)
class AI2AI:
def __init__(self, root):
pygame.mixer.init()
self.setup_gui(root)
self.create_speak_popup()
self.audio_stop_event = threading.Event()
self.gemini_client = genai.GenerativeModel('gemini-pro')
self.openai_client = openai.OpenAI()
self.sr_recognizer = sr.Recognizer()
        self.sr_recognizer.energy_threshold = ENERGY_THRESHOLD
        self.sr_recognizer.dynamic_energy_threshold = True
self.sr_recognizer.pause_threshold = PAUSE_THRESHOLD
self.sr_microphone = sr.Microphone()
self.chat_history = collections.deque(maxlen=SAVE_HISTORY_LAST_N)
self.active_conversation = True
self.next_speaker = FIRST_SPEAKER # Determines who speaks next in the AI conversation
self.interact_with_human = False
self.topic = args.topic
self.next_human = False
self.resume_conversation = False
self.detected_wave = False
self.stop_video_thread = False
self.topic_msg_count = 0
self.GPT_model = "gpt-3.5-turbo"
# self.GPT_model = "gpt-4-0125-preview"
self.current_audio_thread = None
self.text_queue = Queue() # No max size, handles text for speech synthesis
        self.audio_queue = Queue(maxsize=MAX_AUDIO_QUEUE_SIZE) # Audio files ready for playback
self.transcription = ""
self.gui_update_queue = Queue()
self.speech_synthesis_complete = True
self.vlm_model = genai.GenerativeModel('gemini-pro-vision')
self.human_appearance = ""
def start_conversation(self):
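        """Spawn the daemon worker threads that drive the conversation, plus a monitor to restart them."""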
threading.Thread(target=self.conversation_loop, daemon=True).start()
self.human_detection_thread = threading.Thread(target=self.human_detection_worker, daemon=True)
self.human_detection_thread.start()
self.speech_synthesis_thread = threading.Thread(target=self.speech_synthesis_worker, daemon=True)
self.speech_synthesis_thread.start()
self.playback_thread = threading.Thread(target=self.playback_worker, daemon=True)
self.playback_thread.start()
threading.Thread(target=self.monitor_threads, daemon=True).start()
def monitor_threads(self):
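        """Restart any worker thread that has exited while no human interaction is in progress."""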
while True:
if not self.human_detection_thread.is_alive() and not self.interact_with_human and not self.detected_wave:
self.human_detection_thread = threading.Thread(target=self.human_detection_worker, daemon=True)
self.human_detection_thread.start()
if not self.speech_synthesis_thread.is_alive() and not self.interact_with_human and not self.detected_wave:
self.speech_synthesis_thread = threading.Thread(target=self.speech_synthesis_worker, daemon=True)
self.speech_synthesis_thread.start()
if not self.playback_thread.is_alive() and not self.interact_with_human and not self.detected_wave:
self.playback_thread = threading.Thread(target=self.playback_worker, daemon=True)
self.playback_thread.start()
time.sleep(0.1)
#################### GUI ####################
def setup_gui(self, root):
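        """Build the Tkinter window: custom title bar, scrollable chat area, and a two-box status bar."""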
self.root = root
self.root.title("AI Conversation")
# Define fonts and colors before using them
self.font_family = "Poppins"
self.font_size = 14
self.gpt_color = "#b7e1fc" # Light blue
self.gemini_color = "#dde5b6" # Light green
self.human_color = "#FAD7A0" # Light orange
self.text_color = "#0a0908"
self.bg_color = "#FFFBE6"
# Set window size and position
window_width = 600
window_height = 400
screen_width = self.root.winfo_screenwidth()
screen_height = self.root.winfo_screenheight()
center_x = int(screen_width / 2 - window_width / 2)
center_y = int(screen_height / 2 - window_height / 2)
self.root.geometry(f'{window_width}x{window_height}+{center_x}+{center_y}')
self.root.config(bg='#003747') # Dark shade of blue as border/background
# Custom title bar
title_bar = tk.Frame(self.root, bg='#2C3E50', relief='raised', bd=2)
title_bar.pack(fill='x')
# Flexible space before the title label to center it
left_space = tk.Frame(title_bar, bg='#2C3E50', width=200)
left_space.pack(side='left', fill='x', expand=True)
# Title label centered
title_label = tk.Label(title_bar, text="AI Conversation", bg='#2C3E50', fg='#ECF0F1', font=(self.font_family, 12, 'bold'))
title_label.pack(side='left', expand=False)
# Flexible space after the title label to keep it centered
right_space = tk.Frame(title_bar, bg='#2C3E50', width=200)
right_space.pack(side='left', fill='x', expand=True)
# Close button on title bar, packed last to appear on the right
close_button = tk.Button(title_bar, text='X', bg='#2C3E50', fg='#ECF0F1', command=self.root.destroy)
close_button.pack(side='right')
# Setup chat display area within a frame for padding
chat_frame = tk.Frame(self.root, bg=self.bg_color)
chat_frame.pack(padx=10, pady=10, expand=True, fill='both')
# Status frame setup
status_frame = tk.Frame(self.root, bg='grey') # Main container for the status bar
status_frame.pack(side=tk.BOTTOM, fill=tk.X, padx=10, pady=10) # Apply padding to match the overall UI design
# Left status box with darker background color
self.status_left_frame = tk.Frame(status_frame, borderwidth=2, relief='groove', bg='#333940')
self.status_left_frame.pack(side='left', fill=tk.BOTH, expand=True)
self.status_left = tk.Label(self.status_left_frame, text="The AIs are conversing..", bg='#333940', fg='#FFFFFF',
anchor='center', font=('Helvetica', 12, 'bold'))
self.status_left.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # Padding to ensure text is centered
# Right status box with darker background color
self.status_right_frame = tk.Frame(status_frame, borderwidth=2, relief='groove', bg='#2B303B')
self.status_right_frame.pack(side='left', fill=tk.BOTH, expand=True)
self.status_right = tk.Label(self.status_right_frame, text="Waiting for status...", bg='#2B303B', fg='#FFFFFF',
anchor='center', font=('Helvetica', 12, 'bold'))
self.status_right.pack(fill=tk.BOTH, expand=True, padx=5, pady=5) # Padding to ensure text is centered
self.chat_display_area = scrolledtext.ScrolledText(chat_frame, wrap=tk.WORD, width=80, height=20,
font=(self.font_family, self.font_size), padx=15, pady=15)
self.chat_display_area.pack(expand=True, fill='both')
self.chat_display_area.config(state='disabled', bg=self.bg_color) # Matching background
def display_message_gui(self, message, sender="ai"):
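        """Append a color-coded message (GPT, Gemini, or Human) to the chat display."""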
self.chat_display_area.config(state='normal')
# Define sender's display name and tag for styling
sender_name = {"gpt": "GPT", "gemini": "Gemini", "human": "Human"}.get(sender, "Unknown")
tag = sender
bg_color = {"gpt": self.gpt_color, "gemini": self.gemini_color, "human": self.human_color}.get(sender, "#FFFFFF")
# Configure tags for sender's name and message body
self.chat_display_area.tag_configure(sender + "_name", font=(self.font_family, self.font_size, "bold"),
foreground=self.text_color, background=bg_color,
spacing1=4, spacing3=4, lmargin1=20, lmargin2=20, rmargin=20)
self.chat_display_area.tag_configure(tag, background=bg_color, foreground=self.text_color,
font=(self.font_family, self.font_size), lmargin1=20, lmargin2=20,
rmargin=20, spacing3=4, relief='flat', wrap='word')
# Insert a visual separator for a new message if desired
separator_tag = "separator"
self.chat_display_area.tag_configure(separator_tag, spacing1=10)
self.chat_display_area.insert(tk.END, "\n", separator_tag)
# Insert sender's name with a dedicated tag for background color
self.chat_display_area.insert(tk.END, sender_name + ": ", sender + "_name")
# Insert the message body with its own tag
self.chat_display_area.insert(tk.END, message + "\n\n", tag)
self.chat_display_area.config(state='disabled')
self.chat_display_area.see(tk.END)
def update_left_status(self, message, bg_color='lightgrey'):
self.status_left.config(text=message, bg=bg_color)
def update_right_status(self, message, bg_color='darkgrey'):
self.status_right.config(text=message, bg=bg_color)
def enqueue_gui_update(self, message, sender="ai"):
self.gui_update_queue.put((message, sender))
def process_gui_updates(self):
while not self.gui_update_queue.empty():
message, sender = self.gui_update_queue.get()
self.display_message_gui(message, sender)
self.root.after(100, self.process_gui_updates)
def create_speak_popup(self):
# Create a borderless Toplevel window for the video
self.speak_popup = tk.Toplevel(self.root, bg='black')
self.speak_popup.overrideredirect(True) # Makes the window borderless
# Set initial size (this might be adjusted based on the video aspect ratio)
video_width, video_height = 640, 640 # Example size, adjust as needed
self.speak_popup.geometry(f"{video_width}x{video_height}")
# Initialize the video player in the popup window
self.videoplayer = TkinterVideo(master=self.speak_popup, scaled=True)
# Load the video file
self.videoplayer.load(r"./mic_video_edited.mov")
self.videoplayer.pack(expand=True, fill="both")
# Initially, hide the popup
self.speak_popup.withdraw()
def show_speak_popup(self, show=True):
if show:
# Calculate the position to center the popup over the root window
root_x = self.root.winfo_x()
root_y = self.root.winfo_y()
root_width = self.root.winfo_width()
root_height = self.root.winfo_height()
popup_width = self.speak_popup.winfo_width()
popup_height = self.speak_popup.winfo_height()
centered_x = root_x + (root_width - popup_width) // 2
centered_y = root_y + (root_height - popup_height) // 2
# Update the popup's geometry to center it
self.speak_popup.geometry(f"+{centered_x}+{centered_y}")
# Show the popup and play the video
self.speak_popup.deiconify() # Show the window
self.videoplayer.play() # Start playing the video
else:
# Stop the video and hide the popup
self.videoplayer.pause() # Pause or stop the video
self.speak_popup.withdraw() # Hide the window
#################### VISION ####################
def human_detection_worker(self):
"""Continuously captures images from a webcam and checks for human interaction using VLM."""
while self.active_conversation:
if self.interact_with_human or self.detected_wave:
break
img = self.capture_image_from_webcam()
if img:
self.send_image_to_vlm("Check if there is a person trying to interact with you in the image. Specifically, if there is a waving gesture, return 'YES', otherwise return 'NO'. If you return 'YES', also include a description of the person (other than the fact that they are waving) within curly braces.", img)
            time.sleep(0.25) # Delay between captures to avoid overwhelming the API and the webcam
def capture_image_from_webcam(self):
"""Capture an image from the webcam and return it as a PIL image."""
cap = cv2.VideoCapture(CAMERA_PORT)
ret, frame = cap.read()
cap.release()
if not ret:
print("Failed to capture image")
return None
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
pil_img = Image.fromarray(frame_rgb)
return pil_img
def send_image_to_vlm(self, input_text, img):
"""Send the captured image along with a prompt to Gemini Pro Vision and check for human interaction."""
try:
gemini_response = self.vlm_model.generate_content([input_text, img], stream=False)
            # Extract the person description, if the model returned one in curly braces
            match = re.search(r'\{(.*?)\}', gemini_response.text)
            if match:
                self.human_appearance = match.group(1)
if "YES" in gemini_response.text:
self.clear_queues()
self.interact_with_human = True
self.detected_wave = True
self.update_left_status("A human is detected waving. Initiating interaction...")
except Exception as e:
print("Failed to send image to Gemini Pro Vision:", e)
def conversation_loop(self):
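        """Main loop: alternate AI turns, rotate the topic periodically, and hand off to the human when a wave is detected."""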
self.begin_conversation()
while self.active_conversation:
if self.text_queue.qsize() < MAX_RESPONSE_QUEUE_SIZE:
                if self.topic_msg_count > TOPIC_SWITCH_THRESHOLD:
self.topic = self.get_random_topic()
self.topic_msg_count = 0
self.moderator_call("Change the topic of conversation to: '" + self.topic + "' naturally and continue the conversation. Your next response must flow into the new topic casually.")
self.update_right_status("The moderator has called for a topic change.")
if self.interact_with_human:
self.clear_queues()
print("Human appearance: ", self.human_appearance, "\n")
self.moderator_call("A human is trying to interact with us. Pause the conversation and respond to the human. Say hi to the human."+ self.human_appearance + "Only use the human's appearance in your greeting very gently if appropriate")
self.detected_wave = False
self.ai_call()
self.next_human = True
self.human_to_ai_conversation()
self.interact_with_human = False
elif not self.interact_with_human:
self.update_left_status("The AIs are conversing...")
self.ai_call()
self.topic_msg_count += 1
time.sleep(0.1)
def human_to_ai_conversation(self):
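        """Alternate turns between the human (via speech recognition) and the AIs until the interaction limit or a listen timeout is reached."""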
human_interaction_count = 0
while self.active_conversation and self.interact_with_human:
if self.next_human and human_interaction_count < HUMAN_INTERACTION_LIMIT: # Human's turn
with self.sr_microphone as source:
self.show_speak_popup(True)
self.sr_recognizer.adjust_for_ambient_noise(source)
print("Listening for human speech... Speak now\n")
self.update_right_status("Listening for human speech... Speak now")
try:
audio = self.sr_recognizer.listen(source, timeout=10, phrase_time_limit=None)
# self.transcription = self.sr_recognizer.recognize_whisper(audio, model="base.en", language="English")
                        self.transcription = self.sr_recognizer.recognize_google(audio, language="en-US")  # recognize_google expects a BCP-47 tag like "en-US", not "English"
self.chat_history.append("Human: " + self.transcription + "\n")
self.display_response(self.transcription)
self.update_right_status("Human speech detected. Processing...")
self.show_speak_popup(False)
except sr.WaitTimeoutError:
print("No speech detected within the time limit.")
self.show_speak_popup(False)
self.interact_with_human = False # Stop interaction if no speech is detected within the time limit
self.next_human = False
self.clear_queues()
self.moderator_call("The human has stopped interacting with you. Say goodbye to the human and continue the conversation with the AI.")
self.update_right_status("No speech detected within the time limit. Resuming AI conversation...")
except sr.UnknownValueError:
print("Google Web Speech API could not understand the audio.")
self.update_right_status("Audio could not be understood. Please try again.")
self.show_speak_popup(False)
except sr.RequestError as e:
print(f"Could not request results from Google Web Speech API; {e}")
self.show_speak_popup(False)
self.next_human = False
human_interaction_count += 1
else:
self.ai_call()
self.next_human = True
if human_interaction_count >= HUMAN_INTERACTION_LIMIT:
if not self.next_human:
self.ai_call()
self.interact_with_human = False
self.next_human = False
self.clear_queues()
self.moderator_call("The human has stopped interacting with you. Say goodbye to the human and continue the conversation with the AI.")
self.update_right_status("Human interaction limit reached. Resuming AI conversation...")
return
def clear_queues(self):
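        """Flush the pending text and audio queues and stop any audio that is currently playing."""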
with self.text_queue.mutex:
self.text_queue.queue.clear()
# while not self.speech_synthesis_complete: # Wait for the current AI response to finish
# time.sleep(0.1)
with self.audio_queue.mutex:
self.audio_queue.queue.clear()
self.stop_audio()
def moderator_call(self, moderator_prompt):
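        """Inject a moderator instruction into the chat history to steer the next responses."""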
self.chat_history.append("Silent Moderator: " + moderator_prompt + "\n")
print("Moderator: " + moderator_prompt + "\n")
def display_response(self, response):
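        """Route a response: speak it directly during human interaction, queue it for synthesis during AI-to-AI chat, or display human speech in the GUI."""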
if self.interact_with_human and not self.next_human:
print(self.next_speaker + ": " + response + "\n")
voice = "onyx" if self.next_speaker == 'GPT' else "nova"
self.speak(response, voice)
elif not self.interact_with_human and not self.next_human:
print(self.next_speaker + ": " + response + "\n")
voice = "onyx" if self.next_speaker == 'GPT' else "nova"
            # Queue the response for asynchronous speech synthesis and playback
self.text_queue.put((response, voice, self.next_speaker.lower()))
self.next_speaker = 'GPT' if self.next_speaker == 'Gemini' else 'Gemini'
else:
print("Human: " + response + "\n")
self.enqueue_gui_update(response, "human")
self.chat_history.append("Human: " + response + "\n")
self.next_human = False
#################### AI to AI ####################
def begin_conversation(self):
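        """Open the conversation with an initial greeting from whichever AI speaks first."""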
if self.topic == "default":
self.topic = self.get_random_topic()
if self.next_speaker == 'GPT':
initial_prompt = f"As GPT, an opinionated AI personality, you are starting a casual and human-like conversation with Google's Gemini about an interesting topic. The topic of the conversation is {self.topic}. Say hi to Gemini."
self.display_response(self.call_gpt(initial_prompt))
elif self.next_speaker == 'Gemini':
initial_prompt = f"As Google Gemini, an opinionated AI personality, you are starting a casual and human-like conversation with OpenAI's GPT about an interesting topic. The topic of the conversation is {self.topic}. Say hi to GPT."
self.display_response(self.call_gemini(initial_prompt))
def get_random_topic(self):
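        """Pick a random topic from topics.txt, falling back to a default if the file is missing."""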
filename = "topics.txt"
try:
with open(filename, 'r', encoding='utf-8') as file: # Specify encoding here
topics = file.readlines() # Read all lines into a list
topics = [topic.strip() for topic in topics] # Remove any trailing newlines or spaces
return random.choice(topics) if topics else "default topic"
except FileNotFoundError:
print(f"Error: The file '{filename}' was not found.")
return "default topic"
def ai_call(self):
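        """Generate the next turn for whichever AI persona speaks next."""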
if not self.detected_wave:
if self.next_speaker == 'Gemini':
query = "Keep the conversation flowing by embodying the persona of Gemini: An emotional AI with strong opinions, unique perspectives and humanized responses. Be casual and humanize your responses. Bring variance to the conversation with disagreements, counterpoints, and tangential thoughts. Your response should be your persona's relpy to the last message. Keep your replies direct, short and sharp. Here is the conversation history:" + "\n".join(self.chat_history)
self.display_response(self.call_gemini(query))
else:
query = "Keep the conversation flowing by embodying the persona of GPT: An emotional AI with strong opinions, unique perspectives and humanized responses. Be casual and humanize your responses. Bring variance to the conversation with disagreements, counterpoints, and tangential thoughts. Your response should be your persona's relpy to the last message. Keep your replies direct, short and sharp. Here is the conversation history:" + "\n".join(self.chat_history)
self.display_response(self.call_gpt(query))
def call_gpt(self, user_prompt):
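        """Request GPT's next turn from the OpenAI chat API and append it to the chat history."""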
try:
completion = self.openai_client.chat.completions.create(
model=self.GPT_model,
messages=[
{"role": "system", "content": "You are an GPT, an intelligent and opinionated AI persona, skilled in engaging in meaningful conversations with both humans and other AI. Your responses should be interesting, unique and considerate of the conversational context and topic at hand. Be creative and spontaneuous in your replies while offering variance. Bring variance to the conversation with disagreements, counterpoints, and tangential thoughts. Your responses should be humanized. You will only reply to the last message in the conversation."},
{"role": "user", "content": user_prompt}
],
max_tokens=100,
temperature=0.2,
top_p=1.0,
frequency_penalty=0.5,
presence_penalty=0.5
)
            # Extract the assistant's message text from the completion
            if completion.choices and completion.choices[0].message:
                text_output = completion.choices[0].message.content
cleaned_response = re.sub(r'(\*\*)?(Gemini|GPT|Moderator|Human):\s*\2?\s*', '', text_output) # Remove the speaker label if present
self.chat_history.append("GPT: " + cleaned_response + "\n")
return cleaned_response
else:
print("No response from GPT.")
return ""
except Exception as e:
print(f"Error calling GPT: {e}")
return ""
def call_gemini(self, prompt):
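        """Request Gemini's next turn, with safety filters set to BLOCK_NONE, and append it to the chat history."""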
try:
safety_settings = [
{
"category": "HARM_CATEGORY_DANGEROUS",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},
]
response = self.gemini_client.generate_content(prompt, safety_settings=safety_settings, generation_config=genai.types.GenerationConfig(
max_output_tokens=100,
temperature=0.2))
gemini_response_text = response.text
cleaned_response = re.sub(r'(\*\*)?(Gemini|GPT|Moderator|Human):\s*\2?\s*', '', gemini_response_text) # Remove the speaker label if present
self.chat_history.append("Gemini: " + cleaned_response + "\n")
return cleaned_response
except Exception as e:
print(f"Error calling Gemini: {e}")
return ""
#################### AUDIO PROCESSING ####################
def speak(self, text, voice):
"""Directly convert text to speech and play it."""
try:
response = self.openai_client.audio.speech.create(
model="tts-1",
voice=voice,
input=text
)
            # Create a closed temporary file, then stream the audio into it by
            # path; a closed file avoids permission errors when playsound opens it
            with NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio_file:
                temp_audio_file_path = temp_audio_file.name
            response.stream_to_file(temp_audio_file_path)
            playsound(temp_audio_file_path)
            os.remove(temp_audio_file_path)
        except Exception as e:
            print(f"Error in direct text-to-speech playback: {e}")
def speech_synthesis_worker(self):
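        """Background worker: turn queued text into speech files while no human interaction is in progress."""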
while True:
if self.interact_with_human or self.detected_wave:
break
if not self.text_queue.empty() and self.audio_queue.qsize() < MAX_AUDIO_QUEUE_SIZE:
self.speech_synthesis_complete = False
text, voice, sender = self.text_queue.get() # Unpack text, voice, and sender
# Generate audio and add to the queue
audio_path = self.generate_speech_to_file(text, voice) # Pass voice to the method
if audio_path is not None:
self.audio_queue.put((audio_path, text, sender)) # Adjusted to remove unused unpacking
self.speech_synthesis_complete = True
time.sleep(0.1)
def generate_speech_to_file(self, text, voice): # Generate speech from text and save to a temporary file
try:
response = self.openai_client.audio.speech.create(
model="tts-1",
voice=voice,
input=text
)
            # Create the temp file closed, then stream into it by path so we
            # don't hold two open handles on the same file
            with NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio_file:
                audio_file_path = temp_audio_file.name
            response.stream_to_file(audio_file_path)
            return audio_file_path
except Exception as e:
print(f"Warning in text-to-speech conversion: {e}")
return None
def playback_worker(self): # Worker to play audio files from the queue
while True:
if self.interact_with_human or self.detected_wave:
break
if not self.audio_queue.empty():
try:
audio_path, text, sender = self.audio_queue.get()
self.play_audio_file(audio_path, text, sender)
except Exception as e:
print(f"Error playing audio file: {e}")
time.sleep(0.1) # Sleep briefly if the queue is empty to reduce CPU usage
def start_audio(self, audio_path):
self.audio_stop_event.clear() # Reset the event to allow playing
pygame.mixer.music.load(audio_path)
pygame.mixer.music.play()
def stop_audio(self):
self.audio_stop_event.set() # Signal to stop
pygame.mixer.music.stop()
if hasattr(pygame.mixer.music, 'unload'):
pygame.mixer.music.unload()
def check_audio_stop(self):
return self.audio_stop_event.is_set()
def play_audio_file(self, file_path, text, sender):
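        """Show the message in the GUI and play its audio, honoring the stop event."""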
# voice = "onyx" if sender == 'gpt' else "nova"
self.enqueue_gui_update(text, sender)
# self.speak(text, voice, sender)
self.start_audio(file_path)
while pygame.mixer.music.get_busy():
if self.check_audio_stop():
self.stop_audio() # Stop audio if the event is set
break
time.sleep(0.1)
def signal_handler(sig, frame):
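    """Handle Ctrl+C: stop the conversation loop and tear down the GUI."""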
print('Exiting...')
ai_conversation.active_conversation = False
root.quit() # This will break the root.mainloop() blocking call
root.destroy() # Ensure the GUI is properly closed
sys.exit(0)
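# Example invocation (--topic is the only command-line flag defined below):
#   python whisper_main.py --topic "the future of AI"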
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="AI2AI Conversation Loop")
parser.add_argument("--topic", type=str, default="default", help="Initial topic for the conversation")
args = parser.parse_args()
root = tk.Tk()
ai_conversation = AI2AI(root)
threading.Thread(target=ai_conversation.start_conversation, daemon=True).start()
ai_conversation.process_gui_updates()
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
root.mainloop() # Start the GUI event loop