Added configuration stanzas (#22)

* Added configuration stanzas for elevenlabs * Fixed broken reference for * Added configuration stanzas for whisper tts * Added configuration stanzas for whisper stt * Fixed dupe keys * We now record elevenlabs outputs properly
Und3rf10w · Dec 20, 2023 · 3d6b258 · 3d6b258
1 parent 26f3062
commit 3d6b258
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 33 deletions.
diff --git a/config.example.toml b/config.example.toml
@@ -8,9 +8,27 @@ recordings_directory = "recordings"
 
 [openai]
 # Set your openai api key here
-api_key = "sk...."
+openai_api_key = "sk...."
+
+[openai.whisper]
 # Set the path to `mpv.exe` if it's not already in your PATH. Only matters if `tts_engine` == `whisper`
 # mpv_path = mpv.exe
+# Which openai whisper voice ID to use, defaults to "nova" if not set
+whisper_voice_id = "nova"
+# Which openai TTS voice model to use ("tts-1" or "tts-1-hd"), defaults to "tts-1" if not set
+whisper_voice_model = "tts-1"
+# Which openai whisper STT (transcription) engine to use, defaults to `whisper-1` if not set
+whisper_engine = "whisper-1"
 
 [elevenlabs]
-api_key = ""
+eleven_api_key = ""
+# If not set, it'll use the default voice
+elevenlabs_voice_id = ""
+# If not set it'll use the default stability (0.5)
+elevenlabs_stability = 0.5
+# If not set it'll use the default similarity_boost (0.75)
+elevenlabs_similarity_boost = 0.75
+# If not set it'll use the default style (0). You probably want this at zero
+elevenlabs_style = 0
+# If not set to "True" it'll use the default speaker_boost setting (False)
+elevenlabs_use_speaker_boost="False"
diff --git a/pdm.lock b/pdm.lock
diff --git a/src/openjanus/app/config.py b/src/openjanus/app/config.py
@@ -43,8 +43,8 @@ def set_openai_api_key() -> str:
         else:
             LOGGER.debug("Setting openai API key from config file")
             config = load_config()
-            environ["OPENAI_API_KEY"] = config["openai"]["api_key"]
-            return config["openai"]["api_key"]
+            environ["OPENAI_API_KEY"] = config["openai"]["openai_api_key"]
+            return config["openai"]["openai_api_key"]
     except KeyError:
         LOGGER.error("The openai API key was not found in the environment variable or the config file")
         raise ApiKeyNotSetException("OpenAI")
@@ -59,8 +59,8 @@ def set_eleven_api_key() -> str:
         else:
             LOGGER.debug("Setting elevenlabs API key from config file")
             config = load_config()
-            environ["ELEVEN_API_KEY"] = config["elevenlabs"]["api_key"]
-            return config["elevenlabs"]["api_key"]
+            environ["ELEVEN_API_KEY"] = config["elevenlabs"]["eleven_api_key"]
+            return config["elevenlabs"]["eleven_api_key"]
     except KeyError:
         LOGGER.error("The elevenlabs API key was not found in the environment variable or the config file")
         raise ApiKeyNotSetException("Elevenlabs")
@@ -74,14 +74,14 @@ def check_mpv_path() -> str:
         try:
             LOGGER.debug("Setting mpv path from config file")
             config = load_config()
-            if not path.isfile(config["openjanus"]["mpv_path"]):
+            if not path.isfile(config["openai"]["whisper"]["mpv_path"]):
                 LOGGER.error("mpv.exe was not found")
                 raise TtsMpvNotFoundException()
             else:
-                return config["openjanus"]["mpv_path"]
+                return config["openai"]["whisper"]["mpv_path"]
         except KeyError:
             LOGGER.error("The mpv path was not found in the environment variable or the config file")
-            raise ConfigKeyNotFound("openjanus/mpv_path")
+            raise ConfigKeyNotFound("openai/whisper/mpv_path")
         except FileNotFoundError:
             LOGGER.error("mpv.exe was not found")
             raise TtsMpvNotFoundException()
@@ -119,7 +119,7 @@ def get_recordings_dir() -> str:
         LOGGER.debug("Getting recordings directory from config file")
         config = load_config()
         recordings_dir = config["openjanus"]["recordings_directory"]
-        return path.relpath(recordings_dir)
+        return path.relpath(recordings_dir) + "/"
     except KeyError:
         LOGGER.error("The recordings directory was not found in the environment variable or the config file")
         raise ConfigKeyNotFound("openjanus/recordings_directory")
@@ -134,6 +134,57 @@ def ensure_recordings_dir_exists():
         except Exception as e:
             LOGGER.error(f"Failed to create the {recordings_dir} directory", exc_info=e)
             raise DirectoryCreationException(f"Failed to create the {recordings_dir} directory") from e
+
+def get_elevenlabs_config() -> Dict[str, Any]:
+    """Get the elevenlabs config, filling in defaults for settings that are missing or empty.
+
+    Raises ConfigKeyNotFound if the `[elevenlabs]` table itself is absent from the config.
+    """
+    try:
+        LOGGER.debug("Getting elevenlabs config from config file")
+        config = load_config()
+        set_eleven_api_key()
+        eleven = config["elevenlabs"]
+    except KeyError:
+        LOGGER.error("The elevenlabs config was not found in the environment variable or the config file")
+        raise ConfigKeyNotFound("elevenlabs")
+    # Only a missing key or "" counts as unset: 0 is a valid stability/similarity/style value
+    if eleven.get("elevenlabs_voice_id") in (None, ""):
+        LOGGER.warning("The elevenlabs voice was not set, using the default voice")
+        from openjanus.tts.elevenlabs.async_patch import DEFAULT_VOICE
+        eleven["elevenlabs_voice_id"] = DEFAULT_VOICE.voice_id  # store the id string, not the Voice object
+    numeric_defaults = (("elevenlabs_stability", 0.5, "stability"), ("elevenlabs_similarity_boost", 0.75, "similarity boost"), ("elevenlabs_style", 0, "style"))
+    for key, default, label in numeric_defaults:
+        if eleven.get(key) in (None, ""):
+            LOGGER.warning(f"The elevenlabs {label} was not set, using the default {label}")
+            eleven[key] = default
+    boost = eleven.get("elevenlabs_use_speaker_boost")  # may be a TOML boolean or a "True"/"False" string
+    if not isinstance(boost, bool):
+        normalized = str(boost).strip().lower()
+        if boost not in (None, "") and normalized not in ("true", "false"):
+            LOGGER.warning("The elevenlabs use speaker boost was misconfigured, using the default use speaker boost")
+        boost = normalized == "true"
+    eleven["elevenlabs_use_speaker_boost"] = boost
+    return eleven
+
+def get_openai_whisper_config() -> Dict[str, Any]:
+    """Get the openai whisper config, filling in defaults for settings that are missing or empty.
+
+    Raises ConfigKeyNotFound if the `[openai.whisper]` table is absent from the config.
+    """
+    try:
+        LOGGER.debug("Getting openai whisper config from config file")
+        whisper = load_config()["openai"]["whisper"]
+    except KeyError:
+        LOGGER.error("The openai whisper config was not found in the environment variable or the config file")
+        raise ConfigKeyNotFound("openai/whisper")
+    # (key, default value, label used in the warning) for every optional setting
+    defaults = (("whisper_voice_id", "nova", "voice id"), ("whisper_voice_model", "tts-1", "voice model"), ("whisper_engine", "whisper-1", "engine"))
+    for key, default, label in defaults:
+        if not whisper.get(key):  # a commented-out key and an empty string both mean "not set"
+            LOGGER.warning(f"The openai whisper {label} was not set, using the default {label}")
+            whisper[key] = default
+    return whisper
 
 
 def startup_checks() -> bool:

diff --git a/src/openjanus/stt/whisper/parser.py b/src/openjanus/stt/whisper/parser.py
@@ -6,7 +6,10 @@
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.schema import Document
 
-logger = logging.getLogger(__name__)
+from openjanus.app.config import get_openai_whisper_config
+
+
+LOGGER = logging.getLogger(__name__)
 
 
 class OpenAIWhisperParser(BaseBlobParser):
@@ -15,6 +18,7 @@ class OpenAIWhisperParser(BaseBlobParser):
 
     def __init__(self, api_key: Optional[str] = None):
         self.api_key = api_key
+        self.config = get_openai_whisper_config()
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -58,18 +62,18 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                 file_obj.name = f"part_{split_number}.mp3"
 
             # Transcribe
-            print(f"Transcribing part {split_number+1}!")
+            LOGGER.debug(f"Transcribing part {split_number+1}!")
             attempts = 0
             while attempts < 3:
                 try:
-                    transcript = openai.audio.transcriptions.create(model="whisper-1", file=file_obj)
+                    transcript = openai.audio.transcriptions.create(model=self.config.get('whisper_engine', "whisper-1"), file=file_obj)
                     break
                 except Exception as e:
                     attempts += 1
-                    print(f"Attempt {attempts} failed. Exception: {str(e)}")
+                    LOGGER.error(f"Attempt {attempts} failed. Exception: {str(e)}")
                     time.sleep(5)
             else:
-                print("Failed to transcribe after 3 attempts.")
+                LOGGER.error("Failed to transcribe after 3 attempts.")
                 continue
 
             yield Document(

diff --git a/src/openjanus/tts/elevenlabs/chat.py b/src/openjanus/tts/elevenlabs/chat.py
@@ -7,6 +7,7 @@
 from langchain.schema.language_model import BaseLanguageModel
 from langchain.schema.messages import BaseMessage
 
+from openjanus.app.config import get_elevenlabs_config
 from openjanus.tts.elevenlabs.tts import ElevenLabsText2SpeechTool
 from openjanus.tts.elevenlabs.async_patch import DEFAULT_VOICE
 
@@ -30,13 +31,14 @@ def run_chat_message(tts: ElevenLabsText2SpeechTool, chain: BaseLanguageModel, m
 
 
 def get_tool() -> ElevenLabsText2SpeechTool:
+    elevenlabs_config = get_elevenlabs_config()
     set_api_key(getenv("ELEVEN_API_KEY"))
-    voice_id = DEFAULT_VOICE.voice_id
+    voice_id = elevenlabs_config['elevenlabs_voice_id']
     voice_settings = VoiceSettings(
-        stability=0.5,
-        similarity_boost=0.75,
-        style=0,
-        use_speaker_boost=False
+        stability=elevenlabs_config['elevenlabs_stability'],
+        similarity_boost=elevenlabs_config['elevenlabs_similarity_boost'],
+        style=elevenlabs_config['elevenlabs_style'],
+        use_speaker_boost=elevenlabs_config['elevenlabs_use_speaker_boost']
     )
     voice = Voice(
         voice_id=voice_id,

diff --git a/src/openjanus/tts/elevenlabs/tts.py b/src/openjanus/tts/elevenlabs/tts.py
@@ -1,6 +1,7 @@
 from datetime import datetime
 from enum import Enum
 import logging
+import pathlib
 import tempfile
 from typing import Any, Coroutine, Dict, Optional, Union, Iterator, Generator
 
@@ -12,6 +13,7 @@
 from langchain.tools.base import BaseTool
 from langchain.utils import get_from_dict_or_env
 import openjanus.tts.elevenlabs.async_patch as eleven_labs_async_patch
+from openjanus.app.config import get_recordings_dir
 
 
 LOGGER = logging.getLogger(__name__)
@@ -56,32 +58,39 @@ class ElevenLabsText2SpeechTool(BaseTool):
         "Spanish, Italian, French, Portuguese, and Hindi. "
     )
     voice: Voice
+    output_dir: str = get_recordings_dir()
+    output_file_path: Optional[str] = ""
 
     @root_validator(pre=True)
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that api key exists in environment."""
         _ = get_from_dict_or_env(values, "eleven_api_key", "ELEVEN_API_KEY")
 
         return values
+
+    def set_recording_path(self):
+        """Set `output_file_path` to a timestamped mp3 inside `output_dir`."""
+        file_name = f"output.{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.mp3"  # TODO: Clean this up, set from config, etc
+        self.output_file_path = str(pathlib.PurePath(self.output_dir, file_name))  # join as components so a missing trailing separator cannot fuse directory and file name
 
     def save_file(self, audio: Union[bytes, Iterator[bytes]]):
         if isinstance(audio, Iterator):
             raw_audio = iter(audio)
         else:
             raw_audio = audio
         elevenlabs = _import_elevenlabs()
-        now = datetime.now()
-        formatted_dt = now.strftime(format="%Y%m%d_%H%M%S")
-        elevenlabs.save(raw_audio, f"{formatted_dt}_{self.voice.voice_id}_chat.mp3")
+        self.set_recording_path()
+        elevenlabs.save(raw_audio, self.output_file_path)
 
     def _run(
         self, query, run_manager: Optional[CallbackManagerForToolRun] = None
-    ) -> str:
+    ):
         """Use the tool."""
         elevenlabs = _import_elevenlabs()
         try:
             speech = elevenlabs.generate(text=query, model=self.model, voice=self.voice)
             elevenlabs.play(speech)
+            self.save_file(audio=speech)
             # with tempfile.NamedTemporaryFile(
             #     mode="bx", suffix=".wav", delete=False
             # ) as f:
@@ -96,15 +105,15 @@ async def _arun(self, stream, **kwargs: Any) -> Coroutine[Any, Any, Any]:
             await self.astream_speech_from_stream(
                 text_stream=stream,
                 chunk_size=100,
-                save_message=False,
+                save_message=True,
             )
         except Exception as e:
             raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}")
 
 
 
 
-    def play(self, query: str, save_message: bool = False) -> None:
+    def play(self, query: str, save_message: bool = True) -> None:
         """
         Play the speech as text
 
@@ -143,7 +152,7 @@ async def aprocess_message(self, query, save_message):
         if save_message:
             self.save_file(b''.join(audio_chunks))
 
-    async def astream_speech(self, text_stream, save_message: bool = False) -> None:
+    async def astream_speech(self, text_stream, save_message: bool = True) -> None:
         async def async_generator_to_list(async_generator):
             return [item async for item in async_generator]
 
@@ -155,7 +164,7 @@ async def async_generator_to_list(async_generator):
         for future in asyncio.as_completed(tasks):
             result = await future  # result is not used in this case
 
-    async def astream_speech_from_stream(self, text_stream, chunk_size: int = 1000, save_message: bool = False) -> None:
+    async def astream_speech_from_stream(self, text_stream, chunk_size: int = 1000, save_message: bool = True) -> None:
         """
         Play a text stream with TTS
 

diff --git a/src/openjanus/tts/whisper/tts.py b/src/openjanus/tts/whisper/tts.py
@@ -4,11 +4,12 @@
 import pathlib
 import shutil
 import subprocess
-from typing import Any, Optional, Union, Iterator, Literal
+from typing import Any, Dict, Optional, Union, Iterator, Literal
 
 from langchain.tools.base import BaseTool
 
 from openjanus.app.config import get_recordings_dir
+from openjanus.app.config import get_openai_whisper_config
 
 
 LOGGER = logging.getLogger(__name__)
@@ -25,20 +26,23 @@ class OpenAIWhisperSpeaker(BaseTool):
     output_dir: str = get_recordings_dir()
     output_file_path: Optional[str] = ""
     verbose: bool = True
+    config: Dict[str, Any] = get_openai_whisper_config()
 
     def __init__(
             self, 
             api_key: Optional[str] = None, 
             voice_id: Optional[Literal["alloy", "echo", "fable", "onyx", "nova", "shimmer"]] = "nova",
             voice_model: Optional[Union[str, Literal["tts-1", "tts-1-hd"]]] = "tts-1",
             output_dir: str = get_recordings_dir(),
+            config: Dict[str, Any] = get_openai_whisper_config(),
             *args,
             **kwargs
         ) -> None:
         super().__init__(*args, **kwargs)
+        self.config = config
         self.api_key = api_key
-        self.voice_id = voice_id
-        self.voice_model = voice_model
+        self.voice_id = self.config.get('whisper_voice_id', voice_id)
+        self.voice_model = self.config.get('whisper_voice_model', voice_model)
         self.output_dir = output_dir
         self.output_file_path = ""