🎧 fix(TTS): Improve State of audio playback, hook patterns, and fix undefined MediaSource (#3632)
danny-avila · Aug 13, 2024 · dc8d30a · dc8d30a
1 parent e3ebcfd
commit dc8d30a
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 72 deletions.
diff --git a/client/src/components/Chat/Input/StreamAudio.tsx b/client/src/components/Chat/Input/StreamAudio.tsx
@@ -5,9 +5,9 @@ import { useQueryClient } from '@tanstack/react-query';
 import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
 import type { TMessage } from 'librechat-data-provider';
 import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
+import { getLatestText, logger } from '~/utils';
 import { useAuthContext } from '~/hooks';
 import { globalAudioId } from '~/common';
-import { getLatestText } from '~/utils';
 import store from '~/store';
 
 function timeoutPromise(ms: number, message?: string) {
@@ -51,7 +51,7 @@ export default function StreamAudio({ index = 0 }) {
     const latestText = getLatestText(latestMessage);
 
     const shouldFetch = !!(
-      token &&
+      token != null &&
       automaticPlayback &&
       isSubmitting &&
       latestMessage &&
@@ -60,7 +60,7 @@ export default function StreamAudio({ index = 0 }) {
       latestMessage.messageId &&
       !latestMessage.messageId.includes('_') &&
       !isFetching &&
-      activeRunId &&
+      activeRunId != null &&
       activeRunId !== audioRunId
     );
 
@@ -109,7 +109,8 @@ export default function StreamAudio({ index = 0 }) {
         const reader = response.body.getReader();
 
         const type = 'audio/mpeg';
-        const browserSupportsType = MediaSource.isTypeSupported(type);
+        const browserSupportsType =
+          typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported(type);
         let mediaSource: MediaSourceAppender | undefined;
         if (browserSupportsType) {
           mediaSource = new MediaSourceAppender(type);
@@ -210,6 +211,7 @@ export default function StreamAudio({ index = 0 }) {
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [paramId]);
 
+  logger.log('StreamAudio.tsx - globalAudioURL:', globalAudioURL);
   return (
     <audio
       ref={audioRef}
@@ -222,7 +224,7 @@ export default function StreamAudio({ index = 0 }) {
         height: '0px',
         width: '0px',
       }}
-      src={globalAudioURL || undefined}
+      src={globalAudioURL ?? undefined}
       id={globalAudioId}
       muted
       autoPlay

diff --git a/client/src/components/Chat/Messages/MessageAudio.tsx b/client/src/components/Chat/Messages/MessageAudio.tsx
@@ -3,6 +3,7 @@ import { useRecoilValue } from 'recoil';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
 import { useLocalize, useTextToSpeech } from '~/hooks';
+import { logger } from '~/utils';
 import store from '~/store';
 
 type THoverButtons = {
@@ -45,6 +46,12 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
     }
   }, [audioRef, isSpeaking, playbackRate, messageId]);
 
+  logger.log(
+    'MessageAudio: audioRef.current?.src, audioRef.current',
+    audioRef.current?.src,
+    audioRef.current,
+  );
+
   return (
     <>
       <button
@@ -75,6 +82,7 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
       <audio
         ref={audioRef}
         controls
+        preload="none"
         controlsList="nodownload nofullscreen noremoteplayback"
         style={{
           position: 'absolute',
@@ -83,7 +91,10 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
           height: '0px',
           width: '0px',
         }}
-        src={audioRef.current?.src ?? undefined}
+        src={audioRef.current?.src}
+        onError={(error) => {
+          console.error('Error fetching audio:', error);
+        }}
         id={`audio-${messageId}`}
         muted
         autoPlay

diff --git a/client/src/hooks/Input/useTextToSpeech.ts b/client/src/hooks/Input/useTextToSpeech.ts
@@ -1,12 +1,13 @@
-import { useRecoilState } from 'recoil';
-import { useRef, useMemo, useEffect } from 'react';
+import { useRecoilState, useRecoilValue } from 'recoil';
+import { useRef, useMemo, useEffect, useState } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import type { Option } from '~/common';
 import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import useGetAudioSettings from './useGetAudioSettings';
 import useTextToSpeechEdge from './useTextToSpeechEdge';
+import useAudioRef from '~/hooks/Audio/useAudioRef';
 import { usePauseGlobalAudio } from '../Audio';
 import { logger } from '~/utils';
 import store from '~/store';
@@ -20,41 +21,77 @@ type TUseTextToSpeech = {
 
 const useTextToSpeech = (props?: TUseTextToSpeech) => {
   const { messageId, content, isLast = false, index = 0 } = props ?? {};
-  const [voice, setVoice] = useRecoilState(store.voice);
+
+  const isMouseDownRef = useRef(false);
+  const timerRef = useRef<number | undefined>(undefined);
+  const [isSpeakingState, setIsSpeaking] = useState(false);
+  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
   const { textToSpeechEndpoint } = useGetAudioSettings();
   const { pauseGlobalAudio } = usePauseGlobalAudio(index);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const [voice, setVoice] = useRecoilState(store.voice);
+  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
+
+  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
 
   const {
     generateSpeechLocal,
     cancelSpeechLocal,
-    isSpeaking: isSpeakingLocal,
     voices: voicesLocal,
-  } = useTextToSpeechBrowser();
+  } = useTextToSpeechBrowser({ setIsSpeaking });
 
   const {
     generateSpeechEdge,
     cancelSpeechEdge,
-    isSpeaking: isSpeakingEdge,
     voices: voicesEdge,
-  } = useTextToSpeechEdge();
+  } = useTextToSpeechEdge({ setIsSpeaking });
 
   const {
     generateSpeechExternal,
     cancelSpeech: cancelSpeechExternal,
-    isSpeaking: isSpeakingExternal,
     isLoading: isLoadingExternal,
-    audioRef: audioRefExternal,
     voices: voicesExternal,
-  } = useTextToSpeechExternal(messageId ?? '', isLast, index);
+  } = useTextToSpeechExternal({
+    setIsSpeaking,
+    audioRef,
+    messageId,
+    isLast,
+    index,
+  });
+
+  const generateSpeech = useMemo(() => {
+    const map = {
+      edge: generateSpeechEdge,
+      browser: generateSpeechLocal,
+      external: generateSpeechExternal,
+    };
 
-  let generateSpeech, cancelSpeech, isSpeaking, isLoading;
+    return map[textToSpeechEndpoint];
+  }, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
+
+  const cancelSpeech = useMemo(() => {
+    const map = {
+      edge: cancelSpeechEdge,
+      browser: cancelSpeechLocal,
+      external: cancelSpeechExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
+
+  const isLoading = useMemo(() => {
+    const map = {
+      edge: false,
+      browser: false,
+      external: isLoadingExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [isLoadingExternal, textToSpeechEndpoint]);
 
   const voices: Option[] | string[] = useMemo(() => {
     const voiceMap = {
-      external: voicesExternal,
       edge: voicesEdge,
       browser: voicesLocal,
+      external: voicesExternal,
     };
 
     return voiceMap[textToSpeechEndpoint];
@@ -88,34 +125,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
     }
   }, [setVoice, textToSpeechEndpoint, voice, voices]);
 
-  switch (textToSpeechEndpoint) {
-    case 'external':
-      generateSpeech = generateSpeechExternal;
-      cancelSpeech = cancelSpeechExternal;
-      isSpeaking = isSpeakingExternal;
-      isLoading = isLoadingExternal;
-      if (audioRefExternal.current) {
-        audioRef.current = audioRefExternal.current;
-      }
-      break;
-    case 'edge':
-      generateSpeech = generateSpeechEdge;
-      cancelSpeech = cancelSpeechEdge;
-      isSpeaking = isSpeakingEdge;
-      isLoading = false;
-      break;
-    case 'browser':
-    default:
-      generateSpeech = generateSpeechLocal;
-      cancelSpeech = cancelSpeechLocal;
-      isSpeaking = isSpeakingLocal;
-      isLoading = false;
-      break;
-  }
-
-  const isMouseDownRef = useRef(false);
-  const timerRef = useRef<number | undefined>(undefined);
-
   const handleMouseDown = () => {
     isMouseDownRef.current = true;
     timerRef.current = window.setTimeout(() => {

diff --git a/client/src/hooks/Input/useTextToSpeechBrowser.ts b/client/src/hooks/Input/useTextToSpeechBrowser.ts
@@ -7,9 +7,12 @@ interface VoiceOption {
   label: string;
 }
 
-function useTextToSpeechBrowser() {
+function useTextToSpeechBrowser({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}) {
   const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
-  const [isSpeaking, setIsSpeaking] = useState(false);
   const [voiceName] = useRecoilState(store.voice);
   const [voices, setVoices] = useState<VoiceOption[]>([]);
 
@@ -61,7 +64,7 @@ function useTextToSpeechBrowser() {
     setIsSpeaking(false);
   };
 
-  return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
+  return { generateSpeechLocal, cancelSpeechLocal, voices };
 }
 
 export default useTextToSpeechBrowser;
diff --git a/client/src/hooks/Input/useTextToSpeechEdge.ts b/client/src/hooks/Input/useTextToSpeechEdge.ts
@@ -13,14 +13,16 @@ interface Voice {
 interface UseTextToSpeechEdgeReturn {
   generateSpeechEdge: (text: string) => void;
   cancelSpeechEdge: () => void;
-  isSpeaking: boolean;
   voices: Voice[];
 }
 
-function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
+function useTextToSpeechEdge({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}): UseTextToSpeechEdgeReturn {
   const localize = useLocalize();
   const [voices, setVoices] = useState<Voice[]>([]);
-  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
   const voiceName = useRecoilValue(store.voice);
   const ttsRef = useRef<MsEdgeTTS | null>(null);
   const audioElementRef = useRef<HTMLAudioElement | null>(null);
@@ -29,7 +31,10 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
   const pendingBuffers = useRef<Uint8Array[]>([]);
   const { showToast } = useToastContext();
 
-  const isBrowserSupported = useMemo(() => MediaSource.isTypeSupported('audio/mpeg'), []);
+  const isBrowserSupported = useMemo(
+    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
+    [],
+  );
 
   const fetchVoices = useCallback(() => {
     if (!ttsRef.current) {
@@ -146,7 +151,7 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
           setIsSpeaking(true);
           pendingBuffers.current = [];
 
-          const readable = await ttsRef.current.toStream(text);
+          const readable = ttsRef.current.toStream(text);
 
           readable.on('data', (chunk: Buffer) => {
             pendingBuffers.current.push(new Uint8Array(chunk));
@@ -200,21 +205,21 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
   }, [showToast, localize]);
 
   useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
       return;
     }
     fetchVoices();
-  }, [fetchVoices]);
+  }, [fetchVoices, isBrowserSupported]);
 
   useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
       return;
     }
     initializeTTS();
-  }, [voiceName, initializeTTS]);
+  }, [voiceName, initializeTTS, isBrowserSupported]);
 
   useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
       return;
     }
     initializeMediaSource();
@@ -223,18 +228,17 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
         URL.revokeObjectURL(audioElementRef.current?.src ?? '');
       }
     };
-  }, [initializeMediaSource]);
+  }, [initializeMediaSource, isBrowserSupported]);
 
   if (!isBrowserSupported) {
     return {
       generateSpeechEdge: () => ({}),
       cancelSpeechEdge: () => ({}),
-      isSpeaking: false,
       voices: [],
     };
   }
 
-  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
+  return { generateSpeechEdge, cancelSpeechEdge, voices };
 }
 
 export default useTextToSpeechEdge;