diff --git a/.env.example b/.env.example
index fcc7c944fa1..70370ef0717 100644
--- a/.env.example
+++ b/.env.example
@@ -257,6 +257,14 @@ MEILI_NO_ANALYTICS=true
 MEILI_HOST=http://0.0.0.0:7700
 MEILI_MASTER_KEY=DrhYf7zENyR6AlUCKmnz0eYASOQdl6zxH7s7MKFSfFCt
 
+
+#==================================================#
+#       Speech to Text & Text to Speech            #
+#==================================================#
+
+STT_API_KEY=
+TTS_API_KEY=
+
 #===================================================#
 #                   User System                     #
 #===================================================#
diff --git a/api/app/clients/BaseClient.js b/api/app/clients/BaseClient.js
index 7a4d345d74d..39215d917b6 100644
--- a/api/app/clients/BaseClient.js
+++ b/api/app/clients/BaseClient.js
@@ -373,6 +373,14 @@ class BaseClient {
     const { user, head, isEdited, conversationId, responseMessageId, saveOptions, userMessage } =
       await this.handleStartMethods(message, opts);
 
+    if (opts.progressCallback) {
+      opts.onProgress = opts.progressCallback.call(null, {
+        ...(opts.progressOptions ?? {}),
+        parentMessageId: userMessage.messageId,
+        messageId: responseMessageId,
+      });
+    }
+
     const { generation = '' } = opts;
 
     // It's not necessary to push to currentMessages
diff --git a/api/app/clients/OpenAIClient.js b/api/app/clients/OpenAIClient.js
index f2290875754..dbb066534c4 100644
--- a/api/app/clients/OpenAIClient.js
+++ b/api/app/clients/OpenAIClient.js
@@ -27,6 +27,7 @@ const {
   createContextHandlers,
 } = require('./prompts');
 const { encodeAndFormat } = require('~/server/services/Files/images/encode');
+const { updateTokenWebsocket } = require('~/server/services/Files/Audio');
 const { isEnabled, sleep } = require('~/server/utils');
 const { handleOpenAIErrors } = require('./tools/util');
 const spendTokens = require('~/models/spendTokens');
@@ -594,6 +595,7 @@ class OpenAIClient extends BaseClient {
         payload,
         (progressMessage) => {
           if (progressMessage === '[DONE]') {
+            updateTokenWebsocket('[DONE]');
             return;
           }
@@ -1216,6 +1218,7 @@ ${convo}
     });
 
     const azureDelay = this.modelOptions.model?.includes('gpt-4') ? 30 : 17;
+
     for await (const chunk of stream) {
       const token = chunk.choices[0]?.delta?.content || '';
       intermediateReply += token;
diff --git a/api/app/clients/PluginsClient.js b/api/app/clients/PluginsClient.js
index 9c57ee2e9cd..da1ee1eb463 100644
--- a/api/app/clients/PluginsClient.js
+++ b/api/app/clients/PluginsClient.js
@@ -250,6 +250,7 @@ class PluginsClient extends OpenAIClient {
       this.setOptions(opts);
       return super.sendMessage(message, opts);
     }
+
     logger.debug('[PluginsClient] sendMessage', { userMessageText: message, opts });
     const {
       user,
@@ -264,6 +265,14 @@ class PluginsClient extends OpenAIClient {
       onToolEnd,
     } = await this.handleStartMethods(message, opts);
 
+    if (opts.progressCallback) {
+      opts.onProgress = opts.progressCallback.call(null, {
+        ...(opts.progressOptions ?? {}),
+        parentMessageId: opts.progressOptions?.parentMessageId ?? userMessage.messageId,
+        messageId: responseMessageId,
+      });
+    }
+
     this.currentMessages.push(userMessage);
 
     let {
diff --git a/api/cache/getLogStores.js b/api/cache/getLogStores.js
index 0d9b662e4e1..ad181c4c9e8 100644
--- a/api/cache/getLogStores.js
+++ b/api/cache/getLogStores.js
@@ -7,6 +7,7 @@ const keyvMongo = require('./keyvMongo');
 const { BAN_DURATION, USE_REDIS } = process.env ?? {};
 
 const THIRTY_MINUTES = 1800000;
+const TEN_MINUTES = 600000;
 
 const duration = math(BAN_DURATION, 7200000);
 
@@ -24,6 +25,10 @@ const config = isEnabled(USE_REDIS)
   ? new Keyv({ store: keyvRedis })
   : new Keyv({ namespace: CacheKeys.CONFIG_STORE });
 
+const audioRuns = isEnabled(USE_REDIS) // ttl: 10 minutes
+  ? new Keyv({ store: keyvRedis, ttl: TEN_MINUTES })
+  : new Keyv({ namespace: CacheKeys.AUDIO_RUNS, ttl: TEN_MINUTES });
+
 const tokenConfig = isEnabled(USE_REDIS) // ttl: 30 minutes
   ? new Keyv({ store: keyvRedis, ttl: THIRTY_MINUTES })
   : new Keyv({ namespace: CacheKeys.TOKEN_CONFIG, ttl: THIRTY_MINUTES });
 
@@ -64,6 +69,7 @@ const namespaces = {
   [CacheKeys.TOKEN_CONFIG]: tokenConfig,
   [CacheKeys.GEN_TITLE]: genTitle,
   [CacheKeys.MODEL_QUERIES]: modelQueries,
+  [CacheKeys.AUDIO_RUNS]: audioRuns,
 };
 
 /**
diff --git a/api/package.json b/api/package.json
index d4e0132ddaa..66fb8770600 100644
--- a/api/package.json
+++ b/api/package.json
@@ -94,6 +94,7 @@
     "ua-parser-js": "^1.0.36",
     "winston": "^3.11.0",
     "winston-daily-rotate-file": "^4.7.1",
+    "ws": "^8.17.0",
     "zod": "^3.22.4"
   },
   "devDependencies": {
diff --git a/api/server/controllers/AskController.js b/api/server/controllers/AskController.js
index 0925b2221fa..48e79cf0da4 100644
--- a/api/server/controllers/AskController.js
+++ b/api/server/controllers/AskController.js
@@ -105,11 +105,12 @@ const AskController = async (req, res, next, initializeClient, addTitle) => {
       getReqData,
       onStart,
       abortController,
-      onProgress: progressCallback.call(null, {
+      progressCallback,
+      progressOptions: {
         res,
         text,
-        parentMessageId: overrideParentMessageId || userMessageId,
-      }),
+        // parentMessageId: overrideParentMessageId || userMessageId,
+      },
     };
 
     let response = await client.sendMessage(text, messageOptions);
diff --git a/api/server/controllers/EditController.js b/api/server/controllers/EditController.js
index ba936ac867d..165245f7b0c 100644
--- a/api/server/controllers/EditController.js
+++ b/api/server/controllers/EditController.js
@@ -112,11 +112,12 @@ const EditController = async (req, res, next, initializeClient) => {
       getReqData,
       onStart,
       abortController,
-      onProgress: progressCallback.call(null, {
+      progressCallback,
+      progressOptions: {
         res,
         text,
-        parentMessageId: overrideParentMessageId || userMessageId,
-      }),
+        // parentMessageId: overrideParentMessageId || userMessageId,
+      },
     });
 
     const conversation = await getConvo(user, conversationId);
diff --git a/api/server/controllers/assistants/chatV2.js b/api/server/controllers/assistants/chatV2.js
index c72d5fc9b43..fef22e64931 100644
--- a/api/server/controllers/assistants/chatV2.js
+++ b/api/server/controllers/assistants/chatV2.js
@@ -520,6 +520,7 @@ const chatV2 = async (req, res) => {
         handlers,
         thread_id,
         attachedFileIds,
+        parentMessageId,
         responseMessage: openai.responseMessage,
         // streamOptions: {
@@ -532,6 +533,7 @@ const chatV2 = async (req, res) => {
       });
 
       response = streamRunManager;
+      response.text = streamRunManager.intermediateText;
     };
 
     await processRun();
@@ -554,6 +556,7 @@ const chatV2 = async (req, res) => {
     /** @type {ResponseMessage} */
     const responseMessage = {
       ...(response.responseMessage ?? response.finalMessage),
+      text: response.text,
       parentMessageId: userMessageId,
       conversationId,
       user: req.user.id,
diff --git a/api/server/routes/ask/gptPlugins.js b/api/server/routes/ask/gptPlugins.js
index f93a5a953bb..2fab6188aff 100644
--- a/api/server/routes/ask/gptPlugins.js
+++ b/api/server/routes/ask/gptPlugins.js
@@ -174,12 +174,13 @@ router.post(
         onStart,
         getPartialText,
         ...endpointOption,
-        onProgress: progressCallback.call(null, {
+        progressCallback,
+        progressOptions: {
           res,
           text,
-          parentMessageId: overrideParentMessageId || userMessageId,
+          // parentMessageId: overrideParentMessageId || userMessageId,
           plugins,
-        }),
+        },
         abortController,
       });
diff --git a/api/server/routes/edit/gptPlugins.js b/api/server/routes/edit/gptPlugins.js
index 61d76178f4f..f1b0cba248b 100644
--- a/api/server/routes/edit/gptPlugins.js
+++ b/api/server/routes/edit/gptPlugins.js
@@ -153,12 +153,13 @@ router.post(
         onChainEnd,
         onStart,
         ...endpointOption,
-        onProgress: progressCallback.call(null, {
+        progressCallback,
+        progressOptions: {
           res,
           text,
           plugin,
-          parentMessageId: overrideParentMessageId || userMessageId,
-        }),
+          // parentMessageId: overrideParentMessageId || userMessageId,
+        },
         abortController,
       });
diff --git a/api/server/routes/files/index.js b/api/server/routes/files/index.js
index d268e0a1ba6..e74b167d45e 100644
--- a/api/server/routes/files/index.js
+++ b/api/server/routes/files/index.js
@@ -5,6 +5,8 @@ const { createMulterInstance } = require('./multer');
 const files = require('./files');
 const images = require('./images');
 const avatar = require('./avatar');
+const stt = require('./stt');
+const tts = require('./tts');
 
 const initialize = async () => {
   const router = express.Router();
@@ -18,6 +20,9 @@ const initialize = async () => {
   router.post('/', upload.single('file'));
   router.post('/images', upload.single('file'));
 
+  router.use('/stt', stt);
+  router.use('/tts', tts);
+
   router.use('/', files);
   router.use('/images', images);
   router.use('/images/avatar', avatar);
diff --git a/api/server/routes/files/stt.js b/api/server/routes/files/stt.js
new file mode 100644
index 00000000000..81c7338cd2d
--- /dev/null
+++ b/api/server/routes/files/stt.js
@@ -0,0 +1,13 @@
+const express = require('express');
+const router = express.Router();
+const multer = require('multer');
+const { requireJwtAuth } = require('~/server/middleware/');
+const { speechToText } = require('~/server/services/Files/Audio');
+
+const upload = multer();
+
+router.post('/', requireJwtAuth, upload.single('audio'), async (req, res) => {
+  await speechToText(req, res);
+});
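+
+// Usage sketch (illustrative, not part of the original change; assumes the
+// default /api/files mount and port): the endpoint expects multipart form
+// data with an `audio` field, e.g.:
+//   curl -X POST http://localhost:3080/api/files/stt \
+//     -H 'Authorization: Bearer <jwt>' \
+//     -F 'audio=@recording.wav'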
+
+module.exports = router;
diff --git a/api/server/routes/files/tts.js b/api/server/routes/files/tts.js
new file mode 100644
index 00000000000..1ee540874fe
--- /dev/null
+++ b/api/server/routes/files/tts.js
@@ -0,0 +1,42 @@
+const multer = require('multer');
+const express = require('express');
+const { CacheKeys } = require('librechat-data-provider');
+const { getVoices, streamAudio, textToSpeech } = require('~/server/services/Files/Audio');
+const { getLogStores } = require('~/cache');
+const { logger } = require('~/config');
+
+const router = express.Router();
+const upload = multer();
+
+router.post('/manual', upload.none(), async (req, res) => {
+  await textToSpeech(req, res);
+});
+
+const logDebugMessage = (req, message) =>
+  logger.debug(`[streamAudio] user: ${req?.user?.id ?? 'UNDEFINED_USER'} | ${message}`);
+
+// TODO: test caching
+router.post('/', async (req, res) => {
+  try {
+    const audioRunsCache = getLogStores(CacheKeys.AUDIO_RUNS);
+    const audioRun = await audioRunsCache.get(req.body.runId);
+    logDebugMessage(req, 'start stream audio');
+    if (audioRun) {
+      logDebugMessage(req, 'stream audio already running');
+      return res.status(409).json({ error: 'Audio stream already running' });
+    }
+    await audioRunsCache.set(req.body.runId, true);
+    await streamAudio(req, res);
+    logDebugMessage(req, 'end stream audio');
+    res.status(200).end();
+  } catch (error) {
+    logger.error(`[streamAudio] user: ${req.user.id} | Failed to stream audio: ${error}`);
+    res.status(500).json({ error: 'Failed to stream audio' });
+  }
+});
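+
+// Usage sketch (illustrative; assumes the default /api/files mount): the client
+// POSTs the ids of the in-flight run and message, and the audio is piped back
+// as `audio/mpeg` while the message text is still being generated:
+//   POST /api/files/tts  { "runId": "<run-id>", "messageId": "<message-id>" }
+// POST /api/files/tts/manual instead synthesizes literal text:
+//   { "input": "Hello there", "voice": "alloy" }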
+
+router.get('/voices', async (req, res) => {
+  await getVoices(req, res);
+});
+
+module.exports = router;
diff --git a/api/server/services/Files/Audio/getVoices.js b/api/server/services/Files/Audio/getVoices.js
new file mode 100644
index 00000000000..1d1fd825439
--- /dev/null
+++ b/api/server/services/Files/Audio/getVoices.js
@@ -0,0 +1,45 @@
+const { logger } = require('~/config');
+const getCustomConfig = require('~/server/services/Config/getCustomConfig');
+const { getProvider } = require('./textToSpeech');
+
+/**
+ * This function retrieves the available voices for the current TTS provider.
+ * It first fetches the TTS configuration and determines the provider,
+ * then sends the corresponding voices as a JSON response.
+ *
+ * @param {Object} req - The request object
+ * @param {Object} res - The response object
+ * @returns {Promise<void>}
+ * @throws {Error} - If the provider is not 'openai' or 'elevenlabs', an error is thrown
+ */
+async function getVoices(req, res) {
+  try {
+    const customConfig = await getCustomConfig();
+
+    if (!customConfig || !customConfig?.tts) {
+      throw new Error('Configuration or TTS schema is missing');
+    }
+
+    const ttsSchema = customConfig?.tts;
+    const provider = getProvider(ttsSchema);
+    let voices;
+
+    switch (provider) {
+      case 'openai':
+        voices = ttsSchema.openai?.voices;
+        break;
+      case 'elevenlabs':
+        voices = ttsSchema.elevenlabs?.voices;
+        break;
+      default:
+        throw new Error('Invalid provider');
+    }
+
+    res.json(voices);
+  } catch (error) {
+    logger.error(`Failed to get voices: ${error.message}`);
+    res.status(500).json({ error: 'Failed to get voices' });
+  }
+}
+
+module.exports = getVoices;
diff --git a/api/server/services/Files/Audio/index.js b/api/server/services/Files/Audio/index.js
new file mode 100644
index 00000000000..a201ea556cb
--- /dev/null
+++ b/api/server/services/Files/Audio/index.js
@@ -0,0 +1,11 @@
+const getVoices = require('./getVoices');
+const textToSpeech = require('./textToSpeech');
+const speechToText = require('./speechToText');
+const { updateTokenWebsocket } = require('./webSocket');
+
+module.exports = {
+  getVoices,
+  speechToText,
+  ...textToSpeech,
+  updateTokenWebsocket,
+};
diff --git a/api/server/services/Files/Audio/speechToText.js b/api/server/services/Files/Audio/speechToText.js
new file mode 100644
index 00000000000..96e70b76fe9
--- /dev/null
+++ b/api/server/services/Files/Audio/speechToText.js
@@ -0,0 +1,211 @@
+const axios = require('axios');
+const { Readable } = require('stream');
+const { logger } = require('~/config');
+const getCustomConfig = require('~/server/services/Config/getCustomConfig');
+const { extractEnvVariable } = require('librechat-data-provider');
+
+/**
+ * Handle the response from the STT API
+ * @param {Object} response - The response from the STT API
+ *
+ * @returns {string} The text from the response data
+ *
+ * @throws Will throw an error if the response status is not 200 or the response data is missing
+ */
+async function handleResponse(response) {
+  if (response.status !== 200) {
+    throw new Error('Invalid response from the STT API');
+  }
+
+  if (!response.data || !response.data.text) {
+    throw new Error('Missing data in response from the STT API');
+  }
+
+  return response.data.text.trim();
+}
+
+function getProvider(sttSchema) {
+  if (sttSchema.openai) {
+    return 'openai';
+  }
+
+  throw new Error('Invalid provider');
+}
+
+function removeUndefined(obj) {
+  Object.keys(obj).forEach((key) => {
+    if (obj[key] && typeof obj[key] === 'object') {
+      removeUndefined(obj[key]);
+      if (Object.keys(obj[key]).length === 0) {
+        delete obj[key];
+      }
+    } else if (obj[key] === undefined) {
+      delete obj[key];
+    }
+  });
+}
+
+/**
+ * This function prepares the necessary data and headers for making a request to the OpenAI API.
+ * It uses the provided speech-to-text schema and audio stream to create the request.
+ *
+ * @param {Object} sttSchema - The speech-to-text schema containing the OpenAI configuration
+ * @param {Stream} audioReadStream - The audio data to be transcribed
+ *
+ * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request.
+ * If an error occurs, it returns an array with three null values and logs the error with logger
+ */
+function openAIProvider(sttSchema, audioReadStream) {
+  try {
+    const url = sttSchema.openai?.url || 'https://api.openai.com/v1/audio/transcriptions';
+    const apiKey = sttSchema.openai.apiKey ? extractEnvVariable(sttSchema.openai.apiKey) : '';
+
+    let data = {
+      file: audioReadStream,
+      model: sttSchema.openai.model,
+    };
+
+    let headers = {
+      'Content-Type': 'multipart/form-data',
+    };
+
+    [headers].forEach(removeUndefined);
+
+    if (apiKey) {
+      headers.Authorization = 'Bearer ' + apiKey;
+    }
+
+    return [url, data, headers];
+  } catch (error) {
+    logger.error('An error occurred while preparing the OpenAI API STT request: ', error);
+    return [null, null, null];
+  }
+}
+
+/**
+ * This function prepares the necessary data and headers for making a request to the Azure API.
+ * It uses the provided request and audio stream to create the request.
+ *
+ * @param {Object} req - The request object, which should contain the endpoint in its body
+ * @param {Stream} audioReadStream - The audio data to be transcribed
+ *
+ * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request.
+ * If an error occurs, it returns an array with three null values and logs the error with logger
+ */
+function azureProvider(req, audioReadStream) {
+  try {
+    const { endpoint } = req.body;
+    const azureConfig = req.app.locals[endpoint];
+
+    if (!azureConfig) {
+      throw new Error(`No configuration found for endpoint: ${endpoint}`);
+    }
+
+    const { apiKey, instanceName, whisperModel, apiVersion } = Object.entries(
+      azureConfig.groupMap,
+    ).reduce((acc, [, value]) => {
+      if (acc) {
+        return acc;
+      }
+
+      const whisperKey = Object.keys(value.models).find((modelKey) =>
+        modelKey.startsWith('whisper'),
+      );
+
+      if (whisperKey) {
+        return {
+          apiVersion: value.version,
+          apiKey: value.apiKey,
+          instanceName: value.instanceName,
+          whisperModel: value.models[whisperKey]['deploymentName'],
+        };
+      }
+
+      return null;
+    }, null);
+
+    if (!apiKey || !instanceName || !whisperModel || !apiVersion) {
+      throw new Error('Required Azure configuration values are missing');
+    }
+
+    const baseURL = `https://${instanceName}.openai.azure.com`;
+
+    const url = `${baseURL}/openai/deployments/${whisperModel}/audio/transcriptions?api-version=${apiVersion}`;
+
+    let data = {
+      file: audioReadStream,
+      filename: 'audio.wav',
+      contentType: 'audio/wav',
+      knownLength: audioReadStream.length,
+    };
+
+    const headers = {
+      'Content-Type': 'multipart/form-data',
+      'api-key': apiKey,
+    };
+
+    return [url, data, headers];
+  } catch (error) {
+    logger.error('An error occurred while preparing the Azure API STT request: ', error);
+    return [null, null, null];
+  }
+}
+
+/**
+ * Convert speech to text
+ * @param {Object} req - The request object
+ * @param {Object} res - The response object
+ *
+ * @returns {Object} The response object with the text from the STT API
+ *
+ * @throws Will throw an error if an error occurs while processing the audio
+ */
+async function speechToText(req, res) {
+  const customConfig = await getCustomConfig();
+  if (!customConfig) {
+    return res.status(500).send('Custom config not found');
+  }
+
+  if (!req.file || !req.file.buffer) {
+    return res.status(400).json({ message: 'No audio file provided in the FormData' });
+  }
+
+  const audioBuffer = req.file.buffer;
+  const audioReadStream = Readable.from(audioBuffer);
+  audioReadStream.path = 'audio.wav';
+
+  const provider = getProvider(customConfig.stt);
+
+  let [url, data, headers] = [];
+
+  switch (provider) {
+    case 'openai':
+      [url, data, headers] = openAIProvider(customConfig.stt, audioReadStream);
+      break;
+    case 'azure':
+      [url, data, headers] = azureProvider(req, audioReadStream);
+      break;
+    default:
+      throw new Error('Invalid provider');
+  }
+
+  if (!Readable.from) {
+    const audioBlob = new Blob([audioBuffer], { type: req.file.mimetype });
+    delete data['file'];
+    data['file'] = audioBlob;
+  }
+
+  try {
+    const response = await axios.post(url, data, { headers: headers });
+    const text = await handleResponse(response);
+
+    res.json({ text });
+  } catch (error) {
+    logger.error('An error occurred while processing the audio:', error);
+    res.sendStatus(500);
+  }
+}
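+
+// For reference: a successful transcription response body is expected to look
+// like `{ "text": "transcribed speech" }` — handleResponse() above trims and
+// returns that `text` field.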
+
+module.exports = speechToText;
diff --git a/api/server/services/Files/Audio/streamAudio-wip.js b/api/server/services/Files/Audio/streamAudio-wip.js
new file mode 100644
index 00000000000..f4723094965
--- /dev/null
+++ b/api/server/services/Files/Audio/streamAudio-wip.js
@@ -0,0 +1,91 @@
+const { Message } = require('~/models/Message');
+const { createChunkProcessor } = require('./streamAudio');
+
+jest.mock('~/models/Message', () => ({
+  Message: {
+    findOne: jest.fn().mockReturnValue({
+      lean: jest.fn(),
+    }),
+  },
+}));
+
+describe('processChunks', () => {
+  let processChunks;
+
+  beforeEach(() => {
+    processChunks = createChunkProcessor();
+    Message.findOne.mockClear();
+    Message.findOne().lean.mockClear();
+  });
+
+  it('should return an empty array when the message is not found', async () => {
+    Message.findOne().lean.mockResolvedValueOnce(null);
+
+    const result = await processChunks('non-existent-id');
+
+    expect(result).toEqual([]);
+    expect(Message.findOne).toHaveBeenCalledWith(
+      { messageId: 'non-existent-id' },
+      'text unfinished',
+    );
+    expect(Message.findOne().lean).toHaveBeenCalled();
+  });
+
+  it('should return an empty array when the message does not have a text property', async () => {
+    Message.findOne().lean.mockResolvedValueOnce({ unfinished: true });
+
+    const result = await processChunks('message-id');
+
+    expect(result).toEqual([]);
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
+    expect(Message.findOne().lean).toHaveBeenCalled();
+  });
+
+  it('should return chunks for an unfinished message with separators', async () => {
+    const messageText = 'This is a long message. It should be split into chunks. Lol hi mom';
+    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: true });
+
+    const result = await processChunks('message-id');
+
+    expect(result).toEqual([
+      { text: 'This is a long message. It should be split into chunks.', isFinished: false },
+    ]);
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
+    expect(Message.findOne().lean).toHaveBeenCalled();
+  });
+
+  it('should return chunks for an unfinished message without separators', async () => {
+    const messageText = 'This is a long message without separators hello there my friend';
+    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: true });
+
+    const result = await processChunks('message-id');
+
+    expect(result).toEqual([{ text: messageText, isFinished: false }]);
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
+    expect(Message.findOne().lean).toHaveBeenCalled();
+  });
+
+  it('should return the remaining text as a chunk for a finished message', async () => {
+    const messageText = 'This is a finished message.';
+    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
+
+    const result = await processChunks('message-id');
+
+    expect(result).toEqual([{ text: messageText, isFinished: true }]);
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
+    expect(Message.findOne().lean).toHaveBeenCalled();
+  });
+
+  it('should return an empty array for a finished message with no remaining text', async () => {
+    const messageText = 'This is a finished message.';
+    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
+
+    await processChunks('message-id');
+    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
+    const result = await processChunks('message-id');
+
+    expect(result).toEqual([]);
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
+    expect(Message.findOne().lean).toHaveBeenCalledTimes(2);
+  });
+});
diff --git a/api/server/services/Files/Audio/streamAudio.js b/api/server/services/Files/Audio/streamAudio.js
new file mode 100644
index 00000000000..4b74816d28c
--- /dev/null
+++ b/api/server/services/Files/Audio/streamAudio.js
@@ -0,0 +1,312 @@
+const WebSocket = require('ws');
+const { Message } = require('~/models/Message');
+
+/**
+ * @param {string[]} voiceIds - Array of voice IDs
+ * @returns {string}
+ */
+function getRandomVoiceId(voiceIds) {
+  const randomIndex = Math.floor(Math.random() * voiceIds.length);
+  return voiceIds[randomIndex];
+}
+
+/**
+ * @typedef {Object} VoiceSettings
+ * @property {number} similarity_boost
+ * @property {number} stability
+ * @property {boolean} use_speaker_boost
+ */
+
+/**
+ * @typedef {Object} GenerateAudioBulk
+ * @property {string} model_id
+ * @property {string} text
+ * @property {VoiceSettings} voice_settings
+ */
+
+/**
+ * @typedef {Object} TextToSpeechClient
+ * @property {function(Object): Promise} generate
+ */
+
+/**
+ * @typedef {Object} AudioChunk
+ * @property {string} audio
+ * @property {boolean} isFinal
+ * @property {Object} alignment
+ * @property {number[]} alignment.char_start_times_ms
+ * @property {number[]} alignment.chars_durations_ms
+ * @property {string[]} alignment.chars
+ * @property {Object} normalizedAlignment
+ * @property {number[]} normalizedAlignment.char_start_times_ms
+ * @property {number[]} normalizedAlignment.chars_durations_ms
+ * @property {string[]} normalizedAlignment.chars
+ */
+
+/**
+ *
+ * @param {Record} parameters
+ * @returns
+ */
+function assembleQuery(parameters) {
+  let query = '';
+  let hasQuestionMark = false;
+
+  for (const [key, value] of Object.entries(parameters)) {
+    if (value == null) {
+      continue;
+    }
+
+    if (!hasQuestionMark) {
+      query += '?';
+      hasQuestionMark = true;
+    } else {
+      query += '&';
+    }
+
+    query += `${key}=${value}`;
+  }
+
+  return query;
+}
+
+const SEPARATORS = ['.', '?', '!', '۔', '。', '‥', ';', '¡', '¿', '\n'];
+
+/**
+ *
+ * @param {string} text
+ * @param {string[] | undefined} [separators]
+ * @returns
+ */
+function findLastSeparatorIndex(text, separators = SEPARATORS) {
+  let lastIndex = -1;
+  for (const separator of separators) {
+    const index = text.lastIndexOf(separator);
+    if (index > lastIndex) {
+      lastIndex = index;
+    }
+  }
+  return lastIndex;
+}
+
+const MAX_NOT_FOUND_COUNT = 6;
+const MAX_NO_CHANGE_COUNT = 12;
+
+/**
+ * @param {string} messageId
+ * @returns {() => Promise<{ text: string, isFinished: boolean }[]>}
+ */
+function createChunkProcessor(messageId) {
+  let notFoundCount = 0;
+  let noChangeCount = 0;
+  let processedText = '';
+  if (!messageId) {
+    throw new Error('Message ID is required');
+  }
+
+  /**
+   * @returns {Promise<{ text: string, isFinished: boolean }[] | string>}
+   */
+  async function processChunks() {
+    if (notFoundCount >= MAX_NOT_FOUND_COUNT) {
+      return `Message not found after ${MAX_NOT_FOUND_COUNT} attempts`;
+    }
+
+    if (noChangeCount >= MAX_NO_CHANGE_COUNT) {
+      return `No change in message after ${MAX_NO_CHANGE_COUNT} attempts`;
+    }
+
+    const message = await Message.findOne({ messageId }, 'text unfinished').lean();
+
+    if (!message || !message.text) {
+      notFoundCount++;
+      return [];
+    }
+
+    const { text, unfinished } = message;
+    if (text === processedText) {
+      noChangeCount++;
+    }
+
+    const remainingText = text.slice(processedText.length);
+    const chunks = [];
+
+    if (unfinished && remainingText.length >= 20) {
+      const separatorIndex = findLastSeparatorIndex(remainingText);
+      if (separatorIndex !== -1) {
+        const chunkText = remainingText.slice(0, separatorIndex + 1);
+        chunks.push({ text: chunkText, isFinished: false });
+        processedText += chunkText;
+      } else {
+        chunks.push({ text: remainingText, isFinished: false });
+        processedText = text;
+      }
+    } else if (!unfinished && remainingText.trim().length > 0) {
+      chunks.push({ text: remainingText.trim(), isFinished: true });
+      processedText = text;
+    }
+
+    return chunks;
+  }
+
+  return processChunks;
+}
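+
+// Behavior sketch (illustrative): if the stored message is unfinished and its
+// text so far is "Hello world. How are", a call yields
+//   [{ text: 'Hello world.', isFinished: false }]
+// and once the message finishes as "Hello world. How are you?", the next call
+// yields the remainder: [{ text: 'How are you?', isFinished: true }].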
+
+/**
+ * Input stream text to speech
+ * @param {Express.Response} res
+ * @param {AsyncIterable<string>} textStream
+ * @param {(token: string) => Promise<boolean>} callback - Whether to continue the stream or not
+ * @param {string[]} voiceIds - Voice IDs to choose from
+ * @returns {AsyncGenerator<AudioChunk>}
+ */
+function inputStreamTextToSpeech(res, textStream, callback, voiceIds = []) {
+  const model = 'eleven_monolingual_v1';
+  const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${getRandomVoiceId(
+    voiceIds,
+  )}/stream-input${assembleQuery({
+    model_id: model,
+    // flush: true,
+    // optimize_streaming_latency: this.settings.optimizeStreamingLatency,
+    optimize_streaming_latency: 1,
+    // output_format: this.settings.outputFormat,
+  })}`;
+  const socket = new WebSocket(wsUrl);
+
+  socket.onopen = function () {
+    const streamStart = {
+      text: ' ',
+      voice_settings: {
+        stability: 0.5,
+        similarity_boost: 0.8,
+      },
+      xi_api_key: process.env.ELEVENLABS_API_KEY,
+      // generation_config: { chunk_length_schedule: [50, 90, 120, 150, 200] },
+    };
+
+    socket.send(JSON.stringify(streamStart));
+
+    // send stream until done
+    const streamComplete = new Promise((resolve, reject) => {
+      (async () => {
+        let textBuffer = '';
+        let shouldContinue = true;
+        for await (const textDelta of textStream) {
+          textBuffer += textDelta;
+
+          // using ". " as separator: sending in full sentences improves the quality
+          // of the audio output significantly.
+          const separatorIndex = findLastSeparatorIndex(textBuffer);
+
+          // Callback for textStream (will return false if signal is aborted)
+          shouldContinue = await callback(textDelta);
+
+          if (separatorIndex === -1) {
+            continue;
+          }
+
+          if (!shouldContinue) {
+            break;
+          }
+
+          const textToProcess = textBuffer.slice(0, separatorIndex);
+          textBuffer = textBuffer.slice(separatorIndex + 1);
+
+          const request = {
+            text: textToProcess,
+            try_trigger_generation: true,
+          };
+
+          socket.send(JSON.stringify(request));
+        }
+
+        // send remaining text:
+        if (shouldContinue && textBuffer.length > 0) {
+          socket.send(
+            JSON.stringify({
+              text: `${textBuffer} `, // append space
+              try_trigger_generation: true,
+            }),
+          );
+        }
+      })()
+        .then(resolve)
+        .catch(reject);
+    });
+
+    streamComplete
+      .then(() => {
+        const endStream = {
+          text: '',
+        };
+
+        socket.send(JSON.stringify(endStream));
+      })
+      .catch((e) => {
+        console.error('Error streaming text to speech:', e);
+        throw e;
+      });
+  };
+
+  return (async function* audioStream() {
+    let isDone = false;
+    let chunks = [];
+    let resolve;
+    let waitForMessage = new Promise((r) => (resolve = r));
+
+    socket.onmessage = function (event) {
+      // console.log(event);
+      const audioChunk = JSON.parse(event.data);
+      if (audioChunk.audio && audioChunk.alignment) {
+        res.write(`event: audio\ndata: ${event.data}\n\n`);
+        chunks.push(audioChunk);
+        resolve(null);
+        waitForMessage = new Promise((r) => (resolve = r));
+      } else if (audioChunk.isFinal) {
+        isDone = true;
+        resolve(null);
+      } else if (audioChunk.message) {
+        console.warn('Received Elevenlabs message:', audioChunk.message);
+        resolve(null);
+      }
+    };
+
+    socket.onerror = function (error) {
+      console.error('WebSocket error:', error);
+      // throw error;
+    };
+
+    socket.onclose = function () {
+      isDone = true;
+      resolve(null);
+    };
+
+    while (!isDone) {
+      await waitForMessage;
+      yield* chunks;
+      chunks = [];
+    }
+
+    res.write('event: end\ndata: \n\n');
+  })();
+}
+
+/**
+ *
+ * @param {AsyncIterable} llmStream
+ */
+async function* llmMessageSource(llmStream) {
+  for await (const chunk of llmStream) {
+    const message = chunk.choices[0].delta.content;
+    if (message) {
+      yield message;
+    }
+  }
+}
+
+module.exports = {
+  inputStreamTextToSpeech,
+  findLastSeparatorIndex,
+  createChunkProcessor,
+  llmMessageSource,
+  getRandomVoiceId,
+};
diff --git a/api/server/services/Files/Audio/textToSpeech.js b/api/server/services/Files/Audio/textToSpeech.js
new file mode 100644
index 00000000000..6c8f306c89a
--- /dev/null
+++ b/api/server/services/Files/Audio/textToSpeech.js
@@ -0,0 +1,390 @@
+const axios = require('axios');
+const getCustomConfig = require('~/server/services/Config/getCustomConfig');
+const { getRandomVoiceId, createChunkProcessor } = require('./streamAudio');
+const { extractEnvVariable } = require('librechat-data-provider');
+const { logger } = require('~/config');
+
+/**
+ * getProvider function
+ * This function takes the ttsSchema object and returns the name of the provider.
+ * If more than one provider is set or no provider is set, it throws an error.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the provider configuration
+ * @returns {string} The name of the provider
+ * @throws {Error} Throws an error if multiple providers are set or no provider is set
+ */
+function getProvider(ttsSchema) {
+  if (!ttsSchema) {
+    throw new Error(
+      'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
+    );
+  }
+  const providers = Object.entries(ttsSchema).filter(([, value]) => Object.keys(value).length > 0);
+
+  if (providers.length > 1) {
+    throw new Error('Multiple providers are set. Please set only one provider.');
+  } else if (providers.length === 0) {
+    throw new Error('No provider is set. Please set a provider.');
+  } else {
+    return providers[0][0];
+  }
+}
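+
+// Example TTS configuration (illustrative — key names follow the schema
+// accessed below; adjust to your deployment's librechat.yaml):
+//   tts:
+//     openai:
+//       apiKey: '${TTS_API_KEY}'
+//       model: 'tts-1'
+//       voices: ['alloy', 'echo', 'fable']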
+
+/**
+ * removeUndefined function
+ * This function takes an object and removes all keys with undefined values.
+ * It also removes keys with empty objects as values.
+ *
+ * @param {Object} obj - The object to be cleaned
+ * @returns {void} This function does not return a value. It modifies the input object directly
+ */
+function removeUndefined(obj) {
+  Object.keys(obj).forEach((key) => {
+    if (obj[key] && typeof obj[key] === 'object') {
+      removeUndefined(obj[key]);
+      if (Object.keys(obj[key]).length === 0) {
+        delete obj[key];
+      }
+    } else if (obj[key] === undefined) {
+      delete obj[key];
+    }
+  });
+}
+
+/**
+ * This function prepares the necessary data and headers for making a request to the OpenAI TTS.
+ * It uses the provided TTS schema, input text, and voice to create the request.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the OpenAI configuration
+ * @param {string} input - The text to be converted to speech
+ * @param {string} voice - The voice to be used for the speech
+ *
+ * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request.
+ * If an error occurs, it throws an error with a message indicating that the selected voice is not available
+ */
+function openAIProvider(ttsSchema, input, voice) {
+  const url = ttsSchema.openai?.url || 'https://api.openai.com/v1/audio/speech';
+
+  if (
+    ttsSchema.openai?.voices &&
+    ttsSchema.openai.voices.length > 0 &&
+    !ttsSchema.openai.voices.includes(voice) &&
+    !ttsSchema.openai.voices.includes('ALL')
+  ) {
+    throw new Error(`Voice ${voice} is not available.`);
+  }
+
+  let data = {
+    input,
+    model: ttsSchema.openai?.model,
+    voice: ttsSchema.openai?.voices && ttsSchema.openai.voices.length > 0 ? voice : undefined,
+    backend: ttsSchema.openai?.backend,
+  };
+
+  let headers = {
+    'Content-Type': 'application/json',
+    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema.openai?.apiKey),
+  };
+
+  [data, headers].forEach(removeUndefined);
+
+  return [url, data, headers];
+}
+
+/**
+ * elevenLabsProvider function
+ * This function prepares the necessary data and headers for making a request to the Eleven Labs TTS.
+ * It uses the provided TTS schema, input text, and voice to create the request.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the Eleven Labs configuration
+ * @param {string} input - The text to be converted to speech
+ * @param {string} voice - The voice to be used for the speech
+ * @param {boolean} stream - Whether to stream the audio or not
+ *
+ * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
+ * @throws {Error} Throws an error if the selected voice is not available
+ */
+function elevenLabsProvider(ttsSchema, input, voice, stream) {
+  let url =
+    ttsSchema.elevenlabs?.url ||
+    `https://api.elevenlabs.io/v1/text-to-speech/{voice_id}${stream ? '/stream' : ''}`;
+
+  if (
+    !ttsSchema.elevenlabs?.voices.includes(voice) &&
+    !ttsSchema.elevenlabs?.voices.includes('ALL')
+  ) {
+    throw new Error(`Voice ${voice} is not available.`);
+  }
+
+  url = url.replace('{voice_id}', voice);
+
+  let data = {
+    model_id: ttsSchema.elevenlabs?.model,
+    text: input,
+    // voice_id: voice,
+    voice_settings: {
+      similarity_boost: ttsSchema.elevenlabs?.voice_settings?.similarity_boost,
+      stability: ttsSchema.elevenlabs?.voice_settings?.stability,
+      style: ttsSchema.elevenlabs?.voice_settings?.style,
+      use_speaker_boost: ttsSchema.elevenlabs?.voice_settings?.use_speaker_boost || undefined,
+    },
+    pronunciation_dictionary_locators: ttsSchema.elevenlabs?.pronunciation_dictionary_locators,
+  };
+
+  let headers = {
+    'Content-Type': 'application/json',
+    'xi-api-key': extractEnvVariable(ttsSchema.elevenlabs?.apiKey),
+    Accept: 'audio/mpeg',
+  };
+
+  [data, headers].forEach(removeUndefined);
+
+  return [url, data, headers];
+}
+
+/**
+ * localAIProvider function
+ * This function prepares the necessary data and headers for making a request to the LocalAI TTS.
+ * It uses the provided TTS schema, input text, and voice to create the request.
+ *
+ * @param {Object} ttsSchema - The TTS schema containing the LocalAI configuration
+ * @param {string} input - The text to be converted to speech
+ * @param {string} voice - The voice to be used for the speech
+ *
+ * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
+ * @throws {Error} Throws an error if the selected voice is not available
+ */
+function localAIProvider(ttsSchema, input, voice) {
+  let url = ttsSchema.localai?.url;
+
+  if (
+    ttsSchema.localai?.voices &&
+    ttsSchema.localai.voices.length > 0 &&
+    !ttsSchema.localai.voices.includes(voice) &&
+    !ttsSchema.localai.voices.includes('ALL')
+  ) {
+    throw new Error(`Voice ${voice} is not available.`);
+  }
+
+  let data = {
+    input,
+    model: ttsSchema.localai?.voices && ttsSchema.localai.voices.length > 0 ? voice : undefined,
+    backend: ttsSchema.localai?.backend,
+  };
+
+  let headers = {
+    'Content-Type': 'application/json',
+    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema.localai?.apiKey),
+  };
+
+  [data, headers].forEach(removeUndefined);
+
+  if (extractEnvVariable(ttsSchema.localai.apiKey) === '') {
+    delete headers.Authorization;
+  }
+
+  return [url, data, headers];
+}
+
+/* not used */
+/*
+async function streamAudioFromWebSocket(req, res) {
+  const { voice } = req.body;
+  const customConfig = await getCustomConfig();
+
+  if (!customConfig) {
+    return res.status(500).send('Custom config not found');
+  }
+
+  const ttsSchema = customConfig.tts;
+  const provider = getProvider(ttsSchema);
+
+  if (provider !== 'elevenlabs') {
+    return res.status(400).send('WebSocket streaming is only supported for Eleven Labs');
+  }
+
+  const url =
+    ttsSchema.elevenlabs.websocketUrl ||
+    'wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?model_id={model}'
+      .replace('{voice_id}', voice)
+      .replace('{model}', ttsSchema.elevenlabs.model);
+  const ws = new WebSocket(url);
+
+  ws.onopen = () => {
+    logger.debug('WebSocket connection opened');
+    sendTextToWebsocket(ws, (data) => {
+      res.write(data); // Stream data directly to the response
+    });
+  };
+
+  ws.onclose = () => {
+    logger.debug('WebSocket connection closed');
+    res.end(); // End the response when the WebSocket is closed
+  };
+
+  ws.onerror = (error) => {
+    logger.error('WebSocket error:', error);
+    res.status(500).send('WebSocket error');
+  };
+}
+*/
+
+/**
+ * Dispatches a TTS request to the configured provider
+ *
+ * @param {TCustomConfig} customConfig
+ * @param {Object} options
+ * @param {string} options.input - The text to synthesize
+ * @param {string} [options.voice] - The requested voice; a random configured voice is used if unavailable
+ * @param {boolean} [options.stream=true] - Whether to request a streamed response
+ * @returns {Promise}
+ */
+async function ttsRequest(
+  customConfig,
+  { input, voice: _v, stream = true } = { input: '', stream: true },
+) {
+  const ttsSchema = customConfig.tts;
+  const provider = getProvider(ttsSchema);
+  const voices = ttsSchema[provider].voices.filter(
+    (voice) => voice && voice.toUpperCase() !== 'ALL',
+  );
+  let voice = _v;
+  if (!voice || !voices.includes(voice) || (voice.toUpperCase() === 'ALL' && voices.length > 1)) {
+    voice = getRandomVoiceId(voices);
+  }
+
+  let [url, data, headers] = [];
+
+  switch (provider) {
+    case 'openai':
+      [url, data, headers] = openAIProvider(ttsSchema, input, voice);
+      break;
+    case 'elevenlabs':
+      [url, data, headers] = elevenLabsProvider(ttsSchema, input, voice, stream);
+      break;
+    case 'localai':
+      [url, data, headers] = localAIProvider(ttsSchema, input, voice);
+      break;
+    default:
+      throw new Error('Invalid provider');
+  }
+
+  if (stream) {
+    return await axios.post(url, data, { headers, responseType: 'stream' });
+  }
+
+  return await axios.post(url, data, { headers, responseType: 'arraybuffer' });
+}
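+
+// Usage sketch (illustrative): stream one sentence with the configured
+// provider and pipe it into an Express response:
+//   const response = await ttsRequest(customConfig, { input: 'Hello!', stream: true });
+//   response.data.pipe(res); // audio/mpeg stream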
+
+/**
+ * Handles a text-to-speech request. Extracts input and voice from the request, retrieves the TTS
+ * configuration, and sends a request to the appropriate provider. The resulting audio data is sent
+ * in the response.
+ *
+ * @param {Object} req - The request object, which should contain the input text and voice in its body
+ * @param {Object} res - The response object, used to send the audio data or an error message
+ *
+ * @returns {Promise<void>} This function does not return a value. It sends the audio data or an error message in the response
+ *
+ * @throws {Error} Throws an error if the provider is invalid
+ */
+async function textToSpeech(req, res) {
+  const { input, voice } = req.body;
+
+  if (!input) {
+    return res.status(400).send('Missing text in request body');
+  }
+
+  const customConfig = await getCustomConfig();
+  if (!customConfig) {
+    return res.status(500).send('Custom config not found');
+  }
+
+  try {
+    res.setHeader('Content-Type', 'audio/mpeg');
+    const response = await ttsRequest(customConfig, { input, voice });
+    response.data.pipe(res);
+  } catch (error) {
+    logger.error('An error occurred while creating the audio stream:', error);
+    res.status(500).send('An error occurred');
+  }
+}
+
+async function streamAudio(req, res) {
+  res.setHeader('Content-Type', 'audio/mpeg');
+  const customConfig = await getCustomConfig();
+  if (!customConfig) {
+    return res.status(500).send('Custom config not found');
+  }
+
+  try {
+    let shouldContinue = true;
+    const processChunks = createChunkProcessor(req.body.messageId);
+
+    while (shouldContinue) {
+      // example updates
+      // const updates = [
+      //   { text: 'This is a test.', isFinished: false },
+      //   { text: 'This is only a test.', isFinished: false },
+      //   { text: 'Your voice is like a combination of Fergie and Jesus!', isFinished: true },
+      // ];
+
+      const updates = await processChunks();
+      if (typeof updates === 'string') {
+        logger.error(`Error processing audio stream updates: ${JSON.stringify(updates)}`);
+        res.status(500).end();
+        return;
+      }
+
+      if (updates.length === 0) {
+        await new Promise((resolve) => setTimeout(resolve, 1250));
+        continue;
+      }
+
+      for (const update of updates) {
+        try {
+          const response = await ttsRequest(customConfig, {
+            input: update.text,
+            stream: true,
+          });
+
+          if (!shouldContinue) {
+            break;
+          }
+
+          logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
+          await new Promise((resolve) => {
+            response.data.pipe(res, { end: false });
+            response.data.on('end', () => {
+              resolve();
+            });
+          });
+
+          if (update.isFinished) {
+            shouldContinue = false;
+            break;
+          }
+        } catch (innerError) {
+          logger.error('Error processing update:', update, innerError);
+          if (!res.headersSent) {
+            res.status(500).end();
+          }
+          return;
+        }
+      }
+
+      if (!shouldContinue) {
+        break;
+      }
+    }
+
+    if (!res.headersSent) {
+      res.end();
+    }
+  } catch (error) {
+    logger.error('Failed to fetch audio:', error);
+    if (!res.headersSent) {
+      res.status(500).end();
+    }
+  }
+}
+
+module.exports = {
+  textToSpeech,
+  getProvider,
+  streamAudio,
+};
diff --git a/api/server/services/Files/Audio/webSocket.js b/api/server/services/Files/Audio/webSocket.js
new file mode 100644
index 00000000000..f2d96c79416
--- /dev/null
+++ b/api/server/services/Files/Audio/webSocket.js
@@ -0,0 +1,33 @@
+const WebSocket = require('ws');
+
+let token = '';
+
+function updateTokenWebsocket(newToken) {
+  console.log('Token:', newToken);
+  token = newToken;
+}
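+
+// Note: OpenAIClient calls updateTokenWebsocket('[DONE]') when a completion
+// stream ends; sendTextToWebsocket below treats that sentinel as a cue to send
+// a single space, which appears intended to flush the TTS socket.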
+
+function sendTextToWebsocket(ws, onDataReceived) {
+  if (token === '[DONE]') {
+    ws.send(' ');
+    return;
+  }
+
+  if (ws.readyState === WebSocket.OPEN) {
+    ws.send(token);
+
+    ws.onmessage = function (event) {
+      console.log('Received:', event.data);
+      if (onDataReceived) {
+        onDataReceived(event.data); // Pass the received data to the callback function
+      }
+    };
+  } else {
+    console.error('WebSocket is not open. Ready state is: ' + ws.readyState);
+  }
+}
+
+module.exports = {
+  updateTokenWebsocket,
+  sendTextToWebsocket,
+};
diff --git a/api/server/services/Runs/StreamRunManager.js b/api/server/services/Runs/StreamRunManager.js
index ce78b593188..bcae609c7b0 100644
--- a/api/server/services/Runs/StreamRunManager.js
+++ b/api/server/services/Runs/StreamRunManager.js
@@ -1,3 +1,4 @@
+const throttle = require('lodash/throttle');
 const {
   StepTypes,
   ContentTypes,
@@ -10,6 +11,7 @@ const { retrieveAndProcessFile } = require('~/server/services/Files/process');
 const { processRequiredActions } = require('~/server/services/ToolService');
 const { createOnProgress, sendMessage } = require('~/server/utils');
 const { processMessages } = require('~/server/services/Threads');
+const { saveMessage } = require('~/models');
 const { logger } = require('~/config');
 
 /**
@@ -43,6 +45,8 @@ class StreamRunManager {
     /** @type {string} */
     this.apiKey = this.openai.apiKey;
     /** @type {string} */
+    this.parentMessageId = fields.parentMessageId;
+    /** @type {string} */
     this.thread_id = fields.thread_id;
     /** @type {RunCreateAndStreamParams} */
     this.initialRunBody = fields.runBody;
@@ -58,6 +62,8 @@ class StreamRunManager {
     this.messages = [];
     /** @type {string} */
     this.text = '';
+    /** @type {string} */
+    this.intermediateText = '';
     /** @type {Set} */
     this.attachedFileIds = fields.attachedFileIds;
     /** @type {undefined | Promise} */
@@ -407,6 +413,7 @@ class StreamRunManager {
       const content = message.delta.content?.[0];
 
       if (content && content.type === MessageContentTypes.TEXT) {
+        this.intermediateText += content.text.value;
        onProgress(content.text.value);
       }
     }
@@ -523,10 +530,27 @@ class StreamRunManager {
     const stepKey = message_creation.message_id;
     const index = this.getStepIndex(stepKey);
     this.orderedRunSteps.set(index, message_creation);
+    const getText = () => this.intermediateText;
     // Create the Factory Function to stream the message
     const { onProgress: progressCallback } = createOnProgress({
-      // todo: add option to save partialText to db
-      // onProgress: () => {},
+      onProgress: throttle(
+        () => {
+          const text = getText();
+          saveMessage({
+            messageId: this.finalMessage.messageId,
+            conversationId: this.finalMessage.conversationId,
+            parentMessageId: this.parentMessageId,
+            model: this.req.body.model,
+            user: this.req.user.id,
+            sender: 'Assistant',
+            unfinished: true,
+            error: false,
+            text,
+          });
+        },
+        2000,
+        { trailing: false },
+      ),
     });
 
     // This creates a function that attaches all of the parameters
diff --git a/api/server/services/Threads/manage.js b/api/server/services/Threads/manage.js
index fb151cee92a..5e2877bed0b 100644
--- a/api/server/services/Threads/manage.js
+++ b/api/server/services/Threads/manage.js
@@ -121,6 +121,7 @@ async function saveUserMessage(params) {
  * @param {Object} params - The parameters of the Assistant message
  * @param {string} params.user - The user's ID.
  * @param {string} params.messageId - The message Id.
+ * @param {string} params.text - The concatenated text of the message.
  * @param {string} params.assistant_id - The assistant Id.
  * @param {string} params.thread_id - The thread Id.
  * @param {string} params.model - The model used by the assistant.
@@ -134,14 +135,6 @@ async function saveUserMessage(params) {
  * @return {Promise} A promise that resolves to the created run object.
  */
 async function saveAssistantMessage(params) {
-  const text = params.content.reduce((acc, part) => {
-    if (!part.value) {
-      return acc;
-    }
-
-    return acc + ' ' + part.value;
-  }, '');
-
   // const tokenCount = // TODO: need to count each content part
 
   const message = await recordMessage({
@@ -156,7 +149,8 @@ async function saveAssistantMessage(params) {
     content: params.content,
     sender: 'Assistant',
     isCreatedByUser: false,
-    text: text.trim(),
+    text: params.text,
+    unfinished: false,
     // tokenCount,
   });
@@ -302,6 +296,7 @@ async function syncMessages({
       aggregateMessages: [{ id: apiMessage.id }],
       model: apiMessage.role === 'user' ? null : apiMessage.assistant_id,
       user: openai.req.user.id,
+      unfinished: false,
     };
 
     if (apiMessage.file_ids?.length) {
diff --git a/client/package.json b/client/package.json
index c06aa4454ea..9ff08e13620 100644
--- a/client/package.json
+++ b/client/package.json
@@ -79,9 +79,11 @@
     "react-markdown": "^8.0.6",
     "react-resizable-panels": "^1.0.9",
     "react-router-dom": "^6.11.2",
+    "react-speech-recognition": "^3.10.0",
     "react-textarea-autosize": "^8.4.0",
     "react-transition-group": "^4.4.5",
     "recoil": "^0.7.7",
+    "regenerator-runtime": "^0.14.1",
     "rehype-highlight": "^6.0.0",
     "rehype-katex": "^6.0.2",
     "rehype-raw": "^6.1.1",
diff --git a/client/src/common/types.ts b/client/src/common/types.ts
index 62aae7f14be..5038d175560 100644
--- a/client/src/common/types.ts
+++ b/client/src/common/types.ts
@@ -21,6 +21,21 @@ import type {
 import type { UseMutationResult } from '@tanstack/react-query';
 import type { LucideIcon } from 'lucide-react';
 
+export type AudioChunk = {
+  audio: string;
+  isFinal: boolean;
+  alignment: {
+    char_start_times_ms: number[];
+    chars_durations_ms: number[];
+    chars: string[];
+  };
+  normalizedAlignment: {
+    char_start_times_ms: number[];
+    chars_durations_ms: number[];
+    chars: string[];
+  };
+};
+
 export type AssistantListItem = {
   id: string;
   name: string;
@@ -37,6 +52,7 @@ export type LastSelectedModels = Record<string, string>;
 export type LocalizeFunction = (phraseKey: string, ...values: string[]) => string;
 
 export const mainTextareaId = 'prompt-textarea';
+export const globalAudioId = 'global-audio';
 
 export enum IconContext {
   landing = 'landing',
diff --git a/client/src/components/Chat/Input/AudioRecorder.tsx b/client/src/components/Chat/Input/AudioRecorder.tsx
new file mode 100644
index 00000000000..c1d100e50dc
--- /dev/null
+++ b/client/src/components/Chat/Input/AudioRecorder.tsx
@@ -0,0 +1,47 @@
+import React from 'react';
+import { ListeningIcon, Spinner, SpeechIcon } from '~/components/svg';
+import { TooltipProvider, Tooltip, TooltipTrigger, TooltipContent } from '~/components/ui/';
+import { useLocalize } from '~/hooks';
+
+export default function AudioRecorder({
+  isListening,
+  isLoading,
+  startRecording,
+  stopRecording,
+  disabled,
+}) {
+  const localize = useLocalize();
+
+  const handleStartRecording = async () => {
+    await startRecording();
+  };
+
+  const handleStopRecording = async () => {
+    await stopRecording();
+  };
+
+  return (
+    <TooltipProvider>
+      <Tooltip>
+        <TooltipTrigger asChild>
+          <button
+            type="button"
+            onClick={isListening ? handleStopRecording : handleStartRecording}
+            disabled={disabled}
+          >
+            {isLoading ? <Spinner /> : isListening ? <ListeningIcon /> : <SpeechIcon />}
+          </button>
+        </TooltipTrigger>
+        <TooltipContent side="top" sideOffset={10}>
+          {localize('com_ui_use_micrphone')}
+        </TooltipContent>
+      </Tooltip>
+    </TooltipProvider>
+  );
+}
diff --git a/client/src/components/Chat/Input/ChatForm.tsx b/client/src/components/Chat/Input/ChatForm.tsx
index f12284cc7fd..d3cb04e4225 100644
--- a/client/src/components/Chat/Input/ChatForm.tsx
+++ b/client/src/components/Chat/Input/ChatForm.tsx
@@ -1,6 +1,6 @@
-import { useRecoilState } from 'recoil';
 import { useForm } from 'react-hook-form';
-import { memo, useCallback, useRef, useMemo } from 'react';
+import { useRecoilState, useRecoilValue } from 'recoil';
+import { memo, useCallback, useRef, useMemo, useEffect } from 'react';
 import {
   supportsFiles,
   mergeFileConfig,
@@ -8,12 +8,14 @@ import {
   fileConfig as defaultFileConfig,
 } from 'librechat-data-provider';
 import { useChatContext, useAssistantsMapContext } from '~/Providers';
-import { useRequiresKey, useTextarea } from '~/hooks';
+import { useRequiresKey, useTextarea, useSpeechToText } from '~/hooks';
 import { TextareaAutosize } from '~/components/ui';
 import { useGetFileConfig } from '~/data-provider';
 import { cn, removeFocusOutlines } from '~/utils';
 import AttachFile from './Files/AttachFile';
+import AudioRecorder from './AudioRecorder';
 import { mainTextareaId } from '~/common';
+import StreamAudio from './StreamAudio';
 import StopButton from './StopButton';
 import SendButton from './SendButton';
 import FileRow from './Files/FileRow';
@@ -23,6 +25,9 @@ import store from '~/store';
 const ChatForm = ({ index = 0 }) => {
   const submitButtonRef = useRef(null);
   const textAreaRef = useRef(null);
+  const SpeechToText = useRecoilValue(store.SpeechToText);
+  const TextToSpeech = useRecoilValue(store.TextToSpeech);
+  const automaticPlayback = useRecoilValue(store.automaticPlayback);
   const [showStopButton, setShowStopButton] = useRecoilState(store.showStopButtonByIndex(index));
   const [showMentionPopover, setShowMentionPopover] = useRecoilState(
     store.showMentionPopoverFamily(index),
@@ -67,6 +72,24 @@ const ChatForm = ({ index = 0 }) => {
   const { endpoint: _endpoint, endpointType } = conversation ?? { endpoint: null };
   const endpoint = endpointType ?? _endpoint;
 
+  const handleTranscriptionComplete = (text: string) => {
+    if (text) {
+      ask({ text });
+      methods.reset({ text: '' });
+      clearText();
+    }
+  };
+
+  const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } =
+    useSpeechToText(handleTranscriptionComplete);
+
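+  // Mirror dictation output into both the textarea DOM value and the form
+  // state, so react-hook-form validation stays in sync with speech updates.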
+  useEffect(() => {
+    if (textAreaRef.current) {
+      textAreaRef.current.value = speechText;
+      methods.setValue('text', speechText, { shouldValidate: true });
+    }
+  }, [speechText, methods]);
+
   const { data: fileConfig = defaultFileConfig } = useGetFileConfig({
     select: (data) => mergeFileConfig(data),
   });
@@ -87,7 +110,7 @@ const ChatForm = ({ index = 0 }) => {
   const { ref, ...registerProps } = methods.register('text', {
     required: true,
     onChange: (e) => {
-      methods.setValue('text', e.target.value);
+      methods.setValue('text', e.target.value, { shouldValidate: true });
     },
   });
@@ -135,7 +158,8 @@ const ChatForm = ({ index = 0 }) => {
                   supportsFiles[endpointType ?? endpoint ?? ''] && !endpointFileConfig?.disabled
                     ? ' pl-10 md:pl-[55px]'
                     : 'pl-3 md:pl-4',
-                  'm-0 w-full resize-none border-0 bg-transparent py-[10px] pr-10 placeholder-black/50 focus:ring-0 focus-visible:ring-0 dark:bg-transparent dark:placeholder-white/50 md:py-3.5 md:pr-12 ',
+                  'm-0 w-full resize-none border-0 bg-transparent py-[10px] placeholder-black/50 focus:ring-0 focus-visible:ring-0 dark:bg-transparent dark:placeholder-white/50 md:py-3.5 ',
+                  SpeechToText ? 'pr-20 md:pr-[85px]' : 'pr-10 md:pr-12',
                   removeFocusOutlines,
                   'max-h-[65vh] md:max-h-[75vh]',
                 )}
@@ -157,6 +181,16 @@ const ChatForm = ({ index = 0 }) => {
             />
           )
         )}
+        {SpeechToText && (
+          <AudioRecorder
+            isListening={isListening}
+            isLoading={isLoading}
+            startRecording={startRecording}
+            stopRecording={stopRecording}
+            disabled={!!requiresKey}
+          />
+        )}
+        {TextToSpeech && automaticPlayback && <StreamAudio index={index} />}
diff --git a/client/src/components/Chat/Input/Files/Table/DataTable.tsx b/client/src/components/Chat/Input/Files/Table/DataTable.tsx
index 1886ffc8750..91d3bd751df 100644
--- a/client/src/components/Chat/Input/Files/Table/DataTable.tsx
+++ b/client/src/components/Chat/Input/Files/Table/DataTable.tsx
@@ -225,7 +225,7 @@ export default function DataTable<TData, TValue>({ columns, data }: DataTableProps<TData, TValue>) {
       )}
+
       )}
       {isEditableEndpoint && (
       )}
       {regenerateEnabled ? (
       ) : null}
-
+
       ) : null}
diff --git a/client/src/components/Chat/Messages/Message.tsx b/client/src/components/Chat/Messages/Message.tsx
index b1d105a3659..d4d4652aa12 100644
--- a/client/src/components/Chat/Messages/Message.tsx
+++ b/client/src/components/Chat/Messages/Message.tsx
@@ -20,6 +20,7 @@ export default function Message(props: TMessageProps) {
   const {
     ask,
     edit,
+    index,
     isLast,
     enterEdit,
     handleScroll,
@@ -102,6 +103,7 @@ export default function Message(props: TMessageProps) {
             setSiblingIdx={setSiblingIdx}
           />
- + - +
diff --git a/client/src/components/Nav/SettingsTabs/Data/Data.tsx b/client/src/components/Nav/SettingsTabs/Data/Data.tsx
index 6e8b8c0c185..d682f79fad1 100644
--- a/client/src/components/Nav/SettingsTabs/Data/Data.tsx
+++ b/client/src/components/Nav/SettingsTabs/Data/Data.tsx
@@ -1,75 +1,14 @@
+import React, { useState, useRef } from 'react';
 import * as Tabs from '@radix-ui/react-tabs';
-import {
-  useRevokeUserKeyMutation,
-  useRevokeAllUserKeysMutation,
-  useClearConversationsMutation,
-} from 'librechat-data-provider/react-query';
+import { useClearConversationsMutation } from 'librechat-data-provider/react-query';
 import { SettingsTabValues } from 'librechat-data-provider';
-import React, { useState, useCallback, useRef } from 'react';
 import { useConversation, useConversations, useOnClickOutside } from '~/hooks';
+import { RevokeKeysButton } from './RevokeKeysButton';
+import { DeleteCacheButton } from './DeleteCacheButton';
 import ImportConversations from './ImportConversations';
 import { ClearChatsButton } from './ClearChats';
-import DangerButton from '../DangerButton';
 import SharedLinks from './SharedLinks';
 
-export const RevokeKeysButton = ({
-  showText = true,
-  endpoint = '',
-  all = false,
-  disabled = false,
-}: {
-  showText?: boolean;
-  endpoint?: string;
-  all?: boolean;
-  disabled?: boolean;
-}) => {
-  const [confirmRevoke, setConfirmRevoke] = useState(false);
-
-  const revokeKeysMutation = useRevokeAllUserKeysMutation();
-  const revokeKeyMutation = useRevokeUserKeyMutation(endpoint);
-
-  const revokeContentRef = useRef(null);
-  useOnClickOutside(revokeContentRef, () => confirmRevoke && setConfirmRevoke(false), []);
-
-  const revokeAllUserKeys = useCallback(() => {
-    if (confirmRevoke) {
-      revokeKeysMutation.mutate({});
-      setConfirmRevoke(false);
-    } else {
-      setConfirmRevoke(true);
-    }
-  }, [confirmRevoke, revokeKeysMutation]);
-
-  const revokeUserKey = useCallback(() => {
-    if (!endpoint) {
-      return;
-    } else if (confirmRevoke) {
-      revokeKeyMutation.mutate({});
-      setConfirmRevoke(false);
-    } else {
-      setConfirmRevoke(true);
-    }
-  }, [confirmRevoke, revokeKeyMutation, endpoint]);
-
-  const onClick = all ? revokeAllUserKeys : revokeUserKey;
-
-  return (
-
-  );
-};
-
 function Data() {
   const dataTabRef = useRef(null);
   const [confirmClearConvos, setConfirmClearConvos] = useState(false);
@@ -114,7 +53,9 @@ function Data() {
- +
+ +
{ + const [confirmClear, setConfirmClear] = useState(false); + const [isCacheEmpty, setIsCacheEmpty] = useState(true); + const contentRef = useRef(null); + useOnClickOutside(contentRef, () => confirmClear && setConfirmClear(false), []); + + const checkCache = useCallback(async () => { + const cache = await caches.open('tts-responses'); + const keys = await cache.keys(); + setIsCacheEmpty(keys.length === 0); + }, []); + + useEffect(() => { + checkCache(); + }, [confirmClear]); + + const clearCache = useCallback(async () => { + if (confirmClear) { + const cache = await caches.open('tts-responses'); + await cache.keys().then((keys) => Promise.all(keys.map((key) => cache.delete(key)))); + + setConfirmClear(false); + } else { + setConfirmClear(true); + } + }, [confirmClear]); + + return ( + + ); +}; diff --git a/client/src/components/Nav/SettingsTabs/Data/RevokeKeysButton.tsx b/client/src/components/Nav/SettingsTabs/Data/RevokeKeysButton.tsx new file mode 100644 index 00000000000..0eaf58d0044 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Data/RevokeKeysButton.tsx @@ -0,0 +1,64 @@ +import { + useRevokeAllUserKeysMutation, + useRevokeUserKeyMutation, +} from 'librechat-data-provider/react-query'; +import React, { useState, useCallback, useRef } from 'react'; +import { useOnClickOutside } from '~/hooks'; +import DangerButton from '../DangerButton'; + +export const RevokeKeysButton = ({ + showText = true, + endpoint = '', + all = false, + disabled = false, +}: { + showText?: boolean; + endpoint?: string; + all?: boolean; + disabled?: boolean; +}) => { + const [confirmClear, setConfirmClear] = useState(false); + const revokeKeyMutation = useRevokeUserKeyMutation(endpoint); + const revokeKeysMutation = useRevokeAllUserKeysMutation(); + + const contentRef = useRef(null); + useOnClickOutside(contentRef, () => confirmClear && setConfirmClear(false), []); + + const revokeAllUserKeys = useCallback(() => { + if (confirmClear) { + revokeKeysMutation.mutate({}); + setConfirmClear(false); + } else { + setConfirmClear(true); + } + }, [confirmClear, revokeKeysMutation]); + + const revokeUserKey = useCallback(() => { + if (!endpoint) { + return; + } else if (confirmClear) { + revokeKeyMutation.mutate({}); + setConfirmClear(false); + } else { + setConfirmClear(true); + } + }, [confirmClear, revokeKeyMutation, endpoint]); + + const onClick = all ? revokeAllUserKeys : revokeUserKey; + + return ( + + ); +}; diff --git a/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.spec.tsx new file mode 100644 index 00000000000..c57dc122fb0 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.spec.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import ConversationModeSwitch from './ConversationModeSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('ConversationModeSwitch', () => { + /** + * Mock function to set the conversation mode state.
+ */ + let mockSetConversationMode: jest.Mock | ((value: boolean) => void) | undefined; + + beforeEach(() => { + mockSetConversationMode = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('ConversationMode')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('ConversationMode'); + fireEvent.click(switchElement); + + expect(mockSetConversationMode).toHaveBeenCalledWith(true); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.tsx new file mode 100644 index 00000000000..0f76913e8b9 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/ConversationModeSwitch.tsx @@ -0,0 +1,55 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function ConversationModeSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [conversationMode, setConversationMode] = useRecoilState(store.conversationMode); + const [advancedMode, setAdvancedMode] = useRecoilState(store.advancedMode); + const [textToSpeech] = useRecoilState(store.TextToSpeech); + const [, setAutoSendText] = useRecoilState(store.autoSendText); + const [, setDecibelValue] = useRecoilState(store.decibelValue); + const [, setAutoTranscribeAudio] = useRecoilState(store.autoTranscribeAudio); + + const handleCheckedChange = (value: boolean) => { + if (!advancedMode) { + setAutoTranscribeAudio(value); + setAutoSendText(value); + setDecibelValue(-45); + } + setConversationMode(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
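+ // Simple mode: this one switch also turns auto-transcribe and auto-send on or off and resets the silence threshold to -45 dB; advanced mode leaves those settings independent.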
+
+ {localize('com_nav_conversation_mode')} +
+
+ +
+ +
+
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/AutoSendTextSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/AutoSendTextSwitch.tsx new file mode 100644 index 00000000000..aab971252ea --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/AutoSendTextSwitch.tsx @@ -0,0 +1,35 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function AutoSendTextSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [autoSendText, setAutoSendText] = useRecoilState(store.autoSendText); + const [SpeechToText] = useRecoilState(store.SpeechToText); + + const handleCheckedChange = (value: boolean) => { + setAutoSendText(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
+
{localize('com_nav_auto_send_text')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/AutoTranscribeAudioSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/AutoTranscribeAudioSwitch.tsx new file mode 100644 index 00000000000..cf0876d9456 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/AutoTranscribeAudioSwitch.tsx @@ -0,0 +1,37 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function AutoTranscribeAudioSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [autoTranscribeAudio, setAutoTranscribeAudio] = useRecoilState( + store.autoTranscribeAudio, + ); + const [speechToText] = useRecoilState(store.SpeechToText); + + const handleCheckedChange = (value: boolean) => { + setAutoTranscribeAudio(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
+
{localize('com_nav_auto_transcribe_audio')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/DecibelSelector.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/DecibelSelector.tsx new file mode 100755 index 00000000000..3157a7cdcd0 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/DecibelSelector.tsx @@ -0,0 +1,49 @@ +import React from 'react'; +import { useRecoilState, useRecoilValue } from 'recoil'; +import { Slider, InputNumber } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; +import { cn, defaultTextProps, optionText } from '~/utils/'; + +export default function DecibelSelector() { + const localize = useLocalize(); + const speechToText = useRecoilValue(store.SpeechToText); + const [decibelValue, setDecibelValue] = useRecoilState(store.decibelValue); + + return ( +
+
+
{localize('com_nav_db_sensitivity')}
+
+ ({localize('com_endpoint_default_with_num', '-45')}) +
+
+ setDecibelValue(value[0])} + doubleClickHandler={() => setDecibelValue(-45)} + min={-100} + max={-30} + step={1} + className="ml-4 flex h-4 w-24" + disabled={!speechToText} + /> +
+ setDecibelValue(value ? value[0] : 0)} + min={-100} + max={-30} + className={cn( + defaultTextProps, + cn( + optionText, + 'reset-rc-number-input reset-rc-number-input-text-right h-auto w-12 border-0 group-hover/temp:border-gray-200', + ), + )} + /> +
+
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/EngineSTTDropdown.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/EngineSTTDropdown.tsx new file mode 100644 index 00000000000..4b14f1317d9 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/EngineSTTDropdown.tsx @@ -0,0 +1,31 @@ +import { useRecoilState } from 'recoil'; +import { Dropdown } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function EngineSTTDropdown() { + const localize = useLocalize(); + const [endpointSTT, setEndpointSTT] = useRecoilState(store.endpointSTT); + const endpointOptions = [ + { value: 'browser', display: localize('com_nav_browser') }, + { value: 'external', display: localize('com_nav_external') }, + ]; + + const handleSelect = (value: string) => { + setEndpointSTT(value); + }; + + return ( +
+
{localize('com_nav_engine')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/SpeechToTextSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/SpeechToTextSwitch.tsx new file mode 100644 index 00000000000..1aef9b1d9fa --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/SpeechToTextSwitch.tsx @@ -0,0 +1,35 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function SpeechToTextSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [speechToText, setSpeechToText] = useRecoilState(store.SpeechToText); + + const handleCheckedChange = (value: boolean) => { + setSpeechToText(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
+
+ {localize('com_nav_speech_to_text')} +
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoSendTextSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoSendTextSwitch.spec.tsx new file mode 100644 index 00000000000..95b394798d3 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoSendTextSwitch.spec.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import AutoSendTextSwitch from '../AutoSendTextSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('AutoSendTextSwitch', () => { + /** + * Mock function to set the auto-send-text state. + */ + let mockSetAutoSendText: jest.Mock | ((value: boolean) => void) | undefined; + + beforeEach(() => { + mockSetAutoSendText = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('AutoSendText')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('AutoSendText'); + fireEvent.click(switchElement); + + expect(mockSetAutoSendText).toHaveBeenCalledWith(true); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoTranscribeAudioSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoTranscribeAudioSwitch.spec.tsx new file mode 100644 index 00000000000..c7860c6a2d4 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/AutoTranscribeAudioSwitch.spec.tsx @@ -0,0 +1,41 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import AutoTranscribeAudioSwitch from '../AutoTranscribeAudioSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('AutoTranscribeAudioSwitch', () => { + /** + * Mock function to set the auto-send-text state. + */ + let mockSetAutoTranscribeAudio: + | jest.Mock + | ((value: boolean) => void) + | undefined; + + beforeEach(() => { + mockSetAutoTranscribeAudio = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('AutoTranscribeAudio')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('AutoTranscribeAudio'); + fireEvent.click(switchElement); + + expect(mockSetAutoTranscribeAudio).toHaveBeenCalledWith(true); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/SpeechToTextSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/SpeechToTextSwitch.spec.tsx new file mode 100644 index 00000000000..90d503eb785 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/__tests__/SpeechToTextSwitch.spec.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import SpeechToTextSwitch from '../SpeechToTextSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('SpeechToTextSwitch', () => { + /** + * Mock function to set the speech-to-text state. 
+ */ + let mockSetSpeechToText: jest.Mock | ((value: boolean) => void) | undefined; + + beforeEach(() => { + mockSetSpeechToText = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('SpeechToText')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('SpeechToText'); + fireEvent.click(switchElement); + + expect(mockSetSpeechToText).toHaveBeenCalledWith(false); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/STT/index.ts b/client/src/components/Nav/SettingsTabs/Speech/STT/index.ts new file mode 100644 index 00000000000..9ac483af708 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/STT/index.ts @@ -0,0 +1,5 @@ +export { default as AutoSendTextSwitch } from './AutoSendTextSwitch'; +export { default as SpeechToTextSwitch } from './SpeechToTextSwitch'; +export { default as EngineSTTDropdown } from './EngineSTTDropdown'; +export { default as DecibelSelector } from './DecibelSelector'; +export { default as AutoTranscribeAudioSwitch } from './AutoTranscribeAudioSwitch'; diff --git a/client/src/components/Nav/SettingsTabs/Speech/Speech.tsx b/client/src/components/Nav/SettingsTabs/Speech/Speech.tsx new file mode 100644 index 00000000000..d3d8a8951a7 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/Speech.tsx @@ -0,0 +1,93 @@ +import * as Tabs from '@radix-ui/react-tabs'; +import { SettingsTabValues } from 'librechat-data-provider'; +import React, { useState, useRef } from 'react'; +import { useRecoilState } from 'recoil'; +import { useOnClickOutside } from '~/hooks'; +import store from '~/store'; +import ConversationModeSwitch from './ConversationModeSwitch'; +import { + TextToSpeechSwitch, + EngineTTSDropdown, + AutomaticPlayback, + CacheTTSSwitch, + VoiceDropdown, + PlaybackRate, +} from './TTS'; +import { + DecibelSelector, + EngineSTTDropdown, + SpeechToTextSwitch, + AutoSendTextSwitch, + AutoTranscribeAudioSwitch, +} from './STT'; + +function Speech() { + const [confirmClear, setConfirmClear] = useState(false); + const [advancedMode] = useRecoilState(store.advancedMode); + const [autoTranscribeAudio] = useRecoilState(store.autoTranscribeAudio); + + const contentRef = useRef(null); + useOnClickOutside(contentRef, () => confirmClear && setConfirmClear(false), []); + + return ( + +
+
+ +
+
+
+ +
+
+ +
+ {advancedMode && ( +
+ +
+ )} + {autoTranscribeAudio && advancedMode && ( +
+ +
+ )} + {advancedMode && ( +
+ +
+ )} +
+
+ +
+
+ +
+
+ +
+
+ +
+ {advancedMode && ( +
+ +
+ )} + {advancedMode && ( +
+ +
+ )} +
+ + ); +} + +export default React.memo(Speech); diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/AutomaticPlayback.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/AutomaticPlayback.tsx new file mode 100644 index 00000000000..400d43a230a --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/AutomaticPlayback.tsx @@ -0,0 +1,33 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function AutomaticPlayback({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [automaticPlayback, setAutomaticPlayback] = useRecoilState(store.automaticPlayback); + + const handleCheckedChange = (value: boolean) => { + setAutomaticPlayback(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
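+ // When on, each assistant reply is synthesized and played as it streams in; pairs with the TextToSpeech && automaticPlayback check in ChatForm.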
+
{localize('com_nav_automatic_playback')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/CacheTTSSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/CacheTTSSwitch.tsx new file mode 100644 index 00000000000..ac765981070 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/CacheTTSSwitch.tsx @@ -0,0 +1,35 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function CacheTTSSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [cacheTTS, setCacheTTS] = useRecoilState(store.cacheTTS); + const [textToSpeech] = useRecoilState(store.TextToSpeech); + + const handleCheckedChange = (value: boolean) => { + setCacheTTS(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
+
{localize('com_nav_enable_cache_tts')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/EngineTTSDropdown.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/EngineTTSDropdown.tsx new file mode 100644 index 00000000000..852381a48a2 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/EngineTTSDropdown.tsx @@ -0,0 +1,31 @@ +import { useRecoilState } from 'recoil'; +import { Dropdown } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function EngineTTSDropdown() { + const localize = useLocalize(); + const [endpointTTS, setEndpointTTS] = useRecoilState(store.endpointTTS); + const endpointOptions = [ + { value: 'browser', display: localize('com_nav_browser') }, + { value: 'external', display: localize('com_nav_external') }, + ]; + + const handleSelect = (value: string) => { + setEndpointTTS(value); + }; + + return ( +
+
{localize('com_nav_engine')}
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/PlaybackRate.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/PlaybackRate.tsx new file mode 100755 index 00000000000..e16bef62f37 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/PlaybackRate.tsx @@ -0,0 +1,49 @@ +import React from 'react'; +import { useRecoilState, useRecoilValue } from 'recoil'; +import { Slider, InputNumber } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; +import { cn, defaultTextProps, optionText } from '~/utils/'; + +export default function DecibelSelector() { + const localize = useLocalize(); + const textToSpeech = useRecoilValue(store.TextToSpeech); + const [playbackRate, setPlaybackRate] = useRecoilState(store.playbackRate); + + return ( +
+
+
{localize('com_nav_playback_rate')}
+
+ ({localize('com_endpoint_default_with_num', '1')}) +
+
+ setPlaybackRate(value[0])} + doubleClickHandler={() => setPlaybackRate(null)} + min={0.1} + max={2} + step={0.1} + className="ml-4 flex h-4 w-24" + disabled={!textToSpeech} + />
+ setPlaybackRate(value ? value[0] : 0)} + min={0.1} + max={2} + className={cn( + defaultTextProps, + cn( + optionText, + 'reset-rc-number-input reset-rc-number-input-text-right h-auto w-12 border-0 group-hover/temp:border-gray-200', + ), + )} + />
+
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/TextToSpeechSwitch.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/TextToSpeechSwitch.tsx new file mode 100644 index 00000000000..f0b5b8d6dc5 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/TextToSpeechSwitch.tsx @@ -0,0 +1,35 @@ +import { useRecoilState } from 'recoil'; +import { Switch } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function TextToSpeechSwitch({ + onCheckedChange, +}: { + onCheckedChange?: (value: boolean) => void; +}) { + const localize = useLocalize(); + const [TextToSpeech, setTextToSpeech] = useRecoilState(store.TextToSpeech); + + const handleCheckedChange = (value: boolean) => { + setTextToSpeech(value); + if (onCheckedChange) { + onCheckedChange(value); + } + }; + + return ( +
+
+ {localize('com_nav_text_to_speech')} +
+ +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/VoiceDropdown.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/VoiceDropdown.tsx new file mode 100644 index 00000000000..8037ed4086e --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/VoiceDropdown.tsx @@ -0,0 +1,31 @@ +import { useMemo } from 'react'; +import { useRecoilState } from 'recoil'; +import { useVoicesQuery } from '~/data-provider'; +import { Dropdown } from '~/components/ui'; +import { useLocalize } from '~/hooks'; +import store from '~/store'; + +export default function VoiceDropdown() { + const localize = useLocalize(); + const [voice, setVoice] = useRecoilState(store.voice); + const { data } = useVoicesQuery(); + + const voiceOptions = useMemo( + () => (data ?? []).map((v: string) => ({ value: v, display: v })), + [data], + ); + + return ( +
+
{localize('com_nav_voice_select')}
+ setVoice(value)} + options={voiceOptions} + width={220} + position={'left'} + testId="VoiceDropdown" + /> +
+ ); +} diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/CacheTTSSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/CacheTTSSwitch.spec.tsx new file mode 100644 index 00000000000..879e6d505d7 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/CacheTTSSwitch.spec.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import CacheTTSSwitch from '../CacheTTSSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('CacheTTSSwitch', () => { + /** + * Mock function to set the cache-tts state. + */ + let mockSetCacheTTS: jest.Mock | ((value: boolean) => void) | undefined; + + beforeEach(() => { + mockSetCacheTTS = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('CacheTTS')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('CacheTTS'); + fireEvent.click(switchElement); + + expect(mockSetCacheTTS).toHaveBeenCalledWith(false); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/TextToSpeechSwitch.spec.tsx b/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/TextToSpeechSwitch.spec.tsx new file mode 100644 index 00000000000..46b2fc23c0c --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/TextToSpeechSwitch.spec.tsx @@ -0,0 +1,38 @@ +import React from 'react'; +import '@testing-library/jest-dom/extend-expect'; +import { render, fireEvent } from 'test/layout-test-utils'; +import TextToSpeechSwitch from '../TextToSpeechSwitch'; +import { RecoilRoot } from 'recoil'; + +describe('TextToSpeechSwitch', () => { + /** + * Mock function to set the text-to-speech state. 
+ */ + let mockSetTextToSpeech: jest.Mock | ((value: boolean) => void) | undefined; + + beforeEach(() => { + mockSetTextToSpeech = jest.fn(); + }); + + it('renders correctly', () => { + const { getByTestId } = render( + + + , + ); + + expect(getByTestId('TextToSpeech')).toBeInTheDocument(); + }); + + it('calls onCheckedChange when the switch is toggled', () => { + const { getByTestId } = render( + + + , + ); + const switchElement = getByTestId('TextToSpeech'); + fireEvent.click(switchElement); + + expect(mockSetTextToSpeech).toHaveBeenCalledWith(false); + }); +}); diff --git a/client/src/components/Nav/SettingsTabs/Speech/TTS/index.ts b/client/src/components/Nav/SettingsTabs/Speech/TTS/index.ts new file mode 100644 index 00000000000..2fdae1fcf83 --- /dev/null +++ b/client/src/components/Nav/SettingsTabs/Speech/TTS/index.ts @@ -0,0 +1,6 @@ +export { default as AutomaticPlayback } from './AutomaticPlayback'; +export { default as CacheTTSSwitch } from './CacheTTSSwitch'; +export { default as EngineTTSDropdown } from './EngineTTSDropdown'; +export { default as PlaybackRate } from './PlaybackRate'; +export { default as TextToSpeechSwitch } from './TextToSpeechSwitch'; +export { default as VoiceDropdown } from './VoiceDropdown'; diff --git a/client/src/components/Nav/SettingsTabs/index.ts b/client/src/components/Nav/SettingsTabs/index.ts index df008bc8cf6..c0002d91d58 100644 --- a/client/src/components/Nav/SettingsTabs/index.ts +++ b/client/src/components/Nav/SettingsTabs/index.ts @@ -3,5 +3,6 @@ export { default as Messages } from './Messages/Messages'; export { ClearChatsButton } from './General/General'; export { default as Data } from './Data/Data'; export { default as Beta } from './Beta/Beta'; -export { RevokeKeysButton } from './Data/Data'; +export { RevokeKeysButton } from './Data/RevokeKeysButton'; export { default as Account } from './Account/Account'; +export { default as Speech } from './Speech/Speech'; diff --git a/client/src/components/svg/Clipboard.tsx b/client/src/components/svg/Clipboard.tsx index 3ad1f01d29a..a1e6baf3837 100644 --- a/client/src/components/svg/Clipboard.tsx +++ b/client/src/components/svg/Clipboard.tsx @@ -1,14 +1,16 @@ -import React from 'react'; +import { cn } from '~/utils/'; -export default function Clipboard() { +export default function Clipboard({ className = 'icon-md-heavy', size = '1em' }) { return ( + + + + + ); +} diff --git a/client/src/components/svg/RegenerateIcon.tsx b/client/src/components/svg/RegenerateIcon.tsx index d048a6e29f6..47df168deee 100644 --- a/client/src/components/svg/RegenerateIcon.tsx +++ b/client/src/components/svg/RegenerateIcon.tsx @@ -1,13 +1,15 @@ import { cn } from '~/utils'; -export default function RegenerateIcon({ className = '' }: { className?: string }) { +export default function RegenerateIcon({ className = '', size = '1em' }) { return ( + + + + + + + + ); +} diff --git a/client/src/components/svg/VolumeIcon.tsx b/client/src/components/svg/VolumeIcon.tsx new file mode 100644 index 00000000000..c796a71a1fc --- /dev/null +++ b/client/src/components/svg/VolumeIcon.tsx @@ -0,0 +1,21 @@ +import { cn } from '~/utils'; + +export default function VolumeIcon({ className = '', size = '1em' }) { + return ( + + + + ); +} diff --git a/client/src/components/svg/VolumeMuteIcon.tsx b/client/src/components/svg/VolumeMuteIcon.tsx new file mode 100644 index 00000000000..4d5b1ab6b2d --- /dev/null +++ b/client/src/components/svg/VolumeMuteIcon.tsx @@ -0,0 +1,21 @@ +import { cn } from '~/utils'; + +export default function VolumeMuteIcon({ 
className = '', size = '1em' }) { + return ( + + + + ); +} diff --git a/client/src/components/svg/index.ts b/client/src/components/svg/index.ts index d822e090ed0..509f2c91f7f 100644 --- a/client/src/components/svg/index.ts +++ b/client/src/components/svg/index.ts @@ -41,6 +41,9 @@ export { default as CodeyIcon } from './CodeyIcon'; export { default as GeminiIcon } from './GeminiIcon'; export { default as GoogleMinimalIcon } from './GoogleMinimalIcon'; export { default as AnthropicMinimalIcon } from './AnthropicMinimalIcon'; +export { default as ListeningIcon } from './ListeningIcon'; +export { default as VolumeIcon } from './VolumeIcon'; +export { default as VolumeMuteIcon } from './VolumeMuteIcon'; export { default as SendMessageIcon } from './SendMessageIcon'; export { default as UserIcon } from './UserIcon'; export { default as NewChatIcon } from './NewChatIcon'; @@ -49,3 +52,4 @@ export { default as GoogleIconChat } from './GoogleIconChat'; export { default as BirthdayIcon } from './BirthdayIcon'; export { default as AssistantIcon } from './AssistantIcon'; export { default as Sparkles } from './Sparkles'; +export { default as SpeechIcon } from './SpeechIcon'; diff --git a/client/src/components/ui/Dropdown.tsx b/client/src/components/ui/Dropdown.tsx index b23731b145e..76aa09caa06 100644 --- a/client/src/components/ui/Dropdown.tsx +++ b/client/src/components/ui/Dropdown.tsx @@ -1,4 +1,4 @@ -import React, { FC, useContext, useState } from 'react'; +import React, { FC, useState } from 'react'; import { Listbox } from '@headlessui/react'; import { cn } from '~/utils/'; diff --git a/client/src/components/ui/Landing.tsx b/client/src/components/ui/Landing.tsx index ee038445bd5..1953b209783 100644 --- a/client/src/components/ui/Landing.tsx +++ b/client/src/components/ui/Landing.tsx @@ -30,7 +30,7 @@ export default function Landing() {

{config?.appTitle || 'LibreChat'}

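The data-provider changes below register two mutations and one query for the new audio features: speechToText posts recorded audio as FormData and resolves to the transcription, textToSpeech posts the input text and voice and resolves to an ArrayBuffer of encoded audio, and useVoicesQuery fetches the available voice names. The underlying dataService functions are defined in librechat-data-provider, outside this diff; a rough sketch of their likely shape, where the endpoint paths and response details are assumptions:

// Sketch only: the '/api/speech/*' paths and response shapes are assumed,
// not confirmed by this diff.
export async function speechToText(formData: FormData): Promise<{ text: string }> {
  const res = await fetch('/api/speech/stt', { method: 'POST', body: formData });
  if (!res.ok) {
    throw new Error(`STT request failed with status ${res.status}`);
  }
  return res.json(); // e.g. { text: 'transcribed speech' }
}

export async function textToSpeech(formData: FormData): Promise<ArrayBuffer> {
  const res = await fetch('/api/speech/tts', { method: 'POST', body: formData });
  if (!res.ok) {
    throw new Error(`TTS request failed with status ${res.status}`);
  }
  return res.arrayBuffer(); // encoded audio, played back via a Blob URL or MediaSource
}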
diff --git a/client/src/data-provider/mutations.ts b/client/src/data-provider/mutations.ts index 01e3a45ecb1..8db0f6c54f7 100644 --- a/client/src/data-provider/mutations.ts +++ b/client/src/data-provider/mutations.ts @@ -547,6 +547,36 @@ export const useUploadAvatarMutation = ( }); }; +/* Speech to text */ +export const useSpeechToTextMutation = ( + options?: t.SpeechToTextOptions, +): UseMutationResult< + t.SpeechToTextResponse, // response data + unknown, // error + FormData, // request + unknown // context +> => { + return useMutation([MutationKeys.speechToText], { + mutationFn: (variables: FormData) => dataService.speechToText(variables), + ...(options || {}), + }); +}; + +/* Text to speech */ +export const useTextToSpeechMutation = ( + options?: t.TextToSpeechOptions, +): UseMutationResult< + ArrayBuffer, // response data + unknown, // error + FormData, // request + unknown // context +> => { + return useMutation([MutationKeys.textToSpeech], { + mutationFn: (variables: FormData) => dataService.textToSpeech(variables), + ...(options || {}), + }); +}; + /** * ASSISTANTS */ diff --git a/client/src/data-provider/queries.ts b/client/src/data-provider/queries.ts index 5358e39bb4b..69061e1532b 100644 --- a/client/src/data-provider/queries.ts +++ b/client/src/data-provider/queries.ts @@ -9,6 +9,7 @@ import type { UseInfiniteQueryOptions, QueryObserverResult, UseQueryOptions, + UseQueryResult, } from '@tanstack/react-query'; import type t from 'librechat-data-provider'; import type { @@ -414,3 +415,10 @@ export const useFileDownload = (userId?: string, file_id?: string): QueryObserve }, ); }; + +/** STT/TTS */ + +/* Text to speech voices */ +export const useVoicesQuery = (): UseQueryResult => { + return useQuery([QueryKeys.voices], () => dataService.getVoices()); +}; diff --git a/client/src/hooks/Audio/MediaSourceAppender.ts b/client/src/hooks/Audio/MediaSourceAppender.ts new file mode 100644 index 00000000000..aef4f42edf5 --- /dev/null +++ b/client/src/hooks/Audio/MediaSourceAppender.ts @@ -0,0 +1,41 @@ +export class MediaSourceAppender { + private readonly mediaSource = new MediaSource(); + private readonly audioChunks: ArrayBuffer[] = []; + + private sourceBuffer?: SourceBuffer; + + constructor(type: string) { + this.mediaSource.addEventListener('sourceopen', async () => { + this.sourceBuffer = this.mediaSource.addSourceBuffer(type); + + this.sourceBuffer.addEventListener('updateend', () => { + this.tryAppendNextChunk(); + }); + }); + } + + private tryAppendNextChunk() { + if (this.sourceBuffer != null && !this.sourceBuffer.updating && this.audioChunks.length > 0) { + this.sourceBuffer.appendBuffer(this.audioChunks.shift()!); + } + } + + public addBase64Data(base64Data: string) { + this.addData(Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0)).buffer); + } + + public addData(data: ArrayBuffer) { + this.audioChunks.push(data); + this.tryAppendNextChunk(); + } + + public close() { + if (this.mediaSource.readyState === 'open') { + this.mediaSource.endOfStream(); + } + } + + public get mediaSourceUrl() { + return URL.createObjectURL(this.mediaSource); + } +} diff --git a/client/src/hooks/Audio/index.ts b/client/src/hooks/Audio/index.ts new file mode 100644 index 00000000000..c294db34527 --- /dev/null +++ b/client/src/hooks/Audio/index.ts @@ -0,0 +1,3 @@ +export * from './MediaSourceAppender'; +export { default as useCustomAudioRef } from './useCustomAudioRef'; +export { default as usePauseGlobalAudio } from './usePauseGlobalAudio'; diff --git 
a/client/src/hooks/Audio/useCustomAudioRef.ts b/client/src/hooks/Audio/useCustomAudioRef.ts new file mode 100644 index 00000000000..e2163567e38 --- /dev/null +++ b/client/src/hooks/Audio/useCustomAudioRef.ts @@ -0,0 +1,98 @@ +import { useEffect, useRef } from 'react'; + +interface CustomAudioElement extends HTMLAudioElement { + customStarted?: boolean; + customEnded?: boolean; + customPaused?: boolean; + customProps?: { + customStarted?: boolean; + customEnded?: boolean; + customPaused?: boolean; + }; +} + +type TCustomAudioResult = { audioRef: React.MutableRefObject }; + +export default function useCustomAudioRef({ + setIsPlaying, +}: { + setIsPlaying: (isPlaying: boolean) => void; +}): TCustomAudioResult { + const audioRef = useRef(null); + useEffect(() => { + let lastTimeUpdate: number | null = null; + let sameTimeUpdateCount = 0; + + const handleEnded = () => { + setIsPlaying(false); + console.log('global audio ended'); + if (audioRef.current) { + audioRef.current.customEnded = true; + URL.revokeObjectURL(audioRef.current.src); + } + }; + + const handleStart = () => { + setIsPlaying(true); + console.log('global audio started'); + if (audioRef.current) { + audioRef.current.customStarted = true; + } + }; + + const handlePause = () => { + console.log('global audio paused'); + if (audioRef.current) { + audioRef.current.customPaused = true; + } + }; + + const handleTimeUpdate = () => { + if (audioRef.current) { + const currentTime = audioRef.current.currentTime; + // console.log('Current time: ', currentTime); + + if (currentTime === lastTimeUpdate) { + sameTimeUpdateCount += 1; + } else { + sameTimeUpdateCount = 0; + } + + lastTimeUpdate = currentTime; + + if (sameTimeUpdateCount >= 1) { + console.log('Detected end of audio based on time update'); + audioRef.current.pause(); + handleEnded(); + } + } + }; + + const audioElement = audioRef.current; + + if (audioRef.current) { + audioRef.current.addEventListener('ended', handleEnded); + audioRef.current.addEventListener('play', handleStart); + audioRef.current.addEventListener('pause', handlePause); + audioRef.current.addEventListener('timeupdate', handleTimeUpdate); + + audioRef.current.customProps = { + customStarted: false, + customEnded: false, + customPaused: false, + }; + } + + return () => { + if (audioElement) { + audioElement.removeEventListener('ended', handleEnded); + audioElement.removeEventListener('play', handleStart); + audioElement.removeEventListener('pause', handlePause); + audioElement.removeEventListener('timeupdate', handleTimeUpdate); + URL.revokeObjectURL(audioElement.src); + } + }; + }, [setIsPlaying]); + + return { audioRef }; +} diff --git a/client/src/hooks/Audio/usePauseGlobalAudio.ts b/client/src/hooks/Audio/usePauseGlobalAudio.ts new file mode 100644 index 00000000000..a36f66c89f8 --- /dev/null +++ b/client/src/hooks/Audio/usePauseGlobalAudio.ts @@ -0,0 +1,37 @@ +import { useCallback } from 'react'; +import { useRecoilState, useSetRecoilState } from 'recoil'; +import { globalAudioId } from '~/common'; +import store from '~/store'; + +function usePauseGlobalAudio(index = 0) { + /* Global Audio Variables */ + const setAudioRunId = useSetRecoilState(store.audioRunFamily(index)); + const setIsGlobalAudioFetching = useSetRecoilState(store.globalAudioFetchingFamily(index)); + const [globalAudioURL, setGlobalAudioURL] = useRecoilState(store.globalAudioURLFamily(index)); + const setGlobalIsPlaying = useSetRecoilState(store.globalAudioPlayingFamily(index)); + + const pauseGlobalAudio = useCallback(() => { + if 
(globalAudioURL) { + const globalAudio = document.getElementById(globalAudioId); + if (globalAudio) { + console.log('Pausing global audio', globalAudioURL); + (globalAudio as HTMLAudioElement).pause(); + setGlobalIsPlaying(false); + } + URL.revokeObjectURL(globalAudioURL); + setIsGlobalAudioFetching(false); + setGlobalAudioURL(null); + setAudioRunId(null); + } + }, [ + globalAudioURL, + setGlobalAudioURL, + setGlobalIsPlaying, + setIsGlobalAudioFetching, + setAudioRunId, + ]); + + return { pauseGlobalAudio }; +} + +export default usePauseGlobalAudio; diff --git a/client/src/hooks/Input/index.ts b/client/src/hooks/Input/index.ts index 5b78eb4f318..3593bd3f806 100644 --- a/client/src/hooks/Input/index.ts +++ b/client/src/hooks/Input/index.ts @@ -4,3 +4,5 @@ export { default as useTextarea } from './useTextarea'; export { default as useCombobox } from './useCombobox'; export { default as useRequiresKey } from './useRequiresKey'; export { default as useMultipleKeys } from './useMultipleKeys'; +export { default as useSpeechToText } from './useSpeechToText'; +export { default as useTextToSpeech } from './useTextToSpeech'; diff --git a/client/src/hooks/Input/useSpeechToText.ts b/client/src/hooks/Input/useSpeechToText.ts new file mode 100644 index 00000000000..fd927ce35dd --- /dev/null +++ b/client/src/hooks/Input/useSpeechToText.ts @@ -0,0 +1,83 @@ +import { useState, useEffect } from 'react'; +import useSpeechToTextBrowser from './useSpeechToTextBrowser'; +import useSpeechToTextExternal from './useSpeechToTextExternal'; +import { useRecoilState } from 'recoil'; +import store from '~/store'; + +const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => { + const [endpointSTT] = useRecoilState(store.endpointSTT); + const useExternalSpeechToText = endpointSTT === 'external'; + const [animatedText, setAnimatedText] = useState(''); + + const { + isListening: speechIsListeningBrowser, + isLoading: speechIsLoadingBrowser, + text: speechTextBrowser, + startRecording: startSpeechRecordingBrowser, + stopRecording: stopSpeechRecordingBrowser, + } = useSpeechToTextBrowser(); + + const { + isListening: speechIsListeningExternal, + isLoading: speechIsLoadingExternal, + text: speechTextExternal, + externalStartRecording: startSpeechRecordingExternal, + externalStopRecording: stopSpeechRecordingExternal, + clearText, + } = useSpeechToTextExternal(handleTranscriptionComplete); + + const isListening = useExternalSpeechToText + ? speechIsListeningExternal + : speechIsListeningBrowser; + const isLoading = useExternalSpeechToText ? speechIsLoadingExternal : speechIsLoadingBrowser; + const speechTextForm = useExternalSpeechToText ? speechTextExternal : speechTextBrowser; + const startRecording = useExternalSpeechToText + ? startSpeechRecordingExternal + : startSpeechRecordingBrowser; + const stopRecording = useExternalSpeechToText + ? stopSpeechRecordingExternal + : stopSpeechRecordingBrowser; + const speechText = + isListening || (speechTextExternal && speechTextExternal.length > 0) + ? 
speechTextExternal + : speechTextForm || ''; + + const animateTextTyping = (text: string) => { + const totalDuration = 2000; + const frameRate = 60; + const totalFrames = totalDuration / (1000 / frameRate); + const charsPerFrame = Math.ceil(text.length / totalFrames); + let currentIndex = 0; + + const animate = () => { + currentIndex += charsPerFrame; + const currentText = text.substring(0, currentIndex); + setAnimatedText(currentText); + + if (currentIndex < text.length) { + requestAnimationFrame(animate); + } else { + setAnimatedText(text); + } + }; + + requestAnimationFrame(animate); + }; + + useEffect(() => { + if (speechText) { + animateTextTyping(speechText); + } + }, [speechText]); + + return { + isListening, + isLoading, + startRecording, + stopRecording, + speechText: animatedText, + clearText, + }; +}; + +export default useSpeechToText; diff --git a/client/src/hooks/Input/useSpeechToTextBrowser.ts b/client/src/hooks/Input/useSpeechToTextBrowser.ts new file mode 100644 index 00000000000..9f414568992 --- /dev/null +++ b/client/src/hooks/Input/useSpeechToTextBrowser.ts @@ -0,0 +1,52 @@ +import { useEffect } from 'react'; +import { useRecoilState } from 'recoil'; +import { useToastContext } from '~/Providers'; +import store from '~/store'; +import SpeechRecognition, { useSpeechRecognition } from 'react-speech-recognition'; + +const useSpeechToTextBrowser = () => { + const { showToast } = useToastContext(); + const [endpointSTT] = useRecoilState(store.endpointSTT); + + const { transcript, listening, resetTranscript, browserSupportsSpeechRecognition } = + useSpeechRecognition(); + + const toggleListening = () => { + if (browserSupportsSpeechRecognition) { + if (listening) { + SpeechRecognition.stopListening(); + } else { + SpeechRecognition.startListening(); + } + } else { + showToast({ + message: 'Browser does not support SpeechRecognition', + status: 'error', + }); + } + }; + + useEffect(() => { + const handleKeyDown = (e: KeyboardEvent) => { + if (e.shiftKey && e.altKey && e.code === 'KeyL' && endpointSTT === 'browser') { + toggleListening(); + } + }; + + window.addEventListener('keydown', handleKeyDown); + return () => window.removeEventListener('keydown', handleKeyDown); + }, []); + + return { + isListening: listening, + isLoading: false, + text: transcript, + startRecording: toggleListening, + stopRecording: () => { + SpeechRecognition.stopListening(); + resetTranscript(); + }, + }; +}; + +export default useSpeechToTextBrowser; diff --git a/client/src/hooks/Input/useSpeechToTextExternal.ts b/client/src/hooks/Input/useSpeechToTextExternal.ts new file mode 100644 index 00000000000..3d8eca360cd --- /dev/null +++ b/client/src/hooks/Input/useSpeechToTextExternal.ts @@ -0,0 +1,238 @@ +import { useState, useEffect, useRef } from 'react'; +import { useRecoilState } from 'recoil'; +import { useSpeechToTextMutation } from '~/data-provider'; +import { useToastContext } from '~/Providers'; +import store from '~/store'; + +const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => { + const { showToast } = useToastContext(); + const [endpointSTT] = useRecoilState(store.endpointSTT); + const [speechToText] = useRecoilState(store.SpeechToText); + const [autoTranscribeAudio] = useRecoilState(store.autoTranscribeAudio); + const [autoSendText] = useRecoilState(store.autoSendText); + const [text, setText] = useState(''); + const [isListening, setIsListening] = useState(false); + const [permission, setPermission] = useState(false); + const [audioChunks, setAudioChunks] = 
useState([]); + const [isRequestBeingMade, setIsRequestBeingMade] = useState(false); + const [minDecibels] = useRecoilState(store.decibelValue); + const mediaRecorderRef = useRef(null); + const audioStream = useRef(null); + const audioContextRef = useRef(null); + const animationFrameIdRef = useRef(null); + + const { mutate: processAudio, isLoading: isProcessing } = useSpeechToTextMutation({ + onSuccess: (data) => { + const extractedText = data.text; + setText(extractedText); + setIsRequestBeingMade(false); + if (autoSendText && speechToText && extractedText.length > 0) { + setTimeout(() => { + onTranscriptionComplete(extractedText); + }, 3000); + } + }, + onError: () => { + showToast({ + message: 'An error occurred while processing the audio, maybe the audio was too short', + status: 'error', + }); + setIsRequestBeingMade(false); + }, + }); + + const cleanup = () => { + if (mediaRecorderRef.current) { + mediaRecorderRef.current.removeEventListener('dataavailable', handleDataAvailable); + mediaRecorderRef.current.removeEventListener('stop', handleStop); + mediaRecorderRef.current = null; + } + }; + + const clearText = () => { + setText(''); + }; + + const getMicrophonePermission = async () => { + try { + const streamData = await navigator.mediaDevices.getUserMedia({ + audio: true, + video: false, + }); + setPermission(true); + audioStream.current = streamData ?? null; + } catch (err) { + setPermission(false); + } + }; + + const handleDataAvailable = (event: BlobEvent) => { + if (event.data.size > 0) { + audioChunks.push(event.data); + } else { + showToast({ message: 'No audio data available', status: 'warning' }); + } + }; + + const handleStop = () => { + if (audioChunks.length > 0) { + const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); + + setAudioChunks([]); + + const formData = new FormData(); + formData.append('audio', audioBlob, 'audio.wav'); + setIsRequestBeingMade(true); + cleanup(); + processAudio(formData); + } else { + showToast({ message: 'The audio was too short', status: 'warning' }); + } + }; + + const monitorSilence = (stream: MediaStream, stopRecording: () => void) => { + const audioContext = new AudioContext(); + const audioStreamSource = audioContext.createMediaStreamSource(stream); + const analyser = audioContext.createAnalyser(); + analyser.minDecibels = minDecibels; + audioStreamSource.connect(analyser); + + const bufferLength = analyser.frequencyBinCount; + const domainData = new Uint8Array(bufferLength); + let lastSoundTime = Date.now(); + + const detectSound = () => { + analyser.getByteFrequencyData(domainData); + const isSoundDetected = domainData.some((value) => value > 0); + + if (isSoundDetected) { + lastSoundTime = Date.now(); + } + + const timeSinceLastSound = Date.now() - lastSoundTime; + const isOverSilenceThreshold = timeSinceLastSound > 3000; + + if (isOverSilenceThreshold) { + stopRecording(); + return; + } + + animationFrameIdRef.current = window.requestAnimationFrame(detectSound); + }; + + animationFrameIdRef.current = window.requestAnimationFrame(detectSound); + }; + + const startRecording = async () => { + if (isRequestBeingMade) { + showToast({ message: 'A request is already being made. 
Please wait.', status: 'warning' }); + return; + } + + if (!audioStream.current) { + await getMicrophonePermission(); + } + + if (audioStream.current) { + try { + setAudioChunks([]); + mediaRecorderRef.current = new MediaRecorder(audioStream.current); + mediaRecorderRef.current.addEventListener('dataavailable', handleDataAvailable); + mediaRecorderRef.current.addEventListener('stop', handleStop); + mediaRecorderRef.current.start(100); + if (!audioContextRef.current && autoTranscribeAudio && speechToText) { + monitorSilence(audioStream.current, stopRecording); + } + setIsListening(true); + } catch (error) { + showToast({ message: `Error starting recording: ${error}`, status: 'error' }); + } + } else { + showToast({ message: 'Microphone permission not granted', status: 'error' }); + } + }; + + const stopRecording = () => { + if (!mediaRecorderRef.current) { + return; + } + + if (mediaRecorderRef.current.state === 'recording') { + mediaRecorderRef.current.stop(); + + audioStream.current?.getTracks().forEach((track) => track.stop()); + audioStream.current = null; + + if (animationFrameIdRef.current !== null) { + window.cancelAnimationFrame(animationFrameIdRef.current); + animationFrameIdRef.current = null; + } + + setIsListening(false); + } else { + showToast({ message: 'MediaRecorder is not recording', status: 'error' }); + } + }; + + const externalStartRecording = () => { + if (isListening) { + showToast({ message: 'Already listening. Please stop recording first.', status: 'warning' }); + return; + } + + startRecording(); + }; + + const externalStopRecording = () => { + if (!isListening) { + showToast({ + message: 'Not currently recording. Please start recording first.', + status: 'warning', + }); + return; + } + + stopRecording(); + }; + + const handleKeyDown = async (e: KeyboardEvent) => { + if (e.shiftKey && e.altKey && e.code === 'KeyL' && endpointSTT !== 'browser') { + if (!window.MediaRecorder) { + showToast({ message: 'MediaRecorder is not supported in this browser', status: 'error' }); + return; + } + + if (permission === false) { + await getMicrophonePermission(); + } + + if (isListening) { + stopRecording(); + } else { + startRecording(); + } + + e.preventDefault(); + } + }; + + useEffect(() => { + window.addEventListener('keydown', handleKeyDown); + + return () => { + window.removeEventListener('keydown', handleKeyDown); + }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isListening]); + + return { + isListening, + isLoading: isProcessing, + text, + externalStartRecording, + externalStopRecording, + clearText, + }; +}; + +export default useSpeechToTextExternal; diff --git a/client/src/hooks/Input/useTextToSpeech.ts b/client/src/hooks/Input/useTextToSpeech.ts new file mode 100644 index 00000000000..0c1680ee3f4 --- /dev/null +++ b/client/src/hooks/Input/useTextToSpeech.ts @@ -0,0 +1,67 @@ +import { useRef } from 'react'; +import useTextToSpeechBrowser from './useTextToSpeechBrowser'; +import useTextToSpeechExternal from './useTextToSpeechExternal'; +import { usePauseGlobalAudio } from '../Audio'; +import { useRecoilState } from 'recoil'; +import store from '~/store'; + +const useTextToSpeech = (message: string, isLast: boolean, index = 0) => { + const [endpointTTS] = useRecoilState(store.endpointTTS); + const useExternalTextToSpeech = endpointTTS === 'external'; + + const { + generateSpeechLocal: generateSpeechLocal, + cancelSpeechLocal: cancelSpeechLocal, + isSpeaking: isSpeakingLocal, + } = useTextToSpeechBrowser(); + + const { + generateSpeechExternal: 
generateSpeechExternal, + cancelSpeech: cancelSpeechExternal, + isSpeaking: isSpeakingExternal, + isLoading: isLoading, + } = useTextToSpeechExternal(isLast, index); + const { pauseGlobalAudio } = usePauseGlobalAudio(index); + + const generateSpeech = useExternalTextToSpeech ? generateSpeechExternal : generateSpeechLocal; + const cancelSpeech = useExternalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal; + const isSpeaking = useExternalTextToSpeech ? isSpeakingExternal : isSpeakingLocal; + + const isMouseDownRef = useRef(false); + const timerRef = useRef(undefined); + + const handleMouseDown = () => { + isMouseDownRef.current = true; + timerRef.current = window.setTimeout(() => { + if (isMouseDownRef.current) { + generateSpeech(message, true); + } + }, 1000); + }; + + const handleMouseUp = () => { + isMouseDownRef.current = false; + if (timerRef.current) { + window.clearTimeout(timerRef.current); + } + }; + + const toggleSpeech = () => { + if (isSpeaking) { + cancelSpeech(); + pauseGlobalAudio(); + } else { + generateSpeech(message, false); + } + }; + + return { + handleMouseDown, + handleMouseUp, + toggleSpeech, + isSpeaking, + isLoading, + }; +}; + +export default useTextToSpeech; diff --git a/client/src/hooks/Input/useTextToSpeechBrowser.ts b/client/src/hooks/Input/useTextToSpeechBrowser.ts new file mode 100644 index 00000000000..8e54e8930c4 --- /dev/null +++ b/client/src/hooks/Input/useTextToSpeechBrowser.ts @@ -0,0 +1,26 @@ +import { useState } from 'react'; + +function useTextToSpeechBrowser() { + const [isSpeaking, setIsSpeaking] = useState(false); + + const generateSpeechLocal = (text: string) => { + const synth = window.speechSynthesis; + synth.cancel(); + const utterance = new SpeechSynthesisUtterance(text); + utterance.onend = () => { + setIsSpeaking(false); + }; + setIsSpeaking(true); + synth.speak(utterance); + }; + + const cancelSpeechLocal = () => { + const synth = window.speechSynthesis; + synth.cancel(); + setIsSpeaking(false); + }; + + return { generateSpeechLocal, cancelSpeechLocal, isSpeaking }; +} + +export default useTextToSpeechBrowser; diff --git a/client/src/hooks/Input/useTextToSpeechExternal.ts b/client/src/hooks/Input/useTextToSpeechExternal.ts new file mode 100644 index 00000000000..709b4606d78 --- /dev/null +++ b/client/src/hooks/Input/useTextToSpeechExternal.ts @@ -0,0 +1,155 @@ +import { useRecoilValue } from 'recoil'; +import { useCallback, useEffect, useState, useMemo } from 'react'; +import { useTextToSpeechMutation } from '~/data-provider'; +import { useToastContext } from '~/Providers'; +import store from '~/store'; + +const createFormData = (text: string, voice: string) => { + const formData = new FormData(); + formData.append('input', text); + formData.append('voice', voice); + return formData; +}; + +function useTextToSpeechExternal(isLast: boolean, index = 0) { + const { showToast } = useToastContext(); + const voice = useRecoilValue(store.voice); + const cacheTTS = useRecoilValue(store.cacheTTS); + const playbackRate = useRecoilValue(store.playbackRate); + + const [text, setText] = useState(null); + const [downloadFile, setDownloadFile] = useState(false); + const [isLocalSpeaking, setIsSpeaking] = useState(false); + const [blobUrl, setBlobUrl] = useState(null); + const [audio, setAudio] = useState(null); + + /* Global Audio Variables */ + const globalIsFetching = useRecoilValue(store.globalAudioFetchingFamily(index)); + const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index)); + + const playAudio = (blobUrl: string) => { 
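+ // Used for cached responses: wraps the blob URL in a fresh Audio element, applies the user's playbackRate, and retries play() once if it was interrupted by a pause() call.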
+ const newAudio = new Audio(blobUrl); + if (playbackRate && playbackRate !== 1) { + newAudio.playbackRate = playbackRate; + } + + const playPromise = () => newAudio.play().then(() => setIsSpeaking(true)); + + playPromise().catch((error: Error) => { + if ( + error?.message && + error.message.includes('The play() request was interrupted by a call to pause()') + ) { + return playPromise().catch(console.error); + } + console.error(error); + showToast({ message: `Error playing audio: ${error.message}`, status: 'error' }); + }); + + newAudio.onended = () => { + console.log('Target message audio ended'); + URL.revokeObjectURL(blobUrl); + setIsSpeaking(false); + }; + + setAudio(newAudio); + setBlobUrl(blobUrl); + }; + + const downloadAudio = (blobUrl: string) => { + const a = document.createElement('a'); + a.href = blobUrl; + a.download = 'audio.mp3'; + a.click(); + setDownloadFile(false); + }; + + const { mutate: processAudio, isLoading: isProcessing } = useTextToSpeechMutation({ + onSuccess: async (data: ArrayBuffer) => { + try { + const mediaSource = new MediaSource(); + const audio = new Audio(); + audio.src = URL.createObjectURL(mediaSource); + audio.autoplay = true; + + mediaSource.onsourceopen = () => { + const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg'); + sourceBuffer.appendBuffer(data); + }; + + audio.onended = () => { + URL.revokeObjectURL(audio.src); + setIsSpeaking(false); + }; + + setAudio(audio); + + if (cacheTTS) { + const cache = await caches.open('tts-responses'); + const request = new Request(text!); + const response = new Response(new Blob([data], { type: 'audio/mpeg' })); + cache.put(request, response); + } + + if (downloadFile) { + downloadAudio(audio.src); + } + } catch (error) { + showToast({ + message: `Error processing audio: ${(error as Error).message}`, + status: 'error', + }); + } + }, + onError: (error: unknown) => { + showToast({ message: `Error: ${(error as Error).message}`, status: 'error' }); + }, + }); + + const generateSpeechExternal = async (text: string, download: boolean) => { + setText(text); + const cachedResponse = await getCachedResponse(text); + + if (cachedResponse && cacheTTS) { + handleCachedResponse(cachedResponse, download); + } else { + const formData = createFormData(text, voice); + setDownloadFile(download); + processAudio(formData); + } + }; + + const getCachedResponse = async (text: string) => await caches.match(text); + + const handleCachedResponse = async (cachedResponse: Response, download: boolean) => { + const audioBlob = await cachedResponse.blob(); + const blobUrl = URL.createObjectURL(audioBlob); + if (download) { + downloadAudio(blobUrl); + } else { + playAudio(blobUrl); + } + }; + + const cancelSpeech = useCallback(() => { + if (audio) { + audio.pause(); + blobUrl && URL.revokeObjectURL(blobUrl); + setIsSpeaking(false); + } + }, [audio, blobUrl]); + + useEffect(() => cancelSpeech, [cancelSpeech]); + + const isLoading = useMemo(() => { + return isProcessing || (isLast && globalIsFetching && !globalIsPlaying); + }, [isProcessing, globalIsFetching, globalIsPlaying, isLast]); + + const isSpeaking = useMemo(() => { + return isLocalSpeaking || (isLast && globalIsPlaying); + }, [isLocalSpeaking, globalIsPlaying, isLast]); + + return { generateSpeechExternal, cancelSpeech, isLoading, isSpeaking }; +} + +export default useTextToSpeechExternal; diff --git a/client/src/hooks/Messages/useMessageHelpers.tsx b/client/src/hooks/Messages/useMessageHelpers.tsx index c8b86522afd..412ab5ab459 100644 --- 
a/client/src/hooks/Messages/useMessageHelpers.tsx +++ b/client/src/hooks/Messages/useMessageHelpers.tsx @@ -10,6 +10,7 @@ export default function useMessageHelpers(props: TMessageProps) { const { ask, + index, regenerate, isSubmitting, conversation, @@ -71,6 +72,7 @@ export default function useMessageHelpers(props: TMessageProps) { return { ask, edit, + index, isLast, assistant, enterEdit, diff --git a/client/src/hooks/SSE/useSSE.ts b/client/src/hooks/SSE/useSSE.ts index 9e8c34d09ee..50f88c891aa 100644 --- a/client/src/hooks/SSE/useSSE.ts +++ b/client/src/hooks/SSE/useSSE.ts @@ -61,6 +61,7 @@ type TSyncData = { export default function useSSE(submission: TSubmission | null, index = 0) { const queryClient = useQueryClient(); const genTitle = useGenTitleMutation(); + const setActiveRunId = useSetRecoilState(store.activeRunFamily(index)); const { conversationId: paramId } = useParams(); const { token, isAuthenticated } = useAuthContext(); @@ -86,7 +87,7 @@ export default function useSSE(submission: TSubmission | null, index = 0) { (data: string, submission: TSubmission) => { const { messages, - message, + userMessage, plugin, plugins, initialResponse, @@ -99,8 +100,6 @@ export default function useSSE(submission: TSubmission | null, index = 0) { { ...initialResponse, text: data, - parentMessageId: message?.overrideParentMessageId ?? null, - messageId: message?.overrideParentMessageId + '_', plugin: plugin ?? null, plugins: plugins ?? [], // unfinished: true @@ -109,12 +108,10 @@ export default function useSSE(submission: TSubmission | null, index = 0) { } else { setMessages([ ...messages, - message, + userMessage, { ...initialResponse, text: data, - parentMessageId: message?.messageId, - messageId: message?.messageId + '_', plugin: plugin ?? null, plugins: plugins ?? [], // unfinished: true @@ -175,9 +172,9 @@ export default function useSSE(submission: TSubmission | null, index = 0) { const syncHandler = useCallback( (data: TSyncData, submission: TSubmission) => { const { conversationId, thread_id, responseMessage, requestMessage } = data; - const { initialResponse, messages: _messages, message } = submission; + const { initialResponse, messages: _messages, userMessage } = submission; - const messages = _messages.filter((msg) => msg.messageId !== message.messageId); + const messages = _messages.filter((msg) => msg.messageId !== userMessage.messageId); setMessages([ ...messages, @@ -229,35 +226,24 @@ export default function useSSE(submission: TSubmission | null, index = 0) { const createdHandler = useCallback( (data: TResData, submission: TSubmission) => { - const { messages, message, initialResponse, isRegenerate = false } = submission; - + const { messages, userMessage, isRegenerate = false } = submission; + const initialResponse = { + ...submission.initialResponse, + parentMessageId: userMessage?.messageId, + messageId: userMessage?.messageId + '_', + }; if (isRegenerate) { - setMessages([ - ...messages, - { - ...initialResponse, - parentMessageId: message?.overrideParentMessageId ?? 
null, - messageId: message?.overrideParentMessageId + '_', - }, - ]); + setMessages([...messages, initialResponse]); } else { - setMessages([ - ...messages, - message, - { - ...initialResponse, - parentMessageId: message?.messageId, - messageId: message?.messageId + '_', - }, - ]); + setMessages([...messages, userMessage, initialResponse]); } - const { conversationId, parentMessageId } = message; + const { conversationId, parentMessageId } = userMessage; let update = {} as TConversation; setConversation((prevState) => { let title = prevState?.title; - const parentId = isRegenerate ? message?.overrideParentMessageId : parentMessageId; + const parentId = isRegenerate ? userMessage?.overrideParentMessageId : parentMessageId; if (parentId !== Constants.NO_PARENT && title?.toLowerCase()?.includes('new chat')) { const convos = queryClient.getQueryData([QueryKeys.allConversations]); const cachedConvo = getConversationById(convos, conversationId); @@ -342,11 +328,11 @@ export default function useSSE(submission: TSubmission | null, index = 0) { const errorHandler = useCallback( ({ data, submission }: { data?: TResData; submission: TSubmission }) => { - const { messages, message, initialResponse } = submission; + const { messages, userMessage, initialResponse } = submission; setCompleted((prev) => new Set(prev.add(initialResponse.messageId))); - const conversationId = message?.conversationId ?? submission?.conversationId; + const conversationId = userMessage?.conversationId ?? submission?.conversationId; const parseErrorResponse = (data: TResData | Partial<TMessage>) => { const metadata = data['responseMessage'] ?? data; @@ -354,7 +340,7 @@ ...initialResponse, ...metadata, error: true, - parentMessageId: message?.messageId, + parentMessageId: userMessage?.messageId, }; if (!errorMessage.messageId) { @@ -371,7 +357,7 @@ ...submission, conversationId: convoId, }); - setMessages([...messages, message, errorResponse]); + setMessages([...messages, userMessage, errorResponse]); newConversation({ template: { conversationId: convoId }, preset: tPresetSchema.parse(submission?.conversation), @@ -383,7 +369,7 @@ if (!conversationId && !data.conversationId) { const convoId = v4(); const errorResponse = parseErrorResponse(data); - setMessages([...messages, message, errorResponse]); + setMessages([...messages, userMessage, errorResponse]); newConversation({ template: { conversationId: convoId }, preset: tPresetSchema.parse(submission?.conversation), @@ -392,7 +378,7 @@ return; } else if (!data.conversationId) { const errorResponse = parseErrorResponse(data); - setMessages([...messages, message, errorResponse]); + setMessages([...messages, userMessage, errorResponse]); setIsSubmitting(false); return; } @@ -401,10 +387,10 @@ const errorResponse = tMessageSchema.parse({ ...data, error: true, - parentMessageId: message?.messageId, + parentMessageId: userMessage?.messageId, }); - setMessages([...messages, message, errorResponse]); + setMessages([...messages, userMessage, errorResponse]); if (data.conversationId && paramId === 'new') { newConversation({ template: { conversationId: data.conversationId }, @@ -466,7 +452,7 @@ export default function 
useSSE(submission: TSubmission | null, index = 0) { }; const data = { - requestMessage: submission.message, + requestMessage: submission.userMessage, responseMessage: responseMessage, conversation: submission.conversation, }; @@ -493,7 +479,7 @@ export default function useSSE(submission: TSubmission | null, index = 0) { error: true, }; const errorResponse = tMessageSchema.parse(errorMessage); - setMessages([...submission.messages, submission.message, errorResponse]); + setMessages([...submission.messages, submission.userMessage, errorResponse]); newConversation({ template: { conversationId: convoId }, preset: tPresetSchema.parse(submission?.conversation), @@ -509,7 +495,7 @@ export default function useSSE(submission: TSubmission | null, index = 0) { return; } - let { message } = submission; + let { userMessage } = submission; const payloadData = createPayload(submission); let { payload } = payloadData; @@ -529,20 +515,25 @@ export default function useSSE(submission: TSubmission | null, index = 0) { if (data.final) { const { plugins } = data; - finalHandler(data, { ...submission, plugins, message }); + finalHandler(data, { ...submission, plugins }); startupConfig?.checkBalance && balanceQuery.refetch(); console.log('final', data); } if (data.created) { - message = { - ...message, + const runId = v4(); + setActiveRunId(runId); + userMessage = { + ...userMessage, ...data.message, - overrideParentMessageId: message?.overrideParentMessageId, + overrideParentMessageId: userMessage?.overrideParentMessageId, }; - createdHandler(data, { ...submission, message }); + + createdHandler(data, { ...submission, userMessage }); } else if (data.sync) { + const runId = v4(); + setActiveRunId(runId); /* synchronize messages to Assistants API as well as with real DB ID's */ - syncHandler(data, { ...submission, message }); + syncHandler(data, { ...submission, userMessage }); } else if (data.type) { const { text, index } = data; if (text && index !== textIndex) { @@ -554,12 +545,28 @@ export default function useSSE(submission: TSubmission | null, index = 0) { const text = data.text || data.response; const { plugin, plugins } = data; + const initialResponse = { + ...submission.initialResponse, + parentMessageId: data.parentMessageId, + messageId: data.messageId, + }; + if (data.message) { - messageHandler(text, { ...submission, plugin, plugins, message }); + messageHandler(text, { ...submission, plugin, plugins, userMessage, initialResponse }); } } }; + // events.onaudio = (e: MessageEvent) => { + // const data = JSON.parse(e.data); + // console.log('audio', data); + // if (data.audio) { + // audioSource.addBase64Data(data.audio); + // } + // }; + + // events.onend = () => audioSource.close(); + events.onopen = () => console.log('connection is opened'); events.oncancel = async () => { @@ -575,7 +582,7 @@ export default function useSSE(submission: TSubmission | null, index = 0) { setCompleted((prev) => new Set(prev.add(streamKey))); return await abortConversation( - message?.conversationId ?? submission?.conversationId, + userMessage?.conversationId ?? 
submission?.conversationId, + userMessage?.conversationId ?? submission?.conversationId, submission, ); }; @@ -594,7 +601,7 @@ export default function useSSE(submission: TSubmission | null, index = 0) { setIsSubmitting(false); } - errorHandler({ data, submission: { ...submission, message } }); + errorHandler({ data, submission: { ...submission, userMessage } }); }; setIsSubmitting(true); diff --git a/client/src/hooks/index.ts b/client/src/hooks/index.ts index 8925a379128..818d2e895e7 100644 --- a/client/src/hooks/index.ts +++ b/client/src/hooks/index.ts @@ -23,3 +23,5 @@ export { default as useLocalStorage } from './useLocalStorage'; export { default as useDelayedRender } from './useDelayedRender'; export { default as useOnClickOutside } from './useOnClickOutside'; export { default as useGenerationsByLatest } from './useGenerationsByLatest'; +export { default as useSpeechToText } from './Input/useSpeechToText'; +export { default as useTextToSpeech } from './Input/useTextToSpeech'; diff --git a/client/src/hooks/useChatHelpers.ts b/client/src/hooks/useChatHelpers.ts index 1a9a7b72d14..cb4d72a9996 100644 --- a/client/src/hooks/useChatHelpers.ts +++ b/client/src/hooks/useChatHelpers.ts @@ -240,7 +240,7 @@ export default function useChatHelpers(index = 0, paramId: string | undefined) { conversationId, }, endpointOption, - message: { + userMessage: { ...currentMsg, generation, responseMessageId, diff --git a/client/src/hooks/useNewConvo.ts b/client/src/hooks/useNewConvo.ts index 6b47b545d5a..58cae55f581 100644 --- a/client/src/hooks/useNewConvo.ts +++ b/client/src/hooks/useNewConvo.ts @@ -31,7 +31,7 @@ import { } from '~/utils'; import useAssistantListMap from './Assistants/useAssistantListMap'; import { useDeleteFilesMutation } from '~/data-provider'; - +import { usePauseGlobalAudio } from './Audio'; import { mainTextareaId } from '~/common'; import store from '~/store'; @@ -47,6 +47,7 @@ const useNewConvo = (index = 0) => { const modelsQuery = useGetModelsQuery(); const timeoutIdRef = useRef<NodeJS.Timeout>(); const assistantsListMap = useAssistantListMap(); + const { pauseGlobalAudio } = usePauseGlobalAudio(index); const { mutateAsync } = useDeleteFilesMutation({ onSuccess: () => { @@ -176,6 +177,8 @@ const useNewConvo = (index = 0) => { buildDefault?: boolean; keepLatestMessage?: boolean; } = {}) => { + pauseGlobalAudio(); + const conversation = { conversationId: 'new', title: 'New Chat', @@ -215,7 +218,7 @@ const useNewConvo = (index = 0) => { switchToConversation(conversation, preset, modelsData, buildDefault, keepLatestMessage); }, - [switchToConversation, files, mutateAsync, setFiles, startupConfig], + [pauseGlobalAudio, switchToConversation, mutateAsync, setFiles, files, startupConfig], ); return { diff --git a/client/src/localization/languages/Eng.ts b/client/src/localization/languages/Eng.ts index 00d63238560..7da4c622e2b 100644 --- a/client/src/localization/languages/Eng.ts +++ b/client/src/localization/languages/Eng.ts @@ -146,6 +146,7 @@ export default { com_ui_save: 'Save', com_ui_save_submit: 'Save & Submit', com_user_message: 'You', + com_ui_read_aloud: 'Read aloud', com_ui_copied: 'Copied!', com_ui_copy_code: 'Copy code', com_ui_copy_to_clipboard: 'Copy to clipboard', @@ -241,6 +242,7 @@ export default { 'Uploading "{0}" is taking more time than anticipated. 
Please wait while the file finishes indexing for retrieval.', com_ui_privacy_policy: 'Privacy policy', com_ui_terms_of_service: 'Terms of service', + com_ui_use_micrphone: 'Use microphone', com_ui_min_tags: 'Cannot remove more values, a minimum of {0} are required.', com_ui_max_tags: 'Maximum number allowed is {0}, using latest values.', com_auth_error_login: @@ -478,6 +480,9 @@ export default { com_nav_hide_panel: 'Hide right-most side panel', com_nav_modular_chat: 'Enable switching Endpoints mid-conversation', com_nav_latex_parsing: 'Parsing LaTeX in messages (may affect performance)', + com_nav_text_to_speech: 'Text to Speech', + com_nav_automatic_playback: 'Autoplay Latest Message (external only)', + com_nav_speech_to_text: 'Speech to Text', com_nav_profile_picture: 'Profile Picture', com_nav_change_picture: 'Change picture', com_nav_plugin_store: 'Plugin store', @@ -540,10 +545,22 @@ export default { com_nav_help_faq: 'Help & FAQ', com_nav_settings: 'Settings', com_nav_search_placeholder: 'Search messages', + com_nav_conversation_mode: 'Conversation Mode', + com_nav_auto_send_text: 'Auto send text (after 3 sec)', + com_nav_auto_transcribe_audio: 'Auto transcribe audio', + com_nav_db_sensitivity: 'Decibel sensitivity', + com_nav_playback_rate: 'Audio Playback Rate', + com_nav_engine: 'Engine', + com_nav_browser: 'Browser', + com_nav_external: 'External', + com_nav_delete_cache_storage: 'Delete cache storage', + com_nav_enable_cache_tts: 'Enable cache TTS', + com_nav_voice_select: 'Voice', com_nav_setting_general: 'General', com_nav_setting_beta: 'Beta features', com_nav_setting_data: 'Data controls', com_nav_setting_account: 'Account', + com_nav_setting_speech: 'Speech', com_nav_language: 'Language', com_nav_lang_auto: 'Auto detect', com_nav_lang_english: 'English', diff --git a/client/src/localization/languages/It.ts b/client/src/localization/languages/It.ts index 73d79050996..5004477b16a 100644 --- a/client/src/localization/languages/It.ts +++ b/client/src/localization/languages/It.ts @@ -523,6 +523,7 @@ export default { com_nav_setting_general: 'Generali', com_nav_setting_beta: 'Funzionalità beta', com_nav_setting_data: 'Controlli dati', + com_nav_setting_speech: 'Voce', com_nav_setting_account: 'Account', /* The following are AI Translated */ com_assistants_file_search: 'Ricerca File', diff --git a/client/src/main.jsx b/client/src/main.jsx index 17c3985a467..196592577bb 100644 --- a/client/src/main.jsx +++ b/client/src/main.jsx @@ -1,3 +1,4 @@ +import 'regenerator-runtime/runtime'; import { createRoot } from 'react-dom/client'; import App from './App'; import './style.css'; diff --git a/client/src/store/families.ts b/client/src/store/families.ts index 7657d5b5635..035718c0cbb 100644 --- a/client/src/store/families.ts +++ b/client/src/store/families.ts @@ -120,6 +120,31 @@ const showMentionPopoverFamily = atomFamily<boolean, string | number>({ default: false, });
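+// Global audio state, keyed by chat index: stream URL, fetching/playing flags, and run IDs +// used to tie streamed TTS audio to the active message run.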
+const globalAudioURLFamily = atomFamily<string | null, string | number>({ + key: 'globalAudioURLByIndex', + default: null, +}); + +const globalAudioFetchingFamily = atomFamily<boolean, string | number>({ + key: 'globalAudioisFetchingByIndex', + default: false, +}); + +const globalAudioPlayingFamily = atomFamily<boolean, string | number>({ + key: 'globalAudioisPlayingByIndex', + default: false, +}); + +const activeRunFamily = atomFamily<string | null, string | number>({ + key: 'activeRunByIndex', + default: null, +}); + +const audioRunFamily = atomFamily<string | null, string | number>({ + key: 'audioRunByIndex', + default: null, +}); + const latestMessageFamily = atomFamily<TMessage | null, string | number>({ key: 'latestMessageByIndex', default: null, @@ -180,4 +205,9 @@ export default { useClearConvoState, useCreateConversationAtom, showMentionPopoverFamily, + globalAudioURLFamily, + activeRunFamily, + audioRunFamily, + globalAudioPlayingFamily, + globalAudioFetchingFamily, }; diff --git a/client/src/store/settings.ts b/client/src/store/settings.ts index f8e45b76048..bc589056f13 100644 --- a/client/src/store/settings.ts +++ b/client/src/store/settings.ts @@ -1,254 +1,69 @@ import { atom } from 'recoil'; -import { SettingsViews, LocalStorageKeys } from 'librechat-data-provider'; +import { SettingsViews } from 'librechat-data-provider'; import type { TOptionSettings } from '~/common'; -const abortScroll = atom({ - key: 'abortScroll', - default: false, -}); - -const showFiles = atom({ - key: 'showFiles', - default: false, -}); - -const optionSettings = atom({ - key: 'optionSettings', - default: {}, -}); - -const showPluginStoreDialog = atom({ - key: 'showPluginStoreDialog', - default: false, -}); - -const showAgentSettings = atom({ - key: 'showAgentSettings', - default: false, -}); - -const currentSettingsView = atom({ - key: 'currentSettingsView', - default: SettingsViews.default, -}); - -const showBingToneSetting = atom({ - key: 'showBingToneSetting', - default: false, -}); - -const showPopover = atom({ - key: 'showPopover', - default: false, -}); - -const autoScroll = atom({ - key: 'autoScroll', - default: localStorage.getItem('autoScroll') === 'true', - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('autoScroll'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('autoScroll', newValue.toString()); +// Improved helper function to create atoms with localStorage +function atomWithLocalStorage<T>(key: string, defaultValue: T) { + return atom<T>({ + key, + default: defaultValue, // Set the default value directly + effects_UNSTABLE: [ + ({ setSelf, onSet }) => { + // Load the initial value from localStorage if it exists + const savedValue = localStorage.getItem(key); + if (savedValue !== null) { + setSelf(JSON.parse(savedValue)); } - }); - }, - ] as const, -}); - -const showCode = atom({ - key: 'showCode', - default: localStorage.getItem('showCode') === 'true', - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('showCode'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('showCode', newValue.toString()); - } - }); - }, - ] as const, -}); - -const hideSidePanel = atom({ - key: 'hideSidePanel', - default: localStorage.getItem('hideSidePanel') === 'true', - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('hideSidePanel'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('hideSidePanel', newValue.toString()); - } - }); - }, - ] as const, -}); - -const modularChat = atom({ - key: 'modularChat', - default: true, - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('modularChat'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('modularChat', newValue.toString()); - } - }); - }, - ] as const, -}); - -const LaTeXParsing = atom({ - key: 'LaTeXParsing', - default: true, - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('LaTeXParsing'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('LaTeXParsing', newValue.toString()); - } - }); - }, - ] as const, -}); - -const forkSetting = atom({ - key: LocalStorageKeys.FORK_SETTING, - default: '', - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem(LocalStorageKeys.FORK_SETTING); - if (savedValue != null) { - setSelf(savedValue); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'string') { - localStorage.setItem(LocalStorageKeys.FORK_SETTING, newValue.toString()); - } - }); - }, - ] as const, -}); - -const rememberForkOption = atom({ - key: LocalStorageKeys.REMEMBER_FORK_OPTION, - default: false, - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem(LocalStorageKeys.REMEMBER_FORK_OPTION); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem(LocalStorageKeys.REMEMBER_FORK_OPTION, newValue.toString()); - } - }); - }, - ] as const, -}); - -const splitAtTarget = atom({ - key: LocalStorageKeys.FORK_SPLIT_AT_TARGET, - default: false, - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem(LocalStorageKeys.FORK_SPLIT_AT_TARGET); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem(LocalStorageKeys.FORK_SPLIT_AT_TARGET, newValue.toString()); - } - }); - }, - ] as const, -}); - -const UsernameDisplay = atom({ - key: 'UsernameDisplay', - default: localStorage.getItem('UsernameDisplay') === 'true', - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('UsernameDisplay'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('UsernameDisplay', newValue.toString()); - } - }); - }, - ] as const, -}); - -const enterToSend = atom({ - key: 'enterToSend', - default: true, - effects: [ - ({ setSelf, onSet }) => { - const savedValue = localStorage.getItem('enterToSend'); - if (savedValue != null) { - setSelf(savedValue === 'true'); - } - - onSet((newValue: unknown) => { - if (typeof newValue === 'boolean') { - localStorage.setItem('enterToSend', newValue.toString()); - } - }); - }, - ] as const, -}); + // Update localStorage whenever the atom's value changes + onSet((newValue: T) => { + localStorage.setItem(key, JSON.stringify(newValue)); + }); + }, + ], + }); +}
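+ +// Values persist via JSON.stringify and are revived with JSON.parse, so only +// JSON-serializable state should be stored through this helper.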
+ +// Static atoms without localStorage +const staticAtoms = { + abortScroll: atom({ key: 'abortScroll', default: false }), + showFiles: atom({ key: 'showFiles', default: false }), + optionSettings: atom<TOptionSettings>({ key: 'optionSettings', default: {} }), + showPluginStoreDialog: atom({ key: 'showPluginStoreDialog', default: false }), + showAgentSettings: atom({ key: 'showAgentSettings', default: false }), + currentSettingsView: atom<SettingsViews>({ + key: 'currentSettingsView', + default: SettingsViews.default, + }), + showBingToneSetting: atom({ key: 'showBingToneSetting', default: false }), + showPopover: atom({ key: 'showPopover', default: false }), +}; -export default { - abortScroll, - showFiles, - optionSettings, - showPluginStoreDialog, - showAgentSettings, - currentSettingsView, - showBingToneSetting, - showPopover, - autoScroll, - enterToSend, - showCode, 
hideSidePanel, - modularChat, - LaTeXParsing, - UsernameDisplay, - forkSetting, - splitAtTarget, - rememberForkOption, +// Atoms with localStorage +const localStorageAtoms = { + autoScroll: atomWithLocalStorage('autoScroll', false), + showCode: atomWithLocalStorage('showCode', false), + hideSidePanel: atomWithLocalStorage('hideSidePanel', false), + modularChat: atomWithLocalStorage('modularChat', false), + LaTeXParsing: atomWithLocalStorage('LaTeXParsing', true), + UsernameDisplay: atomWithLocalStorage('UsernameDisplay', true), + TextToSpeech: atomWithLocalStorage('textToSpeech', true), + automaticPlayback: atomWithLocalStorage('automaticPlayback', false), + enterToSend: atomWithLocalStorage('enterToSend', true), + SpeechToText: atomWithLocalStorage('speechToText', true), + conversationMode: atomWithLocalStorage('conversationMode', false), + advancedMode: atomWithLocalStorage('advancedMode', false), + autoSendText: atomWithLocalStorage('autoSendText', false), + autoTranscribeAudio: atomWithLocalStorage('autoTranscribeAudio', false), + decibelValue: atomWithLocalStorage('decibelValue', -45), + endpointSTT: atomWithLocalStorage('endpointSTT', 'browser'), + endpointTTS: atomWithLocalStorage('endpointTTS', 'browser'), + cacheTTS: atomWithLocalStorage('cacheTTS', true), + voice: atomWithLocalStorage('voice', ''), + forkSetting: atomWithLocalStorage('forkSetting', ''), + splitAtTarget: atomWithLocalStorage('splitAtTarget', false), + rememberForkOption: atomWithLocalStorage('rememberForkOption', true), + playbackRate: atomWithLocalStorage<number | null>('playbackRate', null), }; + +export default { ...staticAtoms, ...localStorageAtoms }; diff --git a/config/translations/streamAudioTest.ts b/config/translations/streamAudioTest.ts new file mode 100644 index 00000000000..7564b2637bf --- /dev/null +++ b/config/translations/streamAudioTest.ts @@ -0,0 +1,134 @@ +import { WebSocket } from 'ws'; +// const { ElevenLabsClient } = require('elevenlabs'); + +const ELEVENLABS_API_KEY = process.env.ELEVENLABS_API_KEY ?? ''; +const VOICE_ID = '1RVpBInY9YUYMLSUQReV'; + +interface AudioChunk { + audio: string; + isFinal: boolean; + alignment: { + char_start_times_ms: number[]; + chars_durations_ms: number[]; + chars: string[]; + }; + normalizedAlignment: { + char_start_times_ms: number[]; + chars_durations_ms: number[]; + chars: string[]; + }; +} + +export function inputStreamTextToSpeech( + textStream: AsyncIterable<string>, +): AsyncGenerator<AudioChunk> { + const model = 'eleven_turbo_v2'; + const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${VOICE_ID}/stream-input?model_id=${model}`; + const socket = new WebSocket(wsUrl); + + socket.onopen = function () { + const streamStart = { + text: ' ', + voice_settings: { + stability: 0.5, + similarity_boost: 0.8, + }, + xi_api_key: ELEVENLABS_API_KEY, + }; + + socket.send(JSON.stringify(streamStart)); + + // send stream until done + const streamComplete = new Promise((resolve, reject) => { + (async () => { + for await (const message of textStream) { + const request = { + text: message, + try_trigger_generation: true, + }; + socket.send(JSON.stringify(request)); + } + })() + .then(resolve) + .catch(reject); + }); + + streamComplete + .then(() => { + const endStream = { + text: '', + }; + + socket.send(JSON.stringify(endStream)); + }) + .catch((e) => { + throw e; + }); + }; + + return (async function* audioStream() { + let isDone = false; + let chunks: AudioChunk[] = []; + let resolve: (value: unknown) => void; + let waitForMessage = new Promise((r) => (resolve = r)); +
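+ // Each incoming message carries a base64 audio chunk with character-level alignment data; + // resolve() wakes the while-loop below so chunks are yielded as soon as they arrive.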
+ socket.onmessage = function (event) { + console.log(event); + const audioChunk = JSON.parse(event.data as string) as AudioChunk; + if (audioChunk.audio && audioChunk.alignment) { + chunks.push(audioChunk); + resolve(null); + waitForMessage = new Promise((r) => (resolve = r)); + } + }; + + socket.onerror = function (error) { + throw error; + }; + + // Handle socket closing + socket.onclose = function () { + isDone = true; + }; + + while (!isDone) { + await waitForMessage; + yield* chunks; + chunks = []; + } + })(); +} + +import OpenAI from 'openai'; +import { ChatCompletionStream } from 'openai/lib/ChatCompletionStream'; + +export async function streamCompletion({ systemPrompt, messages }) { + const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); + return client.beta.chat.completions.stream({ + model: 'gpt-4-0125-preview', + messages: [{ role: 'system', content: systemPrompt }, ...messages], + }); +} + +export async function* llmMessageSource(llmStream: ChatCompletionStream): AsyncIterable<string> { + for await (const chunk of llmStream) { + const message = chunk.choices[0].delta.content; + if (message) { + yield message; + } + } +} + +async function main(systemPrompt: string, prompt: string) { + const llmStream = await streamCompletion({ + systemPrompt, + messages: [{ role: 'user', content: prompt }], + }); + const llmMessageStream = llmMessageSource(llmStream); + console.log('Streaming LLM messages...'); + for await (const audio of inputStreamTextToSpeech(llmMessageStream)) { + console.log(audio); + } } + +main('Hello, how can I help you today?', 'What is the meaning of life?'); diff --git a/librechat.example.yaml b/librechat.example.yaml index 2699d3146aa..acb1d772b8f 100644 --- a/librechat.example.yaml +++ b/librechat.example.yaml @@ -25,6 +25,25 @@ registration: # allowedDomains: # - "gmail.com" +# tts: +# url: '' +# apiKey: '${TTS_API_KEY}' +# model: '' +# backend: '' +# voice: '' +# compatibility: '' +# voice_settings: +# similarity_boost: '' +# stability: '' +# style: '' +# use_speaker_boost: +# pronunciation_dictionary_locators: [''] +# +# stt: +# url: '' +# apiKey: '${STT_API_KEY}' +# model: '' + # rateLimits: # fileUploads: # ipMax: 100 diff --git a/package-lock.json b/package-lock.json index 83167f3b780..0362d771ce7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -102,6 +102,7 @@ "ua-parser-js": "^1.0.36", "winston": "^3.11.0", "winston-daily-rotate-file": "^4.7.1", + "ws": "^8.17.0", "zod": "^3.22.4" }, "devDependencies": { @@ -1163,9 +1164,11 @@ "react-markdown": "^8.0.6", "react-resizable-panels": "^1.0.9", "react-router-dom": "^6.11.2", + "react-speech-recognition": "^3.10.0", "react-textarea-autosize": "^8.4.0", "react-transition-group": "^4.4.5", "recoil": "^0.7.7", + "regenerator-runtime": "^0.14.1", "rehype-highlight": "^6.0.0", "rehype-katex": "^6.0.2", "rehype-raw": "^6.1.1", @@ -24436,6 +24439,14 @@ "react-dom": ">=16.8" } }, + "node_modules/react-speech-recognition": { + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/react-speech-recognition/-/react-speech-recognition-3.10.0.tgz", + "integrity": "sha512-EVSr4Ik8l9urwdPiK2r0+ADrLyDDrjB0qBRdUWO+w2MfwEBrj6NuRmy1GD3x7BU/V6/hab0pl8Lupen0zwlJyw==", + "peerDependencies": { + "react": ">=16.8.0" + } + }, "node_modules/react-style-singleton": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/react-style-singleton/-/react-style-singleton-2.2.1.tgz", @@ -29098,9 +29109,9 @@ } }, "node_modules/ws": { - "version": "8.16.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.16.0.tgz", - "integrity": 
"sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==", + "version": "8.17.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.0.tgz", + "integrity": "sha512-uJq6108EgZMAl20KagGkzCKfMEjxmKvZHG7Tlq0Z6nOky7YF7aq4mOx6xK8TJ/i1LeK4Qus7INktacctDgY8Ow==", "engines": { "node": ">=10.0.0" }, diff --git a/packages/data-provider/src/api-endpoints.ts b/packages/data-provider/src/api-endpoints.ts index d1f035d782e..aa290bf495c 100644 --- a/packages/data-provider/src/api-endpoints.ts +++ b/packages/data-provider/src/api-endpoints.ts @@ -122,3 +122,11 @@ export const files = () => '/api/files'; export const images = () => `${files()}/images`; export const avatar = () => `${images()}/avatar`; + +export const speechToText = () => `${files()}/stt`; + +export const textToSpeech = () => `${files()}/tts`; + +export const textToSpeechManual = () => `${textToSpeech()}/manual`; + +export const textToSpeechVoices = () => `${textToSpeech()}/voices`; diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 7deccbed0b6..3be5b2ee4cf 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -223,6 +223,53 @@ export const azureEndpointSchema = z export type TAzureConfig = Omit, 'groups'> & TAzureConfigValidationResult; +const ttsSchema = z.object({ + openai: z + .object({ + url: z.string().optional(), + apiKey: z.string(), + model: z.string(), + voices: z.array(z.string()), + }) + .optional(), + elevenLabs: z + .object({ + url: z.string().optional(), + websocketUrl: z.string().optional(), + apiKey: z.string(), + model: z.string(), + voices: z.array(z.string()), + voice_settings: z + .object({ + similarity_boost: z.number().optional(), + stability: z.number().optional(), + style: z.number().optional(), + use_speaker_boost: z.boolean().optional(), + }) + .optional(), + pronunciation_dictionary_locators: z.array(z.string()).optional(), + }) + .optional(), + localai: z + .object({ + url: z.string(), + apiKey: z.string().optional(), + voices: z.array(z.string()), + backend: z.string(), + }) + .optional(), +}); + +const sttSchema = z.object({ + openai: z + .object({ + url: z.string().optional(), + apiKey: z.string().optional(), + model: z.string().optional(), + }) + .optional(), +}); + export const rateLimitSchema = z.object({ fileUploads: z .object({ @@ -289,6 +336,8 @@ export const configSchema = z.object({ allowedDomains: z.array(z.string()).optional(), }) .default({ socialLogins: defaultSocialLogins }), + tts: ttsSchema.optional(), + stt: sttSchema.optional(), rateLimits: rateLimitSchema.optional(), fileConfig: fileConfigSchema.optional(), modelSpecs: specsConfigSchema.optional(), @@ -562,6 +611,10 @@ export enum CacheKeys { * Used by Azure OpenAI Assistants. */ ENCODED_DOMAINS = 'encoded_domains', + /** + * Key for the cached audio run Ids. + */ + AUDIO_RUNS = 'audioRuns', } /** @@ -664,6 +717,10 @@ export enum SettingsTabValues { * Tab for Messages Settings */ MESSAGES = 'messages', + /** + * Tab for Speech Settings + */ + SPEECH = 'speech', /** * Tab for Beta Features */ @@ -683,7 +740,7 @@ export enum Constants { /** Key for the app's version. */ VERSION = 'v0.7.2', /** Key for the Custom Config's version (librechat.yaml). */ - CONFIG_VERSION = '1.1.1', + CONFIG_VERSION = '1.1.2', /** Standard value for the first message's `parentMessageId` value, to indicate no parent exists. 
*/ NO_PARENT = '00000000-0000-0000-0000-000000000000', /** Fixed, encoded domain length for Azure OpenAI Assistants Function name parsing. */ diff --git a/packages/data-provider/src/createPayload.ts b/packages/data-provider/src/createPayload.ts index 32b222747d0..d08bcea1e54 100644 --- a/packages/data-provider/src/createPayload.ts +++ b/packages/data-provider/src/createPayload.ts @@ -3,7 +3,7 @@ import { tConvoUpdateSchema, EModelEndpoint, isAssistantsEndpoint } from './sche import { EndpointURLs } from './config'; export default function createPayload(submission: TSubmission) { - const { conversation, message, messages, endpointOption, isEdited, isContinued } = submission; + const { conversation, userMessage, messages, endpointOption, isEdited, isContinued } = submission; const { conversationId } = tConvoUpdateSchema.parse(conversation); const { endpoint, endpointType } = endpointOption as { endpoint: EModelEndpoint; @@ -26,7 +26,7 @@ export default function createPayload(submission: TSubmission) { }; const payload: Payload = { - ...message, + ...userMessage, ...endpointOption, isContinued: !!(isEdited && isContinued), conversationId, diff --git a/packages/data-provider/src/data-service.ts b/packages/data-provider/src/data-service.ts index 3554884184f..d12a8f07329 100644 --- a/packages/data-provider/src/data-service.ts +++ b/packages/data-provider/src/data-service.ts @@ -340,6 +340,18 @@ export const deleteFiles = async ( data: { files, assistant_id, tool_resource }, }); +export const speechToText = (data: FormData): Promise<f.SpeechToTextResponse> => { + return request.postMultiPart(endpoints.speechToText(), data); +}; + +export const textToSpeech = (data: FormData): Promise<ArrayBuffer> => { + return request.postTTS(endpoints.textToSpeechManual(), data); +}; + +export const getVoices = (): Promise<f.VoiceResponse> => { + return request.get(endpoints.textToSpeechVoices()); +}; + /* actions */ export const updateAction = (data: m.UpdateActionVariables): Promise<m.UpdateActionResponse> => { diff --git a/packages/data-provider/src/keys.ts b/packages/data-provider/src/keys.ts index 13f894ac8ad..45302947b95 100644 --- a/packages/data-provider/src/keys.ts +++ b/packages/data-provider/src/keys.ts @@ -26,6 +26,7 @@ export enum QueryKeys { actions = 'actions', assistantDocs = 'assistantDocs', fileDownload = 'fileDownload', + voices = 'voices', } export enum MutationKeys { @@ -35,6 +36,8 @@ deletePreset = 'deletePreset', logoutUser = 'logoutUser', avatarUpload = 'avatarUpload', + speechToText = 'speechToText', + textToSpeech = 'textToSpeech', assistantAvatarUpload = 'assistantAvatarUpload', updateAction = 'updateAction', deleteAction = 'deleteAction', diff --git a/packages/data-provider/src/request.ts b/packages/data-provider/src/request.ts index a3aa8e9c30b..c701e0b4636 100644 --- a/packages/data-provider/src/request.ts +++ b/packages/data-provider/src/request.ts @@ -27,6 +27,15 @@ async function _postMultiPart(url: string, formData: FormData, options?: AxiosRe return response.data; }
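+// The TTS endpoint returns raw audio bytes, so the response is read as an ArrayBuffer +// instead of being JSON-decoded.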
+async function _postTTS(url: string, formData: FormData, options?: AxiosRequestConfig) { + const response = await axios.post(url, formData, { + ...options, + headers: { 'Content-Type': 'multipart/form-data' }, + responseType: 'arraybuffer', + }); + return response.data; +} + async function _put(url: string, data?: any) { const response = await axios.put(url, JSON.stringify(data), { headers: { 'Content-Type': 'application/json' }, @@ -121,6 +130,7 @@ export default { getResponse: _getResponse, post: _post, postMultiPart: _postMultiPart, + postTTS: _postTTS, put: _put, delete: _delete, deleteWithOptions: _deleteWithOptions, diff --git a/packages/data-provider/src/schemas.ts b/packages/data-provider/src/schemas.ts index 91b03ce7f1a..98b4c70d3f7 100644 --- a/packages/data-provider/src/schemas.ts +++ b/packages/data-provider/src/schemas.ts @@ -306,9 +306,9 @@ export const tConversationSchema = z.object({ tools: z.union([z.array(tPluginSchema), z.array(z.string())]).optional(), createdAt: z.string(), updatedAt: z.string(), - systemMessage: z.string().nullable().optional(), modelLabel: z.string().nullable().optional(), examples: z.array(tExampleSchema).optional(), + /* Prefer modelLabel over chatGptLabel */ chatGptLabel: z.string().nullable().optional(), userLabel: z.string().optional(), model: z.string().nullable().optional(), @@ -320,20 +320,12 @@ top_p: z.number().optional(), frequency_penalty: z.number().optional(), presence_penalty: z.number().optional(), - jailbreak: z.boolean().optional(), - jailbreakConversationId: z.string().nullable().optional(), - conversationSignature: z.string().nullable().optional(), parentMessageId: z.string().optional(), - clientId: z.string().nullable().optional(), - invocationId: z.number().nullable().optional(), - toneStyle: z.string().nullable().optional(), maxOutputTokens: z.number().optional(), agentOptions: tAgentOptionsSchema.nullable().optional(), file_ids: z.array(z.string()).optional(), maxContextTokens: coerceNumber.optional(), max_tokens: coerceNumber.optional(), - /** @deprecated */ - resendImages: z.boolean().optional(), /* vision */ resendFiles: z.boolean().optional(), imageDetail: eImageDetailSchema.optional(), @@ -347,6 +339,25 @@ iconURL: z.string().optional(), greeting: z.string().optional(), spec: z.string().optional(), + /* + Deprecated fields + */ + /** @deprecated */ + systemMessage: z.string().nullable().optional(), + /** @deprecated */ + jailbreak: z.boolean().optional(), + /** @deprecated */ + jailbreakConversationId: z.string().nullable().optional(), + /** @deprecated */ + conversationSignature: z.string().nullable().optional(), + /** @deprecated */ + clientId: z.string().nullable().optional(), + /** @deprecated */ + invocationId: z.number().nullable().optional(), + /** @deprecated */ + toneStyle: z.string().nullable().optional(), + /** @deprecated */ + resendImages: z.boolean().optional(), }); export const tPresetSchema = tConversationSchema diff --git a/packages/data-provider/src/types.ts b/packages/data-provider/src/types.ts index 2c3e8a236ca..b8c098359dc 100644 --- a/packages/data-provider/src/types.ts +++ b/packages/data-provider/src/types.ts @@ -40,7 +40,7 @@ export type TEndpointOption = { export type TSubmission = { plugin?: TResPlugin; plugins?: TResPlugin[]; - message: TMessage; + userMessage: TMessage; isEdited?: boolean; isContinued?: boolean; messages: TMessage[]; diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index 3459b0782c8..83585492827 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -77,6 +77,12 @@ export type AvatarUploadResponse = { url: string; }; +export type SpeechToTextResponse = { + text: string; +}; + +export type VoiceResponse = string[]; + export type UploadMutationOptions = { onSuccess?: (data: TFileUpload, variables: FormData, context?: unknown) => void; onMutate?: (variables: FormData) => void | Promise<unknown>; onError?: (error: unknown, variables: FormData, context?: unknown) => void; }; @@ -89,6 +95,24 @@ export type UploadAvatarOptions = { 
onError?: (error: unknown, variables: FormData, context?: unknown) => void; }; +export type SpeechToTextOptions = { + onSuccess?: (data: SpeechToTextResponse, variables: FormData, context?: unknown) => void; + onMutate?: (variables: FormData) => void | Promise<unknown>; + onError?: (error: unknown, variables: FormData, context?: unknown) => void; +}; + +export type TextToSpeechOptions = { + onSuccess?: (data: ArrayBuffer, variables: FormData, context?: unknown) => void; + onMutate?: (variables: FormData) => void | Promise<unknown>; + onError?: (error: unknown, variables: FormData, context?: unknown) => void; +}; + +export type VoiceOptions = { + onSuccess?: (data: VoiceResponse, variables: unknown, context?: unknown) => void; + onMutate?: () => void | Promise<unknown>; + onError?: (error: unknown, variables: unknown, context?: unknown) => void; +}; + export type DeleteFilesResponse = { message: string; result: Record<string, unknown>;