-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtlon-tts.el
2845 lines (2419 loc) · 110 KB
/
tlon-tts.el
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
;;; tlon-tts.el --- Text-to-speech functionality -*- lexical-binding: t; fill-column: 80 -*-
;; Copyright (C) 2025
;; Author: Pablo Stafforini
;; This file is NOT part of GNU Emacs.
;; This program is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;; Text-to-speech functionality.
;;; Code:
(require 'tlon-core)
(require 'tlon-md)
(eval-and-compile
(require 'eieio)
(require 'transient))
;;;; User options
(defgroup tlon-tts nil
  "Options for the text-to-speech functionality."
  :group 'tlon)
;;;;; Common
(defcustom tlon-tts-global-engine "Microsoft Azure"
  "The TTS engine to use when creating the staging buffer.
The value is the name of the engine, as a string.  The admissible values
coincide with the `:name' properties of the entries in `tlon-tts-engines'."
  :group 'tlon-tts
  ;; The choices are strings because the default value is the engine name.
  ;; Keyword constants such as `:azure' would make the default value fail to
  ;; match any of the choices in the Customize interface.
  :type '(choice (const :tag "Microsoft Azure" "Microsoft Azure")
                 (const :tag "Google Cloud" "Google Cloud")
                 (const :tag "Amazon Polly" "Amazon Polly")
                 (const :tag "OpenAI" "OpenAI")
                 (const :tag "ElevenLabs" "ElevenLabs")))
(defcustom tlon-tts-use-alternate-voice nil
  "Non-nil means use an alternate voice for reading notes, asides, etc."
  :type 'boolean
  :group 'tlon-tts)
(defcustom tlon-tts-delete-file-chunks nil
  "Non-nil means delete file chunks once they are merged into the main file."
  :type 'boolean
  :group 'tlon-tts)
;; TODO: it looks like this is not being used; decide what to do about it
(defcustom tlon-tts-prompt
  nil
  "Generic prompt to use in the TTS request.
The value is either a string or nil, which is the default.
Selection candidates for each language are listed in `tlon-tts-prompts'."
  :group 'tlon-tts
  ;; The default value is nil, so the type must admit nil in addition to a
  ;; string; a bare `string' type reports a mismatch in the Customize interface.
  :type '(choice (const :tag "No prompt" nil)
                 (string :tag "Prompt")))
(defconst tlon-tts-prompts
'(("ar" . ("أنت متحدث باللغة العربية الأصلية. تتحدث بلهجة عربية محايدة."))
("de" . ("Sie sind ein Muttersprachler des Deutschen. Sie sprechen mit einem neutralen deutschen Akzent."))
("en" . ("You are a native English speaker. You speak with a neutral English accent."))
("es" . ("Eres un hablante nativo de español. Hablas con un acento español neutro."))
("fr" . ("Vous êtes un locuteur natif du français. Vous parlez avec un accent français neutre."))
("it" . ("Sei un madrelingua italiano. Parli con un accento italiano neutro."))
("ja" . ("あなたは日本語のネイティブスピーカーです。中立的な日本語アクセントで話します。"))
("ko" . ("당신은 한국어 원어민입니다. 중립적인 한국어 악센트로 말합니다.")))
"Alist of prompts to select from in each language.
Each element has the form (LANGUAGE . PROMPTS), where LANGUAGE is a two-letter
language code and PROMPTS is a list of prompt strings in that language.")
;;;;;; `break'
;; Note that, apparently, ElevenLabs *replaces* the pause that the narrator
;; would make without an explicit `break' tag with the duration specified in the
;; tag. This is different from the behavior of other engines, which *add* the
;; duration specified in the tag to the default pause duration.
(defcustom tlon-tts-paragraph-break-duration "0.8s"
  "How long the break after a paragraph lasts, as a string such as \"0.8s\"."
  :type 'string
  :group 'tlon-tts)
(defcustom tlon-tts-listener-cue-break-duration "0.5s"
  "How long the break for a listener cue lasts, as a string such as \"0.5s\"."
  :type 'string
  :group 'tlon-tts)
;;;;; Microsoft Azure
(defcustom tlon-microsoft-azure-audio-settings
'("audio-24khz-160kbitrate-mono-mp3" . "mp3")
"Output format and associated extension for the Microsoft Azure TTS service.
The value is a cons cell whose car is the name of the output format and whose
cdr is the associated file extension.
Here's a description of the main options:
- `\"audio-24khz-160kbitrate-mono-mp3\"': Offers higher quality due to a higher
bitrate and sample rate. This means the audio will sound clearer, especially
for more complex sounds or music. However, the file size will also be larger.
- `\"audio-16khz-64kbitrate-mono-mp3\"': Reduces the bitrate, which will result
in a smaller file at the cost of lower audio quality. Useful when network
bandwidth or storage is limited.
- `\"raw-16khz-16bit-mono-pcm\"': Provides raw audio data without compression.
This is useful if you plan to further process the audio yourself or need
lossless quality. Note that the files will be significantly larger.
- `\"riff-16khz-16bit-mono-pcm\"': Similar to the RAW format but wrapped in the
Waveform Audio File Format, which includes headers making it compatible with
more playback devices and software.
- `\"riff-24khz-16bit-mono-pcm\"': Offers a higher sample rate compared to the
16kHz versions, which can provide better audio quality at the expense of
larger file sizes.
For a full list of audio outputs, see
<https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech?tabs=streaming#audio-outputs>."
:group 'tlon-tts
:type '(cons (string :tag "Name") (string :tag "Extension")))
(defconst tlon-microsoft-azure-audio-choices
'(("amr-wb-16000hz")
("audio-16khz-16bit-32kbps-mono-opus" . "opus")
("audio-16khz-32kbitrate-mono-mp3" . "mp3")
("audio-16khz-64kbitrate-mono-mp3" . "mp3")
("audio-16khz-128kbitrate-mono-mp3" . "mp3")
("audio-24khz-16bit-24kbps-mono-opus" . "opus")
("audio-24khz-16bit-48kbps-mono-opus" . "opus")
("audio-24khz-48kbitrate-mono-mp3" . "mp3")
("audio-24khz-96kbitrate-mono-mp3" . "mp3")
("audio-24khz-160kbitrate-mono-mp3" . "mp3")
("audio-48khz-96kbitrate-mono-mp3" . "mp3")
("audio-48khz-192kbitrate-mono-mp3" . "mp3")
("ogg-16khz-16bit-mono-opus" . "opus")
("ogg-24khz-16bit-mono-opus" . "opus")
("ogg-48khz-16bit-mono-opus" . "opus")
("raw-8khz-8bit-mono-alaw" . "alaw")
("raw-8khz-8bit-mono-mulaw" . "mulaw")
("raw-8khz-16bit-mono-pcm" . "pcm")
("raw-16khz-16bit-mono-pcm" . "pcm")
("raw-16khz-16bit-mono-truesilk" . "sil")
("raw-22050hz-16bit-mono-pcm" . "pcm")
("raw-24khz-16bit-mono-pcm" . "pcm")
("raw-24khz-16bit-mono-truesilk" . "sil")
("raw-44100hz-16bit-mono-pcm" . "pcm")
("raw-48khz-16bit-mono-pcm" . "pcm")
("webm-16khz-16bit-mono-opus" . "opus")
("webm-24khz-16bit-24kbps-mono-opus" . "opus")
("webm-24khz-16bit-mono-opus" . "opus"))
"Output formats and associated extensions for the Microsoft Azure TTS service.
Each element is a cons cell whose car is the name of the output format and
whose cdr is the associated file extension. Note that the first element,
\"amr-wb-16000hz\", has no associated extension, so its cdr is nil.")
;;;;; Google Cloud
(defcustom tlon-google-cloud-audio-settings
'("MP3" . "mp3")
"Output format and associated extension for the Google Cloud TTS service.
The value is a cons cell whose car is the name of the output format and whose
cdr is the associated file extension.
The options are:
- `\"MP3\"': MPEG Audio Layer III (lossy). MP3 encoding is a Beta feature and
only available in v1p1beta1. See the RecognitionConfig reference documentation
for details.
- `\"FLAC\"': Free Lossless Audio Codec (lossless). 16-bit or 24-bit required
for streams.
- `\"LINEAR16\"': Linear PCM (lossless). 16-bit linear pulse-code modulation
(PCM) encoding. The header must contain the sample rate.
- `\"MULAW\"': μ-law (lossy). 8-bit PCM encoding.
- `\"AMR\"': Adaptive Multi-Rate Narrowband (lossy). Sample rate must be 8000 Hz.
- `\"AMR_WB\"': Adaptive Multi-Rate Wideband (lossy). Sample rate must be 16000
Hz.
- `\"OGG_OPUS\"': Opus encoded audio frames in an Ogg container (lossy). Sample
rate must be one of 8000 Hz, 12000 Hz, 16000 Hz, 24000 Hz, or 48000 Hz.
- `\"SPEEX_WITH_HEADER_BYTE\"': Speex wideband (lossy). Sample rate must be
16000 Hz.
- `\"WEBM_OPUS\"': WebM Opus (lossy). Sample rate must be one of 8000 Hz, 12000
Hz, 16000 Hz, 24000 Hz, or 48000 Hz.
For details, see <https://cloud.google.com/speech-to-text/docs/encoding>."
:group 'tlon-tts
:type '(cons (string :tag "Name") (string :tag "Extension")))
(defconst tlon-google-cloud-audio-choices
  '(("MP3"                    . "mp3")
    ("FLAC"                   . "flac")
    ("MULAW"                  . "mulaw")
    ("AMR"                    . "amr")
    ("AMR_WB"                 . "amr_wb")
    ("OGG_OPUS"               . "ogg")
    ("SPEEX_WITH_HEADER_BYTE" . "speex")
    ("WEBM_OPUS"              . "webm"))
  "Output formats and associated extensions for the Google Cloud TTS service.
Each element is a cons cell of the form (FORMAT . EXTENSION).")
;;;;; Amazon Polly
(defcustom tlon-amazon-polly-audio-settings
  '("mp3" . "mp3")
  "Output format and associated extension for the Amazon Polly TTS service.
The value is a cons cell of the form (FORMAT . EXTENSION). The admissible
formats are `\"ogg_vorbis\"', `\"pcm\"' and `\"mp3\"'."
  :type '(cons (string :tag "Name") (string :tag "Extension"))
  :group 'tlon-tts)
(defconst tlon-amazon-polly-audio-choices
'(("mp3" . "mp3")
("ogg_vorbis" . "ogg")
("pcm" . "pcm"))
"Output formats and associated extensions for the Amazon Polly TTS service.
Each element is a cons cell whose car is the name of the output format and
whose cdr is the associated file extension.")
;;;;; OpenAI
;; TODO: check if the OpenAI API allows for different output formats
(defcustom tlon-openai-audio-settings
  '("mp3" . "mp3")
  "Output format and associated extension for the OpenAI TTS service.
The value is a cons cell of the form (FORMAT . EXTENSION)."
  :type '(cons (string :tag "Name") (string :tag "Extension"))
  :group 'tlon-tts)
(defcustom tlon-openai-model
  "tts-1-hd"
  "Name of the model to use for the OpenAI TTS.
The available models are:
- `\"tts-1\"': the standard model, which provides the lowest latency.
- `\"tts-1-hd\"': the higher quality model.
See <https://platform.openai.com/docs/guides/text-to-speech/audio-quality>."
  :type 'string
  :group 'tlon-tts)
;;;;; ElevenLabs
(defcustom tlon-elevenlabs-audio-settings
'("mp3_44100_128" . "mp3")
"Output format and associated extension for the ElevenLabs TTS service.
The value is a cons cell whose car is the name of the output format and whose
cdr is the associated file extension.
The options are:
- `\"mp3_44100_32\"': mp3 with 44.1kHz sample rate at 32kbps.
- `\"mp3_44100_64\"': mp3 with 44.1kHz sample rate at 64kbps.
- `\"mp3_44100_96\"': mp3 with 44.1kHz sample rate at 96kbps.
- `\"mp3_44100_128\"': mp3 with 44.1kHz sample rate at 128kbps.
- `\"mp3_44100_192\"': mp3 with 44.1kHz sample rate at 192kbps. Requires you to
be subscribed to Creator tier or above.
- `\"pcm_16000\"': PCM format (S16LE) with 16kHz sample rate.
- `\"pcm_22050\"': PCM format (S16LE) with 22.05kHz sample rate.
- `\"pcm_24000\"': PCM format (S16LE) with 24kHz sample rate.
- `\"pcm_44100\"': PCM format (S16LE) with 44.1kHz sample rate. Requires you to
be subscribed to Pro tier or above.
- `\"ulaw_8000\"': μ-law format (sometimes written mu-law, often approximated as
u-law) with 8kHz sample rate. Note that this format is commonly used for
Twilio audio inputs."
:group 'tlon-tts
:type '(cons (string :tag "Name") (string :tag "Extension")))
(defconst tlon-elevenlabs-audio-choices
'(("mp3_44100_32" . "mp3")
("mp3_44100_64" . "mp3")
("mp3_44100_96" . "mp3")
("mp3_44100_128" . "mp3")
("mp3_44100_192" . "mp3")
("pcm_16000" . "pcm")
("pcm_22050" . "pcm")
("pcm_24000" . "pcm")
("pcm_44100" . "pcm")
("ulaw_8000" . "ulaw"))
"Output formats and associated extensions for the ElevenLabs TTS service.
Each element is a cons cell whose car is the name of the output format and
whose cdr is the associated file extension.")
(defcustom tlon-elevenlabs-model
"eleven_multilingual_v2"
"Model to use for the ElevenLabs TTS.
Options are
- `\"eleven_monolingual_v1\"': \"Our very first model, English v1, set the
foundation for what's to come. This model was created specifically for English
and is the smallest and fastest model we offer. Trained on a focused,
English-only dataset, it quickly became the go-to choice for English-based
tasks. As our oldest model, it has undergone extensive optimization to ensure
reliable performance but it is also the most limited and generally the least
accurate.\"
- `\"eleven_multilingual_v1\"': \"Taking a step towards global access and usage,
we introduced Multilingual v1 as our second offering. Has been an experimental
model ever since release. To this day, it still remains in the experimental
phase. However, it paved the way for the future as we took what we learned to
improve the next iteration. Multilingual v1 currently supports a range of
languages.\"
- `\"eleven_multilingual_v2\"': \"Introducing our latest model, Multilingual v2,
which stands as a testament to our dedication to progress. This model is a
powerhouse, excelling in stability, language diversity, and accuracy in
replicating accents and voices. Its speed and agility are remarkable
considering its size.\"
- `\"eleven_turbo_v2\"': \"Using cutting-edge technology, this is a highly
optimized model for real-time applications that require very low latency, but
it still retains the fantastic quality offered in our other models. Even if
optimized for real-time and more conversational applications, we still
recommend testing it out for other applications as it is very versatile and
stable.\" As of 2024-07-15, it does not support multilingual voices.
- `\"eleven_turbo_v2_5\"': \"Turbo v2.5 generates human-like text to speech in
32 languages with low latency. We recommend Turbo v2.5 for users building real
time, conversational interfaces in non-English languages. It’s 300% faster
than Multilingual v2 and adds Vietnamese, Hungarian and Swedish to our
existing 29 languages. A highly optimized model, specifically tailored for
low-latency applications without sacrificing vocal performance and keeping
inline with the quality standard that people have come to expect from our
models. Because of its very optimized nature, it does have slightly lower
accuracy than multilingual V2 and is missing the style slider, which adds
latency when used. However, the accuracy is still very good when using a
properly created instant voice clone, and it is very stable.\"
<https://help.elevenlabs.io/hc/en-us/articles/17883183930129-What-models-do-you-offer-and-what-is-the-difference-between-them>"
:group 'tlon-tts
:type 'string)
;;;; Variables
;;;;; Paths
(defconst tlon-dir-tts
(file-name-concat (tlon-repo-lookup :dir :name "babel-core") "tts/")
"Directory for files related to text-to-speech functionality.
This is the \"tts/\" subdirectory of the value that `tlon-repo-lookup' returns
for `:dir' and the name \"babel-core\" (presumably the directory of the
repository with that name).")
(defconst tlon-file-global-abbreviations
(file-name-concat tlon-dir-tts "abbreviations.json")
"JSON file with abbreviations, located in `tlon-dir-tts'.")
(defconst tlon-file-global-phonetic-replacements
(file-name-concat tlon-dir-tts "phonetic-replacements.json")
"JSON file with phonetic replacements, located in `tlon-dir-tts'.")
(defconst tlon-file-global-phonetic-transcriptions
(file-name-concat tlon-dir-tts "phonetic-transcriptions.json")
"JSON file with phonetic transcriptions, located in `tlon-dir-tts'.")
;;;;; Staging buffer
(defconst tlon-tts-staging-buffer-formatter
"%1$sTTS: %s%1$s"
"Formatter for the name of the staging buffer for TTS processes.
The first placeholder is for the asterisks enclosing the buffer name, which may
or may not need to be escaped. The second placeholder is for the base of the
file name.
The formatter takes two arguments: `%1$s' occurs twice, so the first argument
is inserted both before and after the name, and the plain `%s' consumes the
second argument.")
;;;;;; Local variables
;; These forms have neither a value nor a docstring: they only declare the
;; variables as special, without giving them a value.
;; NOTE(review): judging by the section heading, they are presumably set as
;; file-local or buffer-local variables in the staging buffer; confirm against
;; the code that writes the local variables section.
(defvar tlon-tts-source)
(defvar tlon-tts-language)
(defvar tlon-tts-engine)
(defvar tlon-tts-audio)
(defvar tlon-tts-voice)
(defvar tlon-tts-voice-id)
(defvar tlon-tts-locale)
(defconst tlon-tts-local-variables-section-start
  "^<!-- Local Variables: -->"
  "Regexp matching the line that opens the local variables section.")
;;;;; SSML tag pairs & patterns
;;;;;; `voice'
(defconst tlon-tts-ssml-double-voice-replace-pattern
  (let ((placeholders (tlon-md-format-tag "voice" nil 'get-placeholders)))
    ;; The pair is used in reverse order: its cdr comes first, then the tag
    ;; to fill, then its car.
    (concat (cdr placeholders)
            (tlon-md-get-tag-to-fill "voice")
            (car placeholders)))
  "SSML pattern for the voice tag.
It has two placeholders for voice names and one placeholder for the text.")
;;;;;; common
(defconst tlon-tts-supported-tags
`((:tag break
:tlon t
:polly t
:azure t
:google t
:openai nil
:elevenlabs t
:if-unsupported remove
;; a one-element list, i.e. a cons cell whose cdr is nil: the entire tag is
;; removed (see the docstring)
:replacement (,(tlon-md-get-tag-pattern "break")))
(:tag emphasis
:tlon t
:polly nil
:azure nil ; https://bit.ly/azure-ssml-emphasis
:google t
:openai nil
:elevenlabs nil ; content is read, but tag is ignored
:if-unsupported remove
:replacement ,(tlon-md-get-tag-pattern "emphasis"))
(:tag lang
:tlon t
:polly t
:azure t
:google t
:openai nil
:elevenlabs nil ; content is read, but tag is ignored
:if-unsupported remove
:replacement ,(tlon-md-get-tag-pattern "lang"))
(:tag mark
:tlon nil
:polly t
:azure t
:google t
:openai nil
:if-unsupported remove)
(:tag p
:tlon nil
:polly t
:azure t
:google t
:openai nil
:if-unsupported remove)
(:tag phoneme
:tlon t
:polly t
:azure nil ; https://bit.ly/azure-ssml-phoneme
:google t
:openai nil
:elevenlabs nil
:if-unsupported remove
;; it works with v2 turbo but not with v2 multilingual, and turbo is
;; not currently multilingual
;; <https://elevenlabs.io/docs/speech-synthesis/prompting#pronunciation>
:replacement ,(tlon-md-get-tag-pattern "phoneme")
;; TODO: ideally it should be replaced by a mapping from IPA to
;; closest alphabetical equivalent
)
(:tag prosody
:tlon nil
:polly t ; partial support
:azure t
:google t
:openai nil
:if-unsupported remove)
(:tag s
:tlon nil
:polly t
:azure t
:google t
:openai nil
:if-unsupported remove)
(:tag say-as
:tlon t
:polly t ; partial support
:azure t
:google t
:openai nil
:elevenlabs nil ; content is sometimes read, sometimes not read
:if-unsupported remove
:replacement ,(tlon-md-get-tag-pattern "say-as"))
(:tag speak
:tlon t
:polly t
:azure t
:google t
:openai nil
:elevenlabs t ; I assume so?
:if-unsupported remove
;; :replacement ; Do we need a pattern for this tag, given it's only used in the wrapper?
)
(:tag sub
:tlon nil
:polly t
:azure t
:google t
:openai nil
:if-unsupported remove)
(:tag voice
:tlon t
:polly nil
:azure nil ; stopped working
:google t
:openai nil
:elevenlabs nil
:if-unsupported chunkify
:replacement ,(tlon-md-get-tag-pattern "voice"))
(:tag w
:tlon nil
:polly t
:azure t
:google t
:openai nil
:if-unsupported remove))
"SSML tags supported by this package and by various TTS engines.
Each element is a property list. `:tag' is the name of the tag; `:tlon',
`:polly', `:azure', `:google', `:openai' and `:elevenlabs' indicate whether the
tag is supported by this package and by each engine, respectively;
`:if-unsupported' is either `remove' or `chunkify'. Several entries have no
`:elevenlabs' property, so `plist-get' returns nil for them.
- Amazon Polly:
<https://docs.aws.amazon.com/polly/latest/dg/supportedtags.html>.
- Microsoft Azure:
- Google Cloud: <https://cloud.google.com/text-to-speech/docs/ssml>.
- OpenAI:
<https://community.openai.com/t/what-about-to-implement-ssml-on-the-new-tts-api-service/485686/5>.
- ElevenLabs: <https://elevenlabs.io/docs/speech-synthesis/prompting>. Only two
tags are explicitly mentioned, so maybe none of the others are supported?
The value of `:replacement' is either a regexp pattern to replace with its
second capture group when removing unsupported tags (via
`tlon-tts-remove-unsupported-tags'), or a cons cell whose car is the replacement
pattern and whose cdr is the number of the capture group to replace with; if
the cdr is nil, the entire tag is removed. We use the second capture group by
default because that is normally the group containing the text enclosed by the
tag.")
;;;;; Engine settings
;;;;;; Microsoft Azure
(defconst tlon-microsoft-azure-request
"curl -v --location --request POST 'https://eastus.tts.speech.microsoft.com/cognitiveservices/v1' \
--header 'Ocp-Apim-Subscription-Key: %s' \
--header 'Content-Type: application/ssml+xml' \
--header 'X-Microsoft-OutputFormat: %s' \
--header 'User-Agent: curl' \
--data-raw '%s' \
-o '%4$s'"
"Curl command to send a request to the Microsoft Azure text-to-speech engine.
The placeholders are, in order: API key, output format, SSML, and destination
for the audio file. There are only these four placeholders; the last one is
written with an explicit field number (`%4$s').
Note that the SSML is inserted between single quotes in the shell command.")
(defconst tlon-microsoft-azure-voices
  '((:id "es-US-AlonsoNeural"  :language "es" :gender "male"   :role "main")
    (:id "es-US-PalomaNeural"  :language "es" :gender "female" :role "main")
    (:id "es-CO-GonzaloNeural" :language "es" :gender "male")
    (:id "es-CO-SalomeNeural"  :language "es" :gender "female")
    (:id "es-MX-DaliaNeural"   :language "es" :gender "female" :role "alternate")
    (:id "es-MX-JorgeNeural"   :language "es" :gender "male"   :role "alternate")
    (:id "es-AR-TomasNeural"   :language "es" :gender "male")
    (:id "es-AR-ElenaNeural"   :language "es" :gender "female"))
  "Preferred Microsoft Azure voices for different languages.
Each element is a property list with the properties `:id', `:language' and
`:gender' and, for some voices, `:role'.
All the voices in this list are neural and multilingual, and are the best male
and female voices we were able to identify in each language.
For a list of the available voices, see
<https://github.com/MicrosoftDocs/azure-docs/blob/main/articles/ai-services/speech-service/includes/language-support/tts.md>.")
(defconst tlon-microsoft-azure-char-limit
  (let ((minutes 9)
        (seconds-per-minute 60)
        (chars-per-second 14))
    (* minutes seconds-per-minute chars-per-second))
  "Maximum number of characters that Microsoft Azure can process per request.
Microsoft Azure can process up to 10 minutes of audio at a time. The estimate
assumes 14 characters per second and is based on nine minutes rather than ten,
to err on the safe side.")
(defvar tlon-microsoft-azure-key nil
  "API key for the Microsoft Azure TTS service.
The initial value is nil.")
;;;;;; Google Cloud
(defconst tlon-google-cloud-request
"curl -H 'Authorization: Bearer %s' \
-H 'x-goog-user-project: api-project-781899662791' \
-H 'Content-Type: application/json; charset=utf-8' \
--data '%s' 'https://texttospeech.googleapis.com/v1/text:synthesize' | jq -r .audioContent | base64 --decode > '%s'"
"Curl command to send a request to the Google Cloud text-to-speech engine.
The placeholders are: token, JSON payload and destination.
The output of `curl' is piped to `jq', which extracts the `audioContent' field,
and then to `base64 --decode', whose output is written to the destination. The
value of the `x-goog-user-project' header is hard-coded in the command.")
(defconst tlon-google-cloud-voices
  '((:id "en-US-Studio-Q"  :language "en" :gender "male")
    (:id "en-US-Studio-O"  :language "en" :gender "female")
    (:id "es-US-Studio-B"  :language "es" :gender "male")
    (:id "es-US-Neural2-A" :language "es" :gender "female"))
  "Preferred Google Cloud voices for different languages.
Each element is a property list with the properties `:id', `:language' and
`:gender'.
The male voice is a \"studio\" voice, the highest quality voice type currently
offered by Google Cloud. Unfortunately, as of 2024-04-12, Google Cloud does not
offer a female studio voice for Spanish, so we use a \"neural\" voice.
For a list of the available voices, see
<https://cloud.google.com/text-to-speech/docs/voices>.")
(defconst tlon-google-cloud-char-limit (/ (* 5000 9) 10)
  "Maximum number of characters that Google Cloud can process per request.
Google Cloud TTS can process up to 5000 bytes per request. We use 90% of that
number to err on the safe side.
The value is computed with integer arithmetic, so that it is an integer rather
than a float: a character count is a whole number, and a float is not a valid
buffer position if the limit is ever added to one.
See <https://cloud.google.com/text-to-speech/quotas>.")
(defvar tlon-google-cloud-key nil
  "API key for the Google Cloud TTS service.
The initial value is nil.")
;;;;;; Amazon Polly
(defconst tlon-amazon-polly-request
"aws polly synthesize-speech \
--output-format %s \
--voice-id %s \
--engine neural \
--text-type ssml \
--text '<speak>%s</speak>' \
--region %s \
'%s'"
"AWS command to synthesize speech using Amazon Polly.
The placeholders are: output format, voice ID, SSML, region, and destination
file.
The command always uses the `neural' engine and the `ssml' text type, and it
wraps the SSML placeholder in a `speak' element, so the text passed in must not
include that element itself.")
(defconst tlon-amazon-polly-voices
  '((:id "Joanna"  :language "en" :gender "female")
    (:id "Matthew" :language "en" :gender "male")
    (:id "Lupe"    :language "es" :gender "female")
    (:id "Pedro"   :language "es" :gender "male"))
  "Preferred Amazon Polly voices for different languages.
Each element is a property list with the properties `:id', `:language' and
`:gender'. Joanna and Matthew are some of the available Polly voices for
English.")
(defconst tlon-amazon-polly-char-limit (/ (* 1500 9) 10)
  "Maximum number of characters that Amazon Polly can process per request.
The limit for Amazon Polly is 1500 characters, but we use 90% of that number to
err on the safe side.
The value is computed with integer arithmetic, so that it is an integer rather
than a float: a character count is a whole number, and a float is not a valid
buffer position if the limit is ever added to one.")
(defvar tlon-amazon-polly-region
  "us-east-1"
  "AWS region used by default for Amazon Polly requests.")
;;;;;; OpenAI
(defconst tlon-openai-tts-request
"curl https://api.openai.com/v1/audio/speech \
-H \"Authorization: Bearer %s\" \
-H \"Content-Type: application/json\" \
-d '{
\"model\": \"%s\",
\"input\": \"%s\",
\"voice\": \"%s\"
}' \
--output '%s'"
"Curl command to send a request to the OpenAI text-to-speech engine.
The placeholders are the API key, the TTS model, the text to be synthesized, the
voice, and the file destination.
Note that the text is inserted inside a JSON string literal which is itself
enclosed in single quotes in the shell command.")
(defconst tlon-openai-voices
  '((:id "echo" :language "es" :gender "male")
    (:id "nova" :language "es" :gender "female"))
  "Preferred OpenAI voices for different languages.
Each element is a property list with the properties `:id', `:language' and
`:gender'.
All the voices in this list are neural and multilingual, and are the best male
and female voices we were able to identify in each language.
For a list of the available voices, see
<https://platform.openai.com/docs/guides/text-to-speech>.")
(defconst tlon-openai-char-limit (/ (* 4096 9) 10)
  "Maximum number of characters that OpenAI can process per request.
OpenAI can process up to 4096 bytes per request. We use 90% of that number,
rounded down, to err on the safe side.
The value is computed with integer arithmetic, so that it is an integer rather
than the fractional float 3686.4: a character count is a whole number, and a
float is not a valid buffer position if the limit is ever added to one.
See <https://help.openai.com/en/articles/8555505-tts-api#h_273e638099>.")
(defvar tlon-openai-key nil
  "API key for the OpenAI TTS service.
The initial value is nil.")
;;;;;; ElevenLabs
(defconst tlon-elevenlabs-voices
  '((:name "Brian"
     :id "rncjssM0aAEg1ApKehUP" :language "multilingual" :gender "male")
    (:name "Bruce"
     :id "qUqZ27WoGID6BUp35xTV" :language "multilingual" :gender "male" :role "main")
    (:name "Hades"
     :id "y3uxYtdWYpmzg8Wwx2k3" :language "multilingual" :gender "male")
    (:name "Michael"
     :id "8mLUlN9GCPCERe4bI7Wx" :language "multilingual" :gender "male" :role "alternate")
    (:name "Neal"
     :id "6JpiWMuXFTicEyWjwDLn" :language "multilingual" :gender "male")
    (:name "Amelia"
     :id "Lpn2A60EAsgGCWjFue20" :language "multilingual" :gender "female" :role "alternate")
    (:name "Victoria"
     :id "lm0dJr2LmYD4zn0kFH9E" :language "multilingual" :gender "female" :role "main")
    (:name "Mariluz"
     :id "m1VE7dnwBN0zMer3LcKv" :language "multilingual" :gender "female" :role "main")
    (:name "Ricardo"
     :id "CoAqFXxZEa3kpJmE7rDr" :language "multilingual" :gender "male" :role "main"))
  "Preferred ElevenLabs voices for different languages.
Each element is a property list with the properties `:name', `:id', `:language'
and `:gender' and, for some voices, `:role'.
For a list of the available voices, see
<https://elevenlabs.io/app/voice-library>. To get information about the voices,
including the voice ID, run `tlon-tts-elevenlabs-get-voices'.")
(defconst tlon-elevenlabs-char-limit (/ (* 5000 9) 10)
  "Maximum number of characters that ElevenLabs can process per request.
ElevenLabs can process up to 5000 characters per request. We use 90% of that
number to err on the safe side.
The value is computed with integer arithmetic, so that it is an integer rather
than a float: a character count is a whole number, and a float is not a valid
buffer position if the limit is ever added to one.
See <https://elevenlabs.io/app/subscription> (scroll down to \"Frequently asked
questions\").")
(defconst tlon-elevenlabs-tts-url
  "https://api.elevenlabs.io/v1/text-to-speech/%s/stream?output_format=%s"
  "Base URL for the ElevenLabs TTS API.
The URL has two placeholders: the first one is in the path, right after
\"text-to-speech/\", and the second one is the value of the `output_format'
query parameter.")
(defvar tlon-elevenlabs-key nil
  "API key for the ElevenLabs TTS service.
The initial value is nil.")
;;;;; Engines
(defconst tlon-tts-engines
`((:name "Microsoft Azure"
:voices-var tlon-microsoft-azure-voices
:audio-var tlon-microsoft-azure-audio-settings
;; unlike the two properties above, this one is unquoted with a comma, so it
;; holds the list of choices itself rather than the name of the variable
:choices-var ,tlon-microsoft-azure-audio-choices
:request-fun tlon-tts-microsoft-azure-make-request
:char-limit ,tlon-microsoft-azure-char-limit
:property :azure)
(:name "Google Cloud"
:voices-var tlon-google-cloud-voices
:audio-var tlon-google-cloud-audio-settings
:choices-var ,tlon-google-cloud-audio-choices
:request-fun tlon-tts-google-cloud-make-request
:char-limit ,tlon-google-cloud-char-limit
:property :google)
(:name "Amazon Polly"
:voices-var tlon-amazon-polly-voices
:audio-var tlon-amazon-polly-audio-settings
:choices-var ,tlon-amazon-polly-audio-choices
:request-fun tlon-tts-amazon-polly-make-request
:char-limit ,tlon-amazon-polly-char-limit
:property :polly)
;; this entry has no `:choices-var' property
(:name "OpenAI"
:voices-var tlon-openai-voices
:audio-var tlon-openai-audio-settings
:request-fun tlon-tts-openai-make-request
:char-limit ,tlon-openai-char-limit
:property :openai)
(:name "ElevenLabs"
:voices-var tlon-elevenlabs-voices
:audio-var tlon-elevenlabs-audio-settings
:choices-var ,tlon-elevenlabs-audio-choices
:request-fun tlon-tts-elevenlabs-make-request
:char-limit ,tlon-elevenlabs-char-limit
:property :elevenlabs))
"Text-to-speech engines and associated properties.
Each element is a property list with the following properties:
- `:name': the name of the engine, as a string.
- `:voices-var': the symbol of the variable holding the voices of the engine.
- `:audio-var': the symbol of the variable holding the audio settings of the
engine.
- `:choices-var': the list of audio choices of the engine. Despite the name of
the property, its value is the list itself, evaluated when this constant is
defined, rather than a symbol. The \"OpenAI\" entry lacks this property.
- `:request-fun': the symbol of the function that makes the request.
- `:char-limit': the maximum number of characters per request, evaluated when
this constant is defined.
- `:property': the keyword that identifies the engine in
`tlon-tts-supported-tags'.")
;; needs to use double quotes for Azure, but maybe single quotes for Google Cloud?
;; cannot be in single quotes because the entire string is itself enclosed in single quotes
(defconst tlon-ssml-wrapper
  (let ((template "<speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"%%s\">%s</speak>")
        wrappers)
    ;; Build one (SERVICE . WRAPPER) pair per service, keeping the order in
    ;; which the services are listed.
    (dolist (service '("Microsoft Azure" "Google Cloud") (nreverse wrappers))
      (push (cons service
                  (format template
                          (tlon-md-return-tag "voice" '("%s") "%s" 'get-values)))
            wrappers)))
  "SSML wrapper for the TTS request.
The value is an alist whose keys are engine names and whose values are the
corresponding wrappers. In each wrapper, the `%%s' of the template becomes a
`%s' placeholder for the value of the `xml:lang' attribute, and the `speak'
element encloses the `voice' tag returned by `tlon-md-return-tag'.")
;;;;; `ffmpeg'
(defconst tlon-tts-ffmpeg-convert
  "ffmpeg -i \"%s\" -acodec libmp3lame -ar 44100 -b:a 128k -ac 1 \"%s\""
  "Command to convert an audio file to MP3 format with settings optimized for tts.
The command has two placeholders: the input file and the output file, in that
order. Both are enclosed in double quotes.")
;;;;; Report
(defconst tlon-tts-report-buffer-name "*TTS Report*"
  "Name of the buffer used for the TTS report.")
(defconst tlon-tts-maybe-chemical-symbol "</SmallCaps><sub>"
  "Pattern that may match a chemical symbol.
It matches a closing `SmallCaps' tag immediately followed by an opening `sub'
tag.")
;;;;; Links
(defconst tlon-tts-self-referential-link
  ;; The Arabic replacement used to be truncated: it lacked the last letter of
  ;; the word for "web" and the closing parenthesis that all the other
  ;; replacements have.
  '(("ar" . ("هنا" . "هنا (في النص هناك رابط يشير إلى صفحة ويب)"))
    ("de" . ("hier" . "hier (im Text gibt es einen Link, der auf eine Webseite zeigt)"))
    ("en" . ("here" . "here (in the text there is a link here pointing to a web page)"))
    ("es" . ("aquí" . "aquí (en el texto hay un enlace que apunta a una página web)"))
    ("fr" . ("ici" . "ici (dans le texte, il y a un lien qui pointe vers une page web)"))
    ("it" . ("qui" . "qui (nel testo c'è un link che punta a una pagina web)"))
    ("ja" . ("ここ" . "ここ (テキストには、ウェブページを指すリンクがあります)"))
    ("ko" . ("여기" . "여기 (텍스트에는 웹 페이지를 가리키는 링크가 있습니다)")))
  "List of self-referential link texts and replacements.
Each element has the form (LANGUAGE . (TEXT . REPLACEMENT)), where LANGUAGE is
a two-letter language code, TEXT is the text of the self-referential link and
REPLACEMENT is the same text followed by a parenthetical explanation.")
;;;;; Tables
(defconst tlon-tts-table-cell
"\\(?: .*?|\\)"
"Regular expression to match a table cell.")
(defconst tlon-tts-table-row
(format "\\(?:^|%s+\n\\)" tlon-tts-table-cell)
"Regular expression to match a table row.")
(defconst tlon-tts-table-pattern
  (concat "\\(?1:" tlon-tts-table-row "+\\)")
  "Regular expression matching a whole table.
A table is one or more consecutive rows as defined by `tlon-tts-table-row'.
The entire table is captured in group 1.")
(defconst tlon-tts-table-separator
  "|\\(?: ?-+ ?|\\)+\n"
  "Regular expression matching a table separator line.
The line consists of `|' followed by one or more segments, each made of one
or more dashes, optionally surrounded by a single space on either side, and
closed by `|'.  The match includes the trailing newline.")
(defconst tlon-tts-table-header
  (concat "\\(?1:" tlon-tts-table-row "\\)" tlon-tts-table-separator)
  "Regular expression matching a table header.
The expression matches one row, as defined by `tlon-tts-table-row', followed
by the separator line, as defined by `tlon-tts-table-separator'.  Only the
header row is captured, in group 1.")
;;;;; Numbers
(defconst tlon-tts-regular-exponent-pattern
  '(("es" . "a la %s")
    ("en" . "to the power of %s"))
  "Alist of language codes and format strings for regular exponents.
The `%s' placeholder in each format string stands for the exponent.")
(defconst tlon-tts-irregular-exponents
  '((2 ("es" . "al cuadrado")
       ("en" . "squared"))
    (3 ("es" . "al cubo")
       ("en" . "cubed"))
    (4 ("es" . "a la cuarta"))
    (5 ("es" . "a la quinta"))
    (6 ("es" . "a la sexta"))
    (7 ("es" . "a la séptima"))
    (8 ("es" . "a la octava"))
    (9 ("es" . "a la novena")))
  "Alist of exponents and their irregular verbal equivalents.
Each element has the form (EXPONENT . TRANSLATIONS), where EXPONENT is an
integer and TRANSLATIONS is an alist of language codes and the phrase to be
read for that exponent.  Not every language has an entry for every exponent.")
(defconst tlon-tts-80000
  '(("en" . "Eighty thousand")
    ("es" . "Ochenta mil")
    ("pt" . "Oitenta mil")
    ("fr" . "Quatre-vingt mille")
    ("de" . "Achtzigtausend")
    ("it" . "Ottantamila"))
  "Alist of language codes and the number 80000 spelled out in words.")
;;;;; Currencies
(defconst tlon-tts-currencies
  '(("₿" . (("en" . ("bitcoin" . "bitcoins"))
            ("es" . ("bitcoin" . "bitcoins"))))
    ("Ξ" . (("en" . ("ether" . "ether"))
            ("es" . ("éter" . "éter"))))
    ("£" . (("en" . ("pound" . "pounds"))
            ("es" . ("libra" . "libras"))))
    ("₪" . (("en" . ("shekel" . "shekels"))
            ("es" . ("séquel" . "séqueles")))) ; https://www.fundeu.es/recomendacion/shekel-shequel-sekel-sequel/
    ("₹" . (("en" . ("rupee" . "rupees"))
            ("es" . ("rupia" . "rupias"))))
    ;; The English plural of "yen" is invariable.
    ("¥" . (("en" . ("yen" . "yen"))
            ("es" . ("yen" . "yenes"))))
    ("$" . (("en" . ("dollar" . "dollars"))
            ("es" . ("dólar" . "dólares")))))
  "Alist of currency symbols and their spoken names in each language.
Each element has the form (SYMBOL . NAMES), where SYMBOL is the currency
symbol and NAMES is an alist whose elements have the form
\(LANGUAGE . (SINGULAR . PLURAL)): LANGUAGE is a two-letter language code, and
SINGULAR and PLURAL are the singular and plural names of the currency in that
language.")
;;;;; Tag sections
(defconst tlon-tts-tag-section-names
  '(("es" . "\\(?:más información\\|entradas relacionadas\\|enlaces externos\\)")
    ("en" . "\\(?:further reading\\|related entries\\|external links\\)"))
  "Alist of language codes and regexps matching the sections to be removed.
Each regexp is a non-capturing group of alternatives, one per section name in
the corresponding language.")
(defconst tlon-tts-tag-section-patterns
  (mapcar (lambda (entry)
            ;; ENTRY is (LANGUAGE . NAMES-REGEXP); wrap the names in a
            ;; line-anchored level-two heading captured as group 1.
            (cons (car entry)
                  (format "^\\(?1:## %s\\)" (cdr entry))))
          tlon-tts-tag-section-names)
  "Alist of language codes and regexps matching tag-section headings.
Each regexp matches, at the beginning of a line, a `## ' heading whose title
is one of the names in `tlon-tts-tag-section-names' for that language, and
captures the heading in group 1.  The match marks the start of the section to
be removed, which extends until the end of the buffer.")
;;;;; File-local variables
;;;;;; File variables
;; These are the file-local variables whose values we read from the source files
(defvar-local tlon-local-abbreviations nil
  "File-local list of abbreviations and their spoken equivalents.")
(defvar-local tlon-local-replacements nil
  "File-local list of replacements.")
(defvar-local tlon-tts-voice-chunks nil
  "Buffer-local list of voice chunk data.
Each element is a cons cell whose car is the position at which a voice chunk
begins and whose cdr is the voice to be used to narrate that chunk.")
;;;;;; Chunk processing
;; TODO: decide if they should be file-local variables
(defvar tlon-tts-chunks nil
  "Chunks of text to be narrated.
This variable is deliberately not reset when the TTS process finishes, so that
its value remains available for debugging.")
(defvar tlon-tts-chunks-to-process 0
  "Number of chunks that remain to be processed.")
;;;;;; Local abbrevs
;; These are the variables that store the file-local variable values for a particular tts session
;; TODO: decide if they should also be file-local
(defvar tlon-local-abbreviations-for-session nil
  "Value of `tlon-local-abbreviations' for the file being processed.
Unlike `tlon-local-abbreviations', this variable is not buffer-local.")
(defvar tlon-local-replacements-for-session nil
  "Value of `tlon-local-replacements' for the file being processed.
Unlike `tlon-local-replacements', this variable is not buffer-local.")
;;;;; Listener cues
(defconst tlon-tts-cue-delimiter
  ""
  "Delimiter for listener cues.
The value is currently the empty string.")
(defconst tlon-tts-listener-cues
'((aside
("ar" "في الجانب." . "\nنهاية الجانب.")
("de" "Beiseite." . "\nEnde des Beiseites.")
("en" "Aside." . "\nEnd of the aside.")
("es" "Inciso." . "\nFin del inciso.")
("fr" "Incise." . "\nFin de l'incise.")
("it" "Parentesi." . "\nFine della parentesi.")
("ja" "横に." . "\n横の終わり.")
("ko" "옆에." . "\n옆의 끝."))
(blockquote
("ar" "اقتباس." . "\nنهاية الاقتباس.")
("de" "Zitat." . "\nEnde des Zitats.")
("en" "Quote." . "\nEnd of quote.")
("es" "Cita." . "\nFin de la cita.")
("fr" "Citation." . "\nFin de la citation.")
("it" "Citazione." . "\nFine della citazione.")
("ja" "引用." . "\n引用の終わり.")
("ko" "인용문." . "\n인용문의 끝."))
(heading
("ar" "عنوان." . "")
("de" "Überschrift." . "")
("en" "Heading." . "")
("es" "Sección." ."")
("fr" "Titre." . "")
("it" "Titolo." . "")
("ko" "제목." . "")
("ja" "見出し." . ""))
(image
("ar" "هناك صورة هنا." . "\nنهاية الصورة.")
("de" "Hier ist ein Bild." . "\nEnde des Bildes.")
("en" "There’s an image here." . "\nEnd of image.")
("es" "Aquí hay una imagen." . "\nFin de la imagen.")
("fr" "Il y a une image ici." . "\nFin de l'image.")
("it" "C'è un'immagine qui." . "\nFine dell'immagine.")
("ja" "ここに画像があります。" . "\n画像の終わり。")
("ko" "여기에 이미지가 있습니다." . "\n이미지의 끝."))
(image-caption
("ar" . " تتبع الصورة تعليقًا يقول: ")
("de" . " Das Bild wird von einer Bildunterschrift gefolgt, die lautet: ")
("en" . " The image is followed by a caption that reads: ")
("es" . " A la imagen le sigue una leyenda que dice: ")
("fr" . " L'image est suivie d'une légende qui dit: ")
("it" . " L'immagine è seguita da una didascalia che recita: ")
("ja" . " 画像の後には次のようなキャプションが続きます: ")
("ko" . " 이미지 뒤에는 다음과 같은 캡션이 있습니다: "))
(note
("ar" "ملاحظة." . "\nنهاية الملاحظة.")
("de" "Anmerkung." . "\nEnde der Anmerkung.")
("en" "Note." . "\nEnd of the note.")
("es" "Nota." . "\nFin de la nota.")
("fr" "Note." . "\nFin de la note.")
("it" "Nota." . "\nFine della nota.")
("ja" "ノート." . "\nノートの終わり.")
("ko" "노트." . "\n노트의 끝."))
(owid
("ar" "رسم بياني." . "\nنهاية الرسم البياني.")
("de" "Diagramm." . "\nEnde des Diagramms.")
("en" "Chart." . "\nEnd of chart.")
("es" "Cuadro." . "\nFin del cuadro.")
("fr" "Tableau." . "\nFin du tableau.")
("it" "Grafico." . "\nFine del grafico.")
("ja" "チャート." . "\nチャートの終わり.")
("ko" "차트." . "\n차트의 끝."))
(quote
("ar" "(اقتباس)" . "(نهاية الاقتباس)")
("de" "(Zitat)" . "(Ende des Zitats)")
("en" "(quote)" . "(End of quote)")
("es" "(cita)" . "(Fin de la cita)")
("fr" "(citation)" . "(Fin de la citation)")
("it" "(citazione)" . "(Fine della citazione")
("ja" "(引用)" . "(引用の終わり")
("ko" "(인용문)" . "(인용문의 끝)"))
(subheading
("ar" "عنوان فرعي." . "")
("de" "Unterüberschrift." . "")
("en" "Subheading." . "")
("es" "Subsección." . "")
("fr" "Sous-titre." . "")
("it" "Sottotitolo." . "")
("ko" "소제목." . "")
("ja" "サブタイトル." . ""))
(table
("ar" "هناك جدول هنا.\n" . "\nنهاية الجدول.")
("de" "Hier ist eine Tabelle.\n" . "\nEnde der Tabelle.")
("en" "There’s a table here.\n" . "\nEnd of the table.")
("es" "Aquí hay una tabla.\n" . "\nFin de la tabla.")
("fr" "Il y a un tableau ici.\n" . "\nFin du tableau.")
("it" "C'è una tabella qui.\n" . "\nFine della tabella.")