-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeech-in-speech-out.cc
282 lines (247 loc) · 7.16 KB
/
speech-in-speech-out.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#include <assert.h>
#include <cstdlib>
#include <curses.h>
#include "deepspeech.h"
#include <espeak-ng/speak_lib.h>
#include <errno.h>
#include <portaudio.h>
#include <stdio.h>
#include <string>
#include <unistd.h>
// Set-up hook; the constructor attribute makes it run automatically before main().
static void eff_initialize (void) __attribute__((constructor));
// Forward declarations: the audio helpers below print their prompts through
// eff_fprintf() before its definition appears in the file.
static int eff_fprintf(FILE * __restrict__ stream, const char * __restrict__ format, ...);
static int eff_vfprintf(FILE * __restrict__ stream, const char * __restrict__ format, va_list ap);
// Capture-buffer sizing: samples per second and the longest recording kept, in seconds.
static constexpr unsigned sample_rate = 16000;
static constexpr unsigned max_seconds = 5;
// Capacity of the recording buffer, in samples.
static constexpr size_t buffer_len = sample_rate * max_seconds;
// DeepSpeech model and scorer files handed to DS_CreateModel() and
// DS_EnableExternalScorer(). The paths are relative.
static const char* model_path = "deepspeech-0.8.1-models.pbmm";
static const char* scorer_path = "deepspeech-0.8.1-models.scorer";
// Voice name handed to espeak_SetVoiceByName().
static const char* language = "English";
// Output mode handed to espeak_Initialize().
static const espeak_AUDIO_OUTPUT output = AUDIO_OUTPUT_SYNCH_PLAYBACK;
// State shared between the PortAudio callbacks (record/play) and the code that
// drives them. Member order matters: the single instance in this file is
// initialized positionally.
struct data_t
{
// Position of the next sample to write (while recording) or read (while playing).
size_t index;
// Capacity of `buffer`, in samples.
size_t length;
// Number of samples captured by the most recent recording; set by play_audio().
size_t rec_length;
// Signed 16-bit samples.
int16_t buffer[buffer_len];
};
// All state below is thread_local: every thread gets its own copy, and
// eff_initialize() only sets up the copies belonging to the thread it runs on.

// True when stdout is a terminal; gates every speech/curses code path.
thread_local static bool is_enabled = false;
// Recording buffer: index 0, capacity buffer_len, nothing recorded yet, samples zeroed.
thread_local static data_t data = { 0, buffer_len, 0, { 0 }};
// PortAudio stream handle, reopened for each recording and each playback.
thread_local static PaStream *stream = nullptr;
// curses window returned by initscr().
thread_local static WINDOW *w = nullptr;
// DeepSpeech model created by DS_CreateModel().
thread_local static ModelState *ctx = nullptr;
/**
 * PortAudio input callback: appends captured 16-bit samples to a data_t buffer.
 *
 * As many frames as still fit are stored, so the buffer can be filled up to
 * its last slot instead of dropping the whole final chunk.
 *
 * @param in              captured samples (read as int16_t); must not be NULL
 * @param out             unused
 * @param framesPerBuffer number of samples available in `in`
 * @param timeInfo        unused
 * @param flags           unused
 * @param data            the data_t receiving the samples; must not be NULL
 * @return paContinue while free space remains, paComplete once the buffer is full
 */
static int
record(const void* in,
       __attribute__((unused)) void *out,
       unsigned long framesPerBuffer,
       __attribute__((unused)) const PaStreamCallbackTimeInfo *timeInfo,
       __attribute__((unused)) PaStreamCallbackFlags flags,
       void *data)
{
  assert(data && in);
  struct data_t *_local = static_cast<struct data_t *>(data);
  const int16_t *_in = static_cast<const int16_t *>(in);

  // TODO: maybe a circular buffer in the future?
  // Free slots left; guarded so a corrupted index can never underflow the subtraction.
  const size_t room = _local->index < _local->length ? _local->length - _local->index : 0;
  const size_t frames = framesPerBuffer < room ? static_cast<size_t>(framesPerBuffer) : room;

  for (size_t i = 0; i < frames; i++)
  {
    _local->buffer[_local->index++] = _in[i];
  }

  return _local->index < _local->length ? paContinue : paComplete;
}
/**
 * PortAudio output callback: feeds the recorded samples to the output device.
 *
 * Playback stops at the end of the captured audio (`rec_length`, clamped to
 * the buffer capacity) rather than at the capacity itself, so stale samples
 * from an earlier, longer recording are never played. Any part of the output
 * chunk that is not covered by recorded samples is filled with silence.
 *
 * @param in              unused
 * @param out             output chunk to fill (written as int16_t); must not be NULL
 * @param framesPerBuffer number of samples requested
 * @param timeInfo        unused
 * @param flags           unused
 * @param data            the data_t holding the recording; must not be NULL
 * @return paContinue while samples remain, paComplete once the recording is exhausted
 */
static int
play(__attribute__((unused)) const void* in,
     void *out,
     unsigned long framesPerBuffer,
     __attribute__((unused)) const PaStreamCallbackTimeInfo *timeInfo,
     __attribute__((unused)) PaStreamCallbackFlags flags,
     void *data)
{
  assert(data && out);
  struct data_t *_local = static_cast<struct data_t *>(data);
  int16_t *_out = static_cast<int16_t *>(out);

  // TODO: maybe a circular buffer in the future?
  const size_t end = _local->rec_length < _local->length ? _local->rec_length
                                                         : _local->length;

  unsigned long i = 0;
  for (; i < framesPerBuffer && _local->index < end; i++)
  {
    _out[i] = _local->buffer[_local->index++];
  }

  // Pad the rest of the chunk so the device never plays unwritten memory.
  for (; i < framesPerBuffer; i++)
  {
    _out[i] = 0;
  }

  return _local->index < end ? paContinue : paComplete;
}
static void
record_audio()
{
assert(NULL != w);
int retval = 0;
// reset buffer
data.index = 0;
retval = keypad(w, false);
assert(ERR != retval);
wtimeout(w, -1);
retval = eff_fprintf(stdout, "Please press and hold the space bar to record your response\n");
assert(0 != retval);
wgetch(w);
wtimeout(w, 500);
retval = cbreak();
assert(ERR != retval);
retval = noecho();
assert(ERR != retval);
// Recording
retval = Pa_OpenDefaultStream(&stream, 1, 0, paInt16, 16000, paFramesPerBufferUnspecified, record, &data);
assert(0 == retval);
retval = Pa_StartStream(stream);
assert(0 == retval);
while (wgetch(w) == ' ') ;
wtimeout(w, -1);
retval = echo();
assert(ERR != retval);
retval = Pa_StopStream(stream);
assert(0 == retval);
retval = Pa_CloseStream(stream);
assert(0 == retval);
}
static void
play_audio()
{
// record buffer length
assert(NULL != w);
data.rec_length = data.index;
data.index = 0;
int retval = eff_fprintf(stdout, "You will now hear a recording of your response\n");
assert(0 != retval);
retval = Pa_OpenDefaultStream(&stream, 0, 1, paInt16, 16000, paFramesPerBufferUnspecified, play, &data);
assert(0 == retval);
retval = Pa_StartStream(stream);
assert(0 == retval);
Pa_Sleep(data.rec_length / 16);
retval = Pa_StopStream(stream);
assert(0 == retval);
retval = Pa_CloseStream(stream);
assert(0 == retval);
}
/**
 * Asks whether the user wants to record the response again.
 *
 * Enables keypad and cbreak mode, prints the question, reads one key,
 * refreshes the window and switches keypad mode back off.
 *
 * @return true when the key read was '\n', false for any other key
 */
static bool
is_retry()
{
  assert(NULL != w);
  int retval = keypad(w, true);
  assert(ERR != retval);
  retval = cbreak();
  assert(ERR != retval);
  retval = eff_fprintf(stdout, "Press enter if you want to record again. Press any other key to continue.\n");
  assert(0 <= retval);  // printf-family errors are negative

  const int key = wgetch(w);
  assert(ERR != key);  // check the key that was just read, not the previous status

  retval = wrefresh(w);
  assert(ERR != retval);
  retval = keypad(w, false);
  assert(ERR != retval);
  return key == '\n';
}
/**
 * Reads one line from `stream` with fgets() semantics, built on getc().
 *
 * This file defines its own fgets() that forwards to eff_fgets(), so the
 * pass-through path must not call fgets() itself: that would recurse forever.
 *
 * @return `str`, or NULL when nothing could be read (EOF, error, bad arguments)
 */
static char*
eff_plain_fgets(char *str, int size, FILE *stream)
{
  if (NULL == str || size <= 0) return NULL;

  int used = 0;
  while (used < size - 1)
  {
    const int c = getc(stream);
    if (EOF == c)
    {
      // End of input before any character, or a read error: report failure.
      if (0 == used || ferror(stream)) return NULL;
      break;
    }
    str[used++] = static_cast<char>(c);
    if ('\n' == c) break;
  }
  str[used] = '\0';
  return str;
}

/**
 * fgets() replacement that takes spoken input when reading from stdin.
 *
 * When the speech front end is disabled, or `stream` is not stdin, a plain
 * line read is performed. Otherwise the user is recorded, the recording is
 * played back, DeepSpeech transcribes it, and the transcription is copied
 * into `str`; this repeats for as long as is_retry() reports that the user
 * wants another attempt.
 *
 * @param str    destination buffer
 * @param size   capacity of `str` in bytes, including the terminating NUL
 * @param stream stream to read from
 * @return `str` on success, NULL on failure or when `size` is not positive.
 *         The transcription is truncated to fit and always NUL-terminated;
 *         no trailing newline is appended.
 */
static char*
eff_fgets(char * __restrict__ str, int size, FILE * __restrict__ stream)
{
  if (!is_enabled || stream != stdin) return eff_plain_fgets(str, size, stream);
  if (NULL == str || size <= 0) return NULL;

  assert(NULL != w);

  int retval = 0;
  do
  {
    record_audio();
    play_audio();

    const unsigned int buffer_size = static_cast<unsigned int>(data.rec_length);
    char *input = DS_SpeechToText(ctx, data.buffer, buffer_size);
    assert(NULL != input);

    retval = eff_fprintf(stdout, "Deep-speech understood: %s\n", input);
    assert(0 <= retval);
    retval = wrefresh(w);
    assert(ERR != retval);

    // snprintf always terminates the destination, unlike strncpy on truncation.
    snprintf(str, static_cast<size_t>(size), "%s", input);
    DS_FreeString(input);
  } while (is_retry());

  return str;
}
/**
 * vfprintf() replacement that shows and speaks text written to stdout.
 *
 * When the speech front end is disabled, or `stream` is not stdout, the call
 * is forwarded to vfprintf(). Otherwise the text is formatted into a
 * temporary string, printed to the curses window, and passed to
 * espeak_Synth().
 *
 * @param stream destination stream; must not be NULL
 * @param format printf-style format string; must not be NULL
 * @param ap     arguments for `format`
 * @return number of characters in the formatted text, or a negative value
 *         when formatting fails
 */
static int
eff_vfprintf(FILE * __restrict__ stream, const char * __restrict__ format, va_list ap)
{
  assert(stream && format);
  if (!is_enabled || stream != stdout) return vfprintf(stream, format, ap);

  assert(NULL != w);

  char *temp = NULL;
  const int retval = vasprintf(&temp, format, ap);
  // On failure the pointer is not meaningful, so judge by the return value first.
  assert(0 <= retval && NULL != temp);
  if (retval < 0 || NULL == temp) return -1;

  // The text is already formatted: print it as data so that any '%' it
  // contains is not interpreted as a conversion.
  int status = wprintw(w, "%s", temp);
  assert(ERR != status);
  status = wrefresh(w);
  assert(ERR != status);

  // The size passed to eSpeak includes the terminating NUL.
  status = espeak_Synth(temp, strlen(temp) + 1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
  assert(EE_OK == status);

  free(temp);
  return retval;
}
/**
 * Variadic front end used by this file for its own prompts.
 *
 * Output aimed at stdout while the speech front end is enabled goes through
 * eff_vfprintf(); everything else goes straight to vfprintf().
 *
 * @param stream destination stream; must not be NULL
 * @param format printf-style format string; must not be NULL
 * @return whatever the selected formatter returns
 */
static int
eff_fprintf(FILE * __restrict__ stream, const char * __restrict__ format, ...)
{
  assert(stream && format);

  va_list ap;
  va_start(ap, format);

  const bool spoken = is_enabled && stream == stdout;
  const int written = spoken ? eff_vfprintf(stream, format, ap)
                             : vfprintf(stream, format, ap);

  va_end(ap);
  return written;
}
static void eff_shutdown();
static void
eff_initialize()
{
int fd = fileno(stdout);
is_enabled = isatty(fd);
if (!is_enabled) return;
// We are only going to do this if isatty
int retval = Pa_Initialize();
assert(0 == retval);
w = initscr();
assert(NULL != w);
retval = DS_CreateModel(model_path, &ctx);
assert(0 == retval);
retval = DS_EnableExternalScorer(ctx, scorer_path);
assert(0 == retval);
retval = espeak_Initialize(output, 0, NULL, 0);
assert(0 < retval);
retval = espeak_SetVoiceByName(language);
assert(0 < retval);
atexit(eff_shutdown);
}
static void
eff_shutdown()
{
if (!is_enabled) return;
// Once per program...
int retval = Pa_Terminate();
assert(0 == retval);
retval = endwin();
assert(ERR != retval);
DS_FreeModel(ctx);
}
/**
 * Replacement for the C library's fprintf(): packs the variadic arguments
 * into a va_list and lets eff_vfprintf() decide how to emit the text.
 *
 * @return the value returned by eff_vfprintf()
 */
int
fprintf(FILE * __restrict__ stream, const char * __restrict__ format, ...)
{
  va_list ap;
  va_start(ap, format);
  const int written = eff_vfprintf(stream, format, ap);
  va_end(ap);
  return written;
}
/**
 * Replacement for the C library's fgets(): every call is handed to
 * eff_fgets() with the arguments unchanged.
 *
 * @return the value returned by eff_fgets()
 */
char*
fgets(char * __restrict__ str, int size, FILE * __restrict__ stream)
{
  char *const line = eff_fgets(str, size, stream);
  return line;
}