From e349b962999302c5717890843e2211769e30257b Mon Sep 17 00:00:00 2001
From: Roy Shilkrot <roy.shil@gmail.com>
Date: Tue, 24 Sep 2024 16:16:02 -0400
Subject: [PATCH] Refactor build script for Linux and MacOS in CI workflow

---
 examples/CMakeLists.txt                       |  4 ++++
 examples/audio_capture.cpp                    |  2 ++
 examples/audio_capture.h                      |  1 -
 examples/realtime_transcription.cpp           |  4 +++-
 scripts/build-windows.ps1                     |  6 ++++-
 .../transcription/include/transcription.h     |  3 +++
 .../include/whisper-processing.h              |  4 +++-
 .../transcription/src/transcription.cpp       | 23 ++++++++++++++++---
 .../transcription/src/whisper-processing.cpp  |  8 +++----
 9 files changed, 43 insertions(+), 12 deletions(-)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0100938..daa9260 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,3 +3,7 @@ include(${CMAKE_SOURCE_DIR}/cmake/BuildSDL.cmake)
 
 add_executable(RealtimeTranscription realtime_transcription.cpp audio_capture.cpp)
 target_link_libraries(RealtimeTranscription PRIVATE SDL2 Core Transcription)
+
+# Convenience target: building "examples" builds each example executable named in add_dependencies below.
+add_custom_target(examples)
+add_dependencies(examples RealtimeTranscription)
diff --git a/examples/audio_capture.cpp b/examples/audio_capture.cpp
index 0d18470..dfe99ee 100644
--- a/examples/audio_capture.cpp
+++ b/examples/audio_capture.cpp
@@ -12,6 +12,8 @@ AudioCapture::~AudioCapture() {
 }
 
 bool AudioCapture::initialize(int device_index, int requested_sample_rate) {
+    SDL_InitSubSystem(SDL_INIT_AUDIO);
+
     SDL_AudioSpec desired_spec, obtained_spec;
 
     SDL_zero(desired_spec);
diff --git a/examples/audio_capture.h b/examples/audio_capture.h
index fd67cd3..621edde 100644
--- a/examples/audio_capture.h
+++ b/examples/audio_capture.h
@@ -1,6 +1,5 @@
 #pragma once
 
-#define SDL_MAIN_HANDLED
 #include <SDL.h>
 
 #include <atomic>
diff --git a/examples/realtime_transcription.cpp b/examples/realtime_transcription.cpp
index 9cee776..7f6c96e 100644
--- a/examples/realtime_transcription.cpp
+++ b/examples/realtime_transcription.cpp
@@ -1,11 +1,13 @@
 #include <locaal.h>
+
+#define SDL_MAIN_HANDLED
 #include "audio_capture.h"
 
 #include <iostream>
 
 int main()
 {
-	SDL_SetMainReady();
+	// SDL_SetMainReady();
 
 	// Initialize the library
 	locaal::Transcription tt;
diff --git a/scripts/build-windows.ps1 b/scripts/build-windows.ps1
index 4984202..49fba7c 100644
--- a/scripts/build-windows.ps1
+++ b/scripts/build-windows.ps1
@@ -1,6 +1,7 @@
 param(
     [switch]$Verbose,
-    [switch]$Clean
+    [switch]$Clean,
+    [switch]$Examples
 )
 
 $verboseFlag = ""
@@ -33,5 +34,8 @@ Invoke-Expression $configureCommand
 
 # Build step
 $buildCommand = "cmake --build $buildDir --config Release $verboseBuildFlag"
+if ($Examples) { # -Examples: ask cmake to build the "examples" target rather than the default target
+    $buildCommand += " --target examples"
+}
 Write-Host "Executing build command: $buildCommand"
 Invoke-Expression $buildCommand
diff --git a/src/modules/transcription/include/transcription.h b/src/modules/transcription/include/transcription.h
index 4daa89a..e7e5632 100644
--- a/src/modules/transcription/include/transcription.h
+++ b/src/modules/transcription/include/transcription.h
@@ -4,6 +4,8 @@
 #include <string>
 #include <functional>
 
+struct transcription_context;
+
 namespace locaal {
 
 struct TranscriptionResult {
@@ -41,6 +43,7 @@ class Transcription {
 	std::function<void(const TranscriptionResult &)> transcriptionCallback_;
 
 	// Add any other necessary private members
+	transcription_context *gf;
 };
 
 } // namespace locaal
diff --git a/src/modules/transcription/include/whisper-processing.h b/src/modules/transcription/include/whisper-processing.h
index ef645d7..0208f38 100644
--- a/src/modules/transcription/include/whisper-processing.h
+++ b/src/modules/transcription/include/whisper-processing.h
@@ -29,7 +29,9 @@ struct DetectionResultWithText {
 	std::string language;
 };
 
-void whisper_loop(void *data);
+struct transcription_context;
+
+void whisper_loop(struct transcription_context *gf);
 struct whisper_context *init_whisper_context(const std::string &model_path,
 					     struct transcription_context *gf);
 void run_inference_and_callbacks(transcription_context *gf, uint64_t start_offset_ms,
diff --git a/src/modules/transcription/src/transcription.cpp b/src/modules/transcription/src/transcription.cpp
index 92652ee..fc6022e 100644
--- a/src/modules/transcription/src/transcription.cpp
+++ b/src/modules/transcription/src/transcription.cpp
@@ -1,17 +1,34 @@
 #include "transcription.h"
 #include "logger.h"
+#include "transcription-context.h"
+#include "whisper-utils.h"
+
 #include <iostream>
 
+void set_text_callback(struct transcription_context *gf, const DetectionResultWithText &str)
+{ // Logs the text of `str` at INFO level; `gf` is not used by this implementation.
+	Logger::log(Logger::Level::INFO, "Transcription: %s", str.text.c_str());
+}
+
+void clear_current_caption(transcription_context *gf_) {} // no-op stub: empty body, gf_ is not used
+
+// Callback sent when the VAD finds an audio chunk. Sample rate = WHISPER_SAMPLE_RATE, channels = 1.
+// The audio chunk is in 32-bit float format. This implementation is a no-op (empty body).
+void audio_chunk_callback(struct transcription_context *gf, const std::vector<float> pcm32f_data,
+			  int vad_state, const DetectionResultWithText &result) {}
+
 namespace locaal {
 
 Transcription::Transcription()
 {
 	// Constructor implementation
+	gf = new transcription_context(); // heap-allocated context; the destructor deletes it
 }
 
 Transcription::~Transcription()
 {
 	// Destructor implementation
+	delete gf; // NOTE(review): no shutdown_whisper_thread() call here — confirm the thread is stopped before gf is freed
 }
 
 void Transcription::setTranscriptionParams(const std::string &language)
@@ -41,15 +58,15 @@ void Transcription::setTranscriptionCallback(
 void Transcription::startTranscription()
 {
 	Logger::log(Logger::Level::INFO, "Starting transcription...");
-	// Implement the logic to start the transcription process
-	// This might involve starting a new thread, initializing audio capture, etc.
+	// Start the transcription thread. NOTE(review): both string arguments are hard-coded literals — confirm they are placeholders.
+	start_whisper_thread_with_path(this->gf, "en", "silero_vad_model_file");
 }
 
 void Transcription::stopTranscription()
 {
 	Logger::log(Logger::Level::INFO, "Stopping transcription...");
 	// Implement the logic to stop the transcription process
-	// This might involve stopping the transcription thread, cleaning up resources, etc.
+	shutdown_whisper_thread(this->gf); // presumably the counterpart of start_whisper_thread_with_path — TODO confirm
 }
 
 void Transcription::processAudio(const std::vector<float> &audioData)
diff --git a/src/modules/transcription/src/whisper-processing.cpp b/src/modules/transcription/src/whisper-processing.cpp
index 9db0e6d..e53b490 100644
--- a/src/modules/transcription/src/whisper-processing.cpp
+++ b/src/modules/transcription/src/whisper-processing.cpp
@@ -330,15 +330,13 @@ void run_inference_and_callbacks(transcription_context *gf, uint64_t start_offse
 	}
 }
 
-void whisper_loop(void *data)
+void whisper_loop(transcription_context *gf)
 {
-	if (data == nullptr) {
-		Logger::log(Logger::Level::ERROR_LOG, "whisper_loop: data is null");
+	if (gf == nullptr) {
+		Logger::log(Logger::Level::ERROR_LOG, "whisper_loop: context is null");
 		return;
 	}
 
-	struct transcription_context *gf = static_cast<struct transcription_context *>(data);
-
 	Logger::log(gf->log_level, "Starting whisper thread");
 
 	vad_state current_vad_state = {false, now_ms(), 0, 0};