From 9bbafe867128ae2f833d7ce495fc565967987156 Mon Sep 17 00:00:00 2001 From: Malte Lantin Date: Tue, 17 Sep 2019 23:01:28 +0200 Subject: [PATCH] Add project source code --- .classpath | 27 +++ .gitignore | 61 +++++++ .project | 23 +++ README.md | 17 +- pom.xml | 30 ++++ .../speech/samples/ogg/App.java | 37 ++++ .../speech/samples/ogg/DataPipe.java | 56 ++++++ .../speech/samples/ogg/OggToTextService.java | 165 ++++++++++++++++++ .../speech/samples/ogg/WavStream.java | 150 ++++++++++++++++ 9 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 .classpath create mode 100644 .project create mode 100644 pom.xml create mode 100644 src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/App.java create mode 100644 src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/DataPipe.java create mode 100644 src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/OggToTextService.java create mode 100644 src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/WavStream.java diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..ebde520 --- /dev/null +++ b/.classpath @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/.gitignore b/.gitignore index a1c2a23..d08cbd8 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,64 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* +/target/ + + + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# From https://github.com/github/gitignore/blob/master/Global/Eclipse.gitignore + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) 
+.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet diff --git a/.project b/.project new file mode 100644 index 0000000..bc4b92b --- /dev/null +++ b/.project @@ -0,0 +1,23 @@ + + + java-ogg-to-ms-speech + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + diff --git a/README.md b/README.md index 35c733f..56ebd52 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,15 @@ -# java-ogg-to-ms-speech -This project demonstrates how use ffmpeg to convert .ogg files (Vorbis and Opus) to the right format for Speech-to-Text transcription using the Microsoft Cognitive Services Speech Service. +# Transcribe .ogg speech files with the Microsoft Speech Java SDK +This project demonstrates how to use ffmpeg to convert .ogg files (Vorbis and Opus) to the right format for Speech-to-Text transcription using the Microsoft Cognitive Services Speech Service. This could be used to transcribe voice messages encoded using the Opus (https://en.wikipedia.org/wiki/Opus_(audio_format)) codec or other codecs using the .ogg container format. + +To make this sample work, you need the [Cognitive Services Speech Service Java SDK](https://docs.microsoft.com/en-us/java/api/com.microsoft.cognitiveservices.speech?view=azure-java-stable), which has already been added to the pom file. 
+```java +public final static String MS_SPEECH_KEY = "your-microsoft-speech-key"; +public final static String MS_SPEECH_REGION = "westeurope"; +public final static String MS_SPEECH_RECOGNITION_LANG = "de-de"; +``` + +You also need to download [ffmpeg](https://ffmpeg.org/), which is used for transcoding, and set the right path to it in the source. An audio file can be read from disk or passed as a byte array. It will then, in memory, be transcoded to wav / pcm format for transcription using the Cognitive Services Speech Service. + +Also check out the [Microsoft Speech SDK Sample Repository](https://github.com/Azure-Samples/cognitive-services-speech-sdk) to learn more and use more of its functionality. + +Thank you [@chgeuer](https://github.com/chgeuer) for your contributions. diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..a064027 --- /dev/null +++ b/pom.xml @@ -0,0 +1,30 @@ + + 4.0.0 + + malantin + ogg + 0.0.1-SNAPSHOT + jar + + ogg + http://maven.apache.org + + + UTF-8 + + + + maven-cognitiveservices-speech + Microsoft Cognitive Services Speech Maven Repository + https://csspeechstorage.blob.core.windows.net/maven/ + + + + + com.microsoft.cognitiveservices.speech + client-sdk + 1.6.0 + + + diff --git a/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/App.java b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/App.java new file mode 100644 index 0000000..e19c134 --- /dev/null +++ b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/App.java @@ -0,0 +1,37 @@ +//MIT License +// +//Copyright (c) Microsoft Corporation. All rights reserved. 
+// +//Permission is hereby granted, free of charge, to any person obtaining a copy +//of this software and associated documentation files (the "Software"), to deal +//in the Software without restriction, including without limitation the rights +//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +//copies of the Software, and to permit persons to whom the Software is +//furnished to do so, subject to the following conditions: +// +//The above copyright notice and this permission notice shall be included in all +//copies or substantial portions of the Software. +// +//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +//SOFTWARE + +package com.microsoft.cognitiveservices.speech.samples.ogg; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +public class App { + public static void main(String[] args) throws IOException, InterruptedException, ExecutionException { + // Test with local file + String filePath = "test.ogg"; + + OggToTextService service = new OggToTextService(); + + System.out.println(service.transcribeOGGFile(filePath)); + } +} diff --git a/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/DataPipe.java b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/DataPipe.java new file mode 100644 index 0000000..ceb8685 --- /dev/null +++ b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/DataPipe.java @@ -0,0 +1,56 @@ +//MIT License +// +//Copyright (c) Microsoft Corporation. All rights reserved. 
/**
 * Copies every byte from an {@link InputStream} to an {@link OutputStream} and
 * then closes the output stream. Intended to be run on its own thread (see
 * {@link #start(InputStream, OutputStream)}).
 */
public class DataPipe implements Runnable {

    /** Size of the intermediate copy buffer, in bytes. */
    private static final int BUFFER_SIZE = 1024 * 1024;

    private final InputStream is;
    private final OutputStream os;

    /**
     * @param is stream to read from; it is not closed by this class
     * @param os stream to write to; it is closed when {@link #run()} finishes
     */
    DataPipe(InputStream is, OutputStream os) {
        this.is = is;
        this.os = os;
    }

    /**
     * Copies the input to the output until the input reports end of stream.
     *
     * <p>The output stream is closed whether the copy succeeds or fails, so a
     * consumer waiting for end-of-input on the other side is always released.
     * An {@link IOException} during the copy or the close is reported on
     * {@code System.err} and otherwise ignored.
     */
    @Override
    public void run() {
        byte[] buffer = new byte[BUFFER_SIZE];
        // try-with-resources closes the output on the failure path as well.
        try (OutputStream out = this.os) {
            int numRead;
            while ((numRead = this.is.read(buffer)) != -1) {
                out.write(buffer, 0, numRead);
            }
        } catch (IOException ioe) {
            System.err.println(ioe.getMessage());
        }
    }

    /**
     * Starts a new thread that pipes {@code is} into {@code os}.
     *
     * @param is stream to read from
     * @param os stream to write to; closed when the copy ends
     * @return the already started thread, so the caller can {@code join()} it
     */
    public static Thread start(InputStream is, OutputStream os) {
        Thread t = new Thread(new DataPipe(is, os));
        t.start();
        return t;
    }
}
b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/OggToTextService.java new file mode 100644 index 0000000..3152c79 --- /dev/null +++ b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/OggToTextService.java @@ -0,0 +1,165 @@ +//MIT License +// +//Copyright (c) Microsoft Corporation. All rights reserved. +// +//Permission is hereby granted, free of charge, to any person obtaining a copy +//of this software and associated documentation files (the "Software"), to deal +//in the Software without restriction, including without limitation the rights +//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +//copies of the Software, and to permit persons to whom the Software is +//furnished to do so, subject to the following conditions: +// +//The above copyright notice and this permission notice shall be included in all +//copies or substantial portions of the Software. +// +//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +//SOFTWARE + +package com.microsoft.cognitiveservices.speech.samples.ogg; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; + +import com.microsoft.cognitiveservices.speech.CancellationDetails; +import com.microsoft.cognitiveservices.speech.CancellationReason; +import com.microsoft.cognitiveservices.speech.ResultReason; +import com.microsoft.cognitiveservices.speech.SpeechConfig; +import com.microsoft.cognitiveservices.speech.SpeechRecognitionResult; +import com.microsoft.cognitiveservices.speech.SpeechRecognizer; +import com.microsoft.cognitiveservices.speech.audio.AudioConfig; +import com.microsoft.cognitiveservices.speech.audio.PullAudioInputStreamCallback; + + +/** + * @author malantin + * Transcode ogg (Vorbis and Opus) files and transcribe using Microsoft Speech Service + * + */ +public class OggToTextService { + + public final static String MS_SPEECH_KEY = "your-microsoft-speech-key"; + public final static String MS_SPEECH_REGION = "westeurope"; + public final static String MS_SPEECH_RECOGNITION_LANG = "de-de"; + + public String transcribeOGGFile(String path) throws IOException, InterruptedException, ExecutionException { + File inputFile = new File(path); + + byte[] fileContent = Files.readAllBytes(inputFile.toPath()); + + return transcribeOGGFileFromByteArray(fileContent); + } + + public String transcribeOGGFileFromByteArray(byte[] media) + throws InterruptedException, IOException, ExecutionException { + + // Create ffmpeg process to decode from opus to 
wave + // Learn more and get ffmpeg from https://ffmpeg.org/ + // Legal information on ffmpeg: https://ffmpeg.org/legal.html + // Download ffmpeg and set the path here + // ffmpeg we decode opus from stdin and output wave to stdout + Process process = Runtime.getRuntime().exec("ffmpeg.exe -i - -c:a pcm_s16le -ar 16000 -ac 1 -f wav - "); + + InputStream oggStream = new ByteArrayInputStream(media); + // Get stdin for ffmpeg process + OutputStream processStdinStream = process.getOutputStream(); + // Start moving bytes from our input to stdin + Thread threadIn = DataPipe.start(oggStream, processStdinStream); + + // Get stdin for ffmpeg process + InputStream processStdoutStream = process.getInputStream(); + ByteArrayOutputStream wavStream = new ByteArrayOutputStream(); + // Start moving bytes from from stdout to our output stream + Thread threadOut = DataPipe.start(processStdoutStream, wavStream); + + // Get error stream for ffmpeg process + InputStream processStderrStream = process.getErrorStream(); + ByteArrayOutputStream loggingStream = new ByteArrayOutputStream(); + // Start moving bytes from error stream + Thread threadErr = DataPipe.start(processStderrStream, loggingStream); + + // Wait for threads to finish + threadIn.join(); + int exitCode = process.waitFor(); + threadOut.join(); + threadErr.join(); + + // Fix header of the wave file as ffmpeg misses to write parts of the + // information when converting from stdin to stdout + byte[] wavByteArray = wavStream.toByteArray(); + + long l1 = wavByteArray.length - 8; + wavByteArray[4] = (byte) ((l1 >> 0) & 0xff); + wavByteArray[5] = (byte) ((l1 >> 8) & 0xff); + wavByteArray[6] = (byte) ((l1 >> 16) & 0xff); + wavByteArray[7] = (byte) ((l1 >> 24) & 0xff); + + long l2 = wavByteArray.length - 78; + wavByteArray[0x4a] = (byte) ((l2 >> 0) & 0xff); + wavByteArray[0x4b] = (byte) ((l2 >> 8) & 0xff); + wavByteArray[0x4c] = (byte) ((l2 >> 16) & 0xff); + wavByteArray[0x4d] = (byte) ((l2 >> 24) & 0xff); + + 
System.out.println("Exit Code: " + exitCode); + System.out.println(new String(loggingStream.toByteArray())); + + String recognizedText = ""; + + // Set Microsoft Speech subscription key and region, see Speech SDK documentation for more information + // https://docs.microsoft.com/en-us/java/api/com.microsoft.cognitiveservices.speech?view=azure-java-stable + SpeechConfig config = SpeechConfig.fromSubscription(MS_SPEECH_KEY, MS_SPEECH_REGION); + config.setSpeechRecognitionLanguage(MS_SPEECH_RECOGNITION_LANG); + + ByteArrayInputStream waveByteArrayInput = new ByteArrayInputStream(wavByteArray); + + PullAudioInputStreamCallback callback = new WavStream(waveByteArrayInput); + AudioConfig audioInput = AudioConfig.fromStreamInput(callback); + + SpeechRecognizer recognizer = new SpeechRecognizer(config, audioInput); + + // Do one time recognition, might fail for longer speech files, see SDK documentation + Future task = recognizer.recognizeOnceAsync(); + + assert (task != null); + + System.out.println(String.format("Sending file to Microsoft Speech for transcription to %s", + config.getSpeechRecognitionLanguage())); + + SpeechRecognitionResult result; + + result = task.get(); + assert (result != null); + + if (result.getReason() == ResultReason.RecognizedSpeech) { + recognizedText = result.getText(); + System.out.println("Recognized Speech: Transcription was successful."); + } else if (result.getReason() == ResultReason.NoMatch) { + System.out.println("No Match: Speech could not be recognized."); + } else if (result.getReason() == ResultReason.Canceled) { + CancellationDetails cancellation = CancellationDetails.fromResult(result); + System.out.println("Canceled: Reason=" + cancellation.getReason()); + + if (cancellation.getReason() == CancellationReason.Error) { + System.out.println("Canceled: ErrorCode=" + cancellation.getErrorCode()); + System.out.println("Canceled: ErrorDetails=" + cancellation.getErrorDetails()); + System.out.println("Canceled: Did you update the 
subscription info?"); + } + } + + recognizer.close(); + audioInput.close(); + + return recognizedText; + } +} diff --git a/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/WavStream.java b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/WavStream.java new file mode 100644 index 0000000..f130036 --- /dev/null +++ b/src/main/java/com/microsoft/cognitiveservices/speech/samples/ogg/WavStream.java @@ -0,0 +1,150 @@ +package com.microsoft.cognitiveservices.speech.samples.ogg; + +// https://github.com/Azure-Samples/cognitive-services-speech-sdk/blob/master/samples/java/jre/console/src/com/microsoft/cognitiveservices/speech/samples/console/WavStream.java + +//MIT License +// +//Copyright (c) Microsoft Corporation. All rights reserved. +// +//Permission is hereby granted, free of charge, to any person obtaining a copy +//of this software and associated documentation files (the "Software"), to deal +//in the Software without restriction, including without limitation the rights +//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +//copies of the Software, and to permit persons to whom the Software is +//furnished to do so, subject to the following conditions: +// +//The above copyright notice and this permission notice shall be included in all +//copies or substantial portions of the Software. +// +//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +//SOFTWARE + +import com.microsoft.cognitiveservices.speech.audio.PullAudioInputStreamCallback; + +import java.io.IOException; +import java.io.InputStream; + +public class WavStream extends PullAudioInputStreamCallback { + private final InputStream stream; + + public WavStream(InputStream wavStream) { + try { + this.stream = parseWavHeader(wavStream); + } catch (Exception ex) { + throw new IllegalArgumentException(ex.getMessage()); + } + } + + @Override + public int read(byte[] dataBuffer) { + long ret = 0; + + try { + ret = this.stream.read(dataBuffer, 0, dataBuffer.length); + } catch (Exception ex) { + System.out.println("Read " + ex); + } + + return (int)Math.max(0, ret); + } + + @Override + public void close() { + try { + this.stream.close(); + } catch (IOException ex) { + // ignored + } + } + // endregion + + // region Wav File helper functions + private int ReadInt32(InputStream inputStream) throws IOException { + int n = 0; + for (int i = 0; i < 4; i++) { + n |= inputStream.read() << (i * 8); + } + return n; + } + + private long ReadUInt32(InputStream inputStream) throws IOException { + long n = 0; + for (int i = 0; i < 4; i++) { + n |= inputStream.read() << (i * 8); + } + return n; + } + + private int ReadUInt16(InputStream inputStream) throws IOException { + int n = 0; + for (int i = 0; i < 2; i++) { + n |= inputStream.read() << (i * 8); + } + return n; + } + + public InputStream parseWavHeader(InputStream reader) throws IOException { + // Note: assumption about order of chunks + // Tag "RIFF" + byte data[] = new byte[4]; + int numRead = reader.read(data, 0, 4); + ThrowIfFalse((numRead == 4) && (data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "RIFF"); + + // Chunk size + /* 
int fileLength = */ReadInt32(reader); + + // Subchunk, Wave Header + // Subchunk, Format + // Tag: "WAVE" + numRead = reader.read(data, 0, 4); + ThrowIfFalse((numRead == 4) && (data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "WAVE"); + + // Tag: "fmt" + numRead = reader.read(data, 0, 4); + ThrowIfFalse((numRead == 4) && (data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "fmt "); + + // chunk format size + long formatSize = ReadInt32(reader); + ThrowIfFalse(formatSize >= 16, "formatSize"); + + int formatTag = ReadUInt16(reader); + int channels = ReadUInt16(reader); + int samplesPerSec = (int) ReadUInt32(reader); + int avgBytesPerSec = (int) ReadUInt32(reader); + int blockAlign = ReadUInt16(reader); + int bitsPerSample = ReadUInt16(reader); + ThrowIfFalse(formatTag == 1, "PCM"); // PCM + ThrowIfFalse(channels == 1, "single channel"); + ThrowIfFalse(samplesPerSec == 16000, "samples per second"); + ThrowIfFalse(bitsPerSample == 16, "bits per sample"); + + // Until now we have read 16 bytes in format, the rest is cbSize and is ignored + // for now. + if (formatSize > 16) { + numRead = reader.read(new byte[(int) (formatSize - 16)]); + ThrowIfFalse(numRead == (int)(formatSize - 16), "could not skip extended format"); + } + + // Second Chunk, data + // tag: data. + numRead = reader.read(data, 0, 4); + ThrowIfFalse((numRead == 4) && ((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a')) || ((data[0] == 'L') && (data[1] == 'I') && (data[2] == 'S') && (data[3] == 'T')), "data/list"); + + // data chunk size + // Note: assumption is that only a single data chunk + /* int dataLength = */ReadInt32(reader); + return reader; + } + + private static void ThrowIfFalse(Boolean condition, String message) { + if (!condition) { + throw new IllegalArgumentException(message); + } + } + // endregion +}