diff --git a/non_semantic_speech_benchmark/README.md b/non_semantic_speech_benchmark/README.md
index 8cc39ea84e84..866430aaf770 100644
--- a/non_semantic_speech_benchmark/README.md
+++ b/non_semantic_speech_benchmark/README.md
@@ -1,4 +1,4 @@
-# Towards Learning a Universal Non-Semantic Representation of Speech (WORK IN PROGRESS)
+# Towards Learning a Universal Non-Semantic Representation of Speech
 
 Paper: [Towards Learning a Universal Non-Semantic Representation of Speech](https://arxiv.org/abs/2002.12764)
 
@@ -28,6 +28,9 @@ To use this benchmark or embeddings, please cite as follows:
 
 ## Overview
 
+![Data flowchart](images/data_flowchart.png "Data flowchart"){width="500"}
+
+![Embedding flowchart](images/embedding_flowchart.png "Embedding flowchart"){width="400"}
-![Eval model flowchart](images/eval_moodel_flowchart.png "Eval model flowchart"){width="400"}
+![Eval model flowchart](images/eval_model_flowchart.png "Eval model flowchart"){width="400"}
 
 ## Detailed Instructions
diff --git a/non_semantic_speech_benchmark/images/data_flowchart.png b/non_semantic_speech_benchmark/images/data_flowchart.png
new file mode 100644
index 000000000000..3dcba6397332
Binary files /dev/null and b/non_semantic_speech_benchmark/images/data_flowchart.png differ
diff --git a/non_semantic_speech_benchmark/images/embedding_flowchart.png b/non_semantic_speech_benchmark/images/embedding_flowchart.png
new file mode 100644
index 000000000000..9ca449755498
Binary files /dev/null and b/non_semantic_speech_benchmark/images/embedding_flowchart.png differ
diff --git a/non_semantic_speech_benchmark/images/eval_model_flowchart.png b/non_semantic_speech_benchmark/images/eval_model_flowchart.png
new file mode 100644
index 000000000000..6fd9d6bf2171
Binary files /dev/null and b/non_semantic_speech_benchmark/images/eval_model_flowchart.png differ
diff --git a/non_semantic_speech_benchmark/train_and_eval_sklearn_small_tfds_dataset.ipynb b/non_semantic_speech_benchmark/train_and_eval_sklearn_small_tfds_dataset.ipynb
new file mode 100644
index 000000000000..d510e76df9ee
--- /dev/null
+++ b/non_semantic_speech_benchmark/train_and_eval_sklearn_small_tfds_dataset.ipynb
@@ -0,0 +1,304 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Q59kntpEjeax"
+      },
+      "source": [
+        "##### Copyright 2019 Google LLC.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\");"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "9fhjU0f_ja9h"
+      },
+      "outputs": [],
+      "source": [
+        "#@title Default title text\n",
+        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "4RPI9UwajiPI"
+      },
+      "source": [
+        "##### Full-flow evaluation\n",
+        "\n",
+        "A colab for testing the full flow of computing embeddings and training/evaluating sklearn models. Since this notebook doesn't parallelize (like the Apache Beam tools do) and computing embeddings is computationally expensive, **please use the multi-step beam-based tools if** you'd like to eval a large dataset, eval a custom dataset, or train a Keras model.\n",
+        "\n",
+        "Please be sure to use a **Python 3** kernel. **Running on GPU** significantly speeds up the process as well.\n",
+        "\n",
+        "Conceptual overview of this colab:\n",
+        "\n",
+        "1. Read `TensorFlow Dataset` data as numpy\n",
+        "1. Convert audio to float and resample\n",
+        "1. Convert audio to embeddings\n",
+        "1. Train and eval sklearn model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "dKUuSLehjke7"
+      },
+      "outputs": [],
+      "source": [
+        "tfds_dataset_name = 'savee' #@param\n",
+        "REQUIRED_SAMPLE_RATE_ = 16000"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "gQvwTuBjjnRZ"
+      },
+      "outputs": [],
+      "source": [
+        "# Read the data into numpy arrays.\n",
+        "import collections\n",
+        "import numpy as np\n",
+        "SingleSplit = collections.namedtuple(\n",
+        "    'SingleSplit', ['audio', 'labels', 'speaker_id'])\n",
+        "Data = collections.namedtuple(\n",
+        "    'Data', ['train', 'validation', 'test'])\n",
+        "\n",
+        "import tensorflow.compat.v2 as tf\n",
+        "tf.enable_v2_behavior()\n",
+        "assert tf.executing_eagerly()\n",
+        "import tensorflow_datasets as tfds\n",
+        "def _dat_from_split(split):\n",
+        "  np_generator = tfds.as_numpy(tfds.load(tfds_dataset_name, split=split))\n",
+        "  dat = [(x['audio'], x['label'], x['speaker_id']) for x in np_generator]\n",
+        "  audio, labels, speaker_id = zip(*dat)\n",
+        "\n",
+        "  labels = np.array(labels, dtype=np.int16)\n",
+        "  speaker_id = np.array(speaker_id)\n",
+        "  assert len(audio) == labels.size == speaker_id.size\n",
+        "  assert labels.ndim == speaker_id.ndim == 1\n",
+        "  print(f'Finished {split}')\n",
+        "  return audio, labels, speaker_id\n",
+        "\n",
+        "all_data = Data(\n",
+        "    train=SingleSplit(*_dat_from_split('train')),\n",
+        "    validation=SingleSplit(*_dat_from_split('validation')),\n",
+        "    test=SingleSplit(*_dat_from_split('test')))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "LhL0uSMjjpQN"
+      },
+      "outputs": [],
+      "source": [
+        "# Make the audio floats, and resample the audio if necessary.\n",
+        "import collections\n",
+        "import librosa\n",
+        "import numpy as np\n",
+        "FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])\n",
+        "\n",
+        "sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate\n",
+        "def _int_to_float(audio_int16, split_name):\n",
+        "  float_audio_16k = []\n",
+        "  for i, samples in enumerate(audio_int16):\n",
+        "    float_audio = samples.astype(np.float32) / np.iinfo(np.int16).max\n",
+        "    if sample_rate != REQUIRED_SAMPLE_RATE_:\n",
+        "      float_audio = librosa.core.resample(\n",
+        "          float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,\n",
+        "          res_type='kaiser_best')\n",
+        "    float_audio_16k.append(float_audio)\n",
+        "    if i % 50 == 0:\n",
+        "      print(f'Finished resampling {i} / {len(audio_int16)} for {split_name}')\n",
+        "  print(f'Finished {split_name}')\n",
+        "  return float_audio_16k\n",
+        "\n",
+        "\n",
+        "float_audio_16k = FloatData(\n",
+        "    train=_int_to_float(all_data.train.audio, 'train'),\n",
+        "    validation=_int_to_float(all_data.validation.audio, 'validation'),\n",
+        "    test=_int_to_float(all_data.test.audio, 'test'))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "CIb4stZ3jqsF"
+      },
+      "outputs": [],
+      "source": [
+        "tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1' #@param\n",
+        "output_key = 'embedding' #@param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "9SAsuvE5jrsN"
+      },
+      "outputs": [],
+      "source": [
+        "# Convert the audio to embeddings. Preaverage the embeddings across time.\n",
+        "import tensorflow_hub as hub\n",
+        "model = hub.load(tfhub_model_name)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "5UtmnSGPjt1k"
+      },
+      "outputs": [],
+      "source": [
+        "import collections\n",
+        "Embeddings = collections.namedtuple(\n",
+        "    'Embeddings', ['train', 'validation', 'test'])\n",
+        "\n",
+        "def _calc_embeddings(cur_float_audio, split_name):\n",
+        "  cur_embeddings = []\n",
+        "  for i, float_samples in enumerate(cur_float_audio):\n",
+        "    tf_out = model(tf.constant(float_samples, tf.float32),\n",
+        "                   tf.constant(16000, tf.int32))\n",
+        "    embedding_2d = tf_out[output_key]\n",
+        "    assert embedding_2d.ndim == 2\n",
+        "    embedding_1d = np.mean(embedding_2d, axis=0)\n",
+        "    cur_embeddings.append(embedding_1d)\n",
+        "    if i % 50 == 0:\n",
+        "      print(f'Finished embedding {i} / {len(cur_float_audio)} for {split_name}')\n",
+        "  print(f'Finished {split_name}')\n",
+        "  cur_embeddings = np.array(cur_embeddings, dtype=np.float32)\n",
+        "  return cur_embeddings\n",
+        "\n",
+        "embeddings = Embeddings(\n",
+        "    train=_calc_embeddings(float_audio_16k.train, 'train'),\n",
+        "    validation=_calc_embeddings(float_audio_16k.validation, 'validation'),\n",
+        "    test=_calc_embeddings(float_audio_16k.test, 'test'))\n",
+        "assert embeddings.train.shape[1] == embeddings.validation.shape[1] == embeddings.test.shape[1]\n",
+        "assert embeddings.train.shape[0] == all_data.train.labels.shape[0] == all_data.train.speaker_id.shape[0]\n",
+        "assert embeddings.validation.shape[0] == all_data.validation.labels.shape[0] == all_data.validation.speaker_id.shape[0]\n",
+        "assert embeddings.test.shape[0] == all_data.test.labels.shape[0] == all_data.test.speaker_id.shape[0]\n",
+        "assert not np.isnan(embeddings.train).any()\n",
+        "assert not np.isnan(embeddings.validation).any()\n",
+        "assert not np.isnan(embeddings.test).any()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "r6BTvIGQjwt3"
+      },
+      "outputs": [],
+      "source": [
+        "model_name = 'LogisticRegression_balanced' #@param"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {},
+        "colab_type": "code",
+        "id": "BVVFxPrcjyak"
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn import linear_model\n",
+        "\n",
+        "def get_sklearn_model(model_name):\n",
+        "  return {\n",
+        "      'LogisticRegression': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),\n",
+        "      'LogisticRegression_balanced': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial', class_weight='balanced'),\n",
+        "  }[model_name]()\n",
+        "\n",
+        "def _speaker_normalization(embedding_np, speaker_id_np):\n",
+        "  \"\"\"Normalize embedding features by per-speaker statistics.\"\"\"\n",
+        "  all_speaker_ids = np.unique(speaker_id_np)\n",
+        "  for speaker in all_speaker_ids:\n",
+        "    cur_i = speaker_id_np == speaker\n",
+        "    embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)\n",
+        "    stds = embedding_np[cur_i].std(axis=0)\n",
+        "    stds[stds == 0] = 1\n",
+        "    embedding_np[cur_i] /= stds\n",
+        "\n",
+        "  return embedding_np\n",
+        "\n",
+        "# Train models.\n",
+        "d = get_sklearn_model(model_name)\n",
+        "normalized_train = _speaker_normalization(\n",
+        "    embeddings.train, all_data.train.speaker_id)\n",
+        "d.fit(normalized_train, all_data.train.labels)\n",
+        "\n",
+        "# Eval.\n",
+        "normalized_validation = _speaker_normalization(\n",
+        "    embeddings.validation, all_data.validation.speaker_id)\n",
+        "eval_score = d.score(normalized_validation, all_data.validation.labels)\n",
+        "print(f'{model_name} eval score: {eval_score}')\n",
+        "\n",
+        "# Test.\n",
+        "normalized_test = _speaker_normalization(\n",
+        "    embeddings.test, all_data.test.speaker_id)\n",
+        "test_score = d.score(normalized_test, all_data.test.labels)\n",
+        "print(f'{model_name} test score: {test_score}')"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "name": "Train and eval sklearn small TFDS dataset",
+      "provenance": [
+        {
+          "file_id": "1UE-jDSsEQ0qRvxtw_aRK2k4dJwytUdSk",
+          "timestamp": 1587402478951
+        }
+      ]
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
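
For orientation, the notebook's pipeline condensed into a stand-alone script: load the trill-distilled module from TF-Hub (the default `tfhub_model_name` in the notebook), time-average the `embedding` output per clip, and fit a scikit-learn logistic regression. This is a minimal sketch, not part of the patch; `wav_int16` and `labels` are hypothetical stand-ins for the TFDS splits the notebook loads.

```python
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import linear_model

# Same module and output key the notebook uses by default.
module = hub.load(
    'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1')

def clip_embedding(samples_int16, sample_rate=16000):
  """Returns one time-averaged TRILL embedding for an int16 waveform."""
  # int16 -> float32 in [-1, 1], as in the notebook's resampling cell.
  floats = samples_int16.astype(np.float32) / np.iinfo(np.int16).max
  outputs = module(tf.constant(floats, tf.float32),
                   tf.constant(sample_rate, tf.int32))
  return np.mean(outputs['embedding'], axis=0)  # average across time frames

# Hypothetical stand-in data: four 1-second random clips, binary labels.
wav_int16 = [np.random.randint(-2**15, 2**15, 16000, dtype=np.int16)
             for _ in range(4)]
labels = np.array([0, 1, 0, 1])

features = np.stack([clip_embedding(w) for w in wav_int16])
clf = linear_model.LogisticRegression(
    C=1e5, solver='lbfgs', multi_class='multinomial', class_weight='balanced')
clf.fit(features, labels)
print('train accuracy:', clf.score(features, labels))
```

The notebook additionally applies per-speaker normalization (`_speaker_normalization`) to the features before fitting and scores on held-out validation and test splits; the sketch omits those steps for brevity.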