Skip to content

Commit

Permalink
Open source colabs that run all-in-one training section.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 307532498
  • Loading branch information
joel-shor authored and copybara-github committed Apr 21, 2020
1 parent 2f8d5e7 commit c71c969
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 1 deletion.
5 changes: 4 additions & 1 deletion non_semantic_speech_benchmark/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Towards Learning a Universal Non-Semantic Representation of Speech (WORK IN PROGRESS)
# Towards Learning a Universal Non-Semantic Representation of Speech

Paper: [Towards Learning a Universal Non-Semantic Representation of Speech](https://arxiv.org/abs/2002.12764)

Expand Down Expand Up @@ -28,6 +28,9 @@ To use this benchmark or embeddings, please cite as follows:

## Overview

![Data flowchart](images/data_flowchart.png "Data flowchart"){width="500"}

![Embedding flowchart](images/embedding_flowchat.png "Embedding flowchart"){width="400"} ![Eval model flowchart](images/eval_moodel_flowchart.png "Eval model flowchart"){width="400"}


## Detailed Instructions
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Q59kntpEjeax"
},
"source": [
"##### Copyright 2019 Google LLC.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\");"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9fhjU0f_ja9h"
},
"outputs": [],
"source": [
"#@title Default title text\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4RPI9UwajiPI"
},
"source": [
"##### Full-flow evaluation\n",
"\n",
"A colab for testing the full flow of calculating embeddings and train/eval using sklearn models. Since this notebook doesn't parallelize (unlike the Apache Beam tools) and computing embeddings is computationally expensive, **please use the multi-step beam-based tools if** you'd like to eval a large dataset, eval a custom dataset, or train a Keras model.\n",
"\n",
"Please be sure to use a **Python 3** kernel. **Running on GPU** significantly speeds up the process as well.\n",
"\n",
"Conceptual overview of this colab:\n",
"\n",
"1. Read `TensorFlow Dataset` data as numpy\n",
"1. Convert audio to float and resample\n",
"1. Convert audio to embeddings\n",
"1. Train and eval sklearn model"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "dKUuSLehjke7"
},
"outputs": [],
"source": [
"tfds_dataset_name = 'savee' #@param\n",
"REQUIRED_SAMPLE_RATE_ = 16000"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gQvwTuBjjnRZ"
},
"outputs": [],
"source": [
"# Read the data into numpy arrays.\n",
"import collections\n",
"import numpy as np\n",
"\n",
"SingleSplit = collections.namedtuple(\n",
" 'SingleSplit', ['audio', 'labels', 'speaker_id'])\n",
"Data = collections.namedtuple(\n",
" 'Data', ['train', 'validation', 'test'])\n",
"\n",
"import tensorflow.compat.v2 as tf\n",
"tf.enable_v2_behavior()\n",
"assert tf.executing_eagerly()\n",
"import tensorflow_datasets as tfds\n",
"\n",
"def _dat_from_split(split):\n",
" \"\"\"Read one TFDS split into (audio tuple, labels array, speaker_id array).\"\"\"\n",
" np_generator = tfds.as_numpy(tfds.load(tfds_dataset_name, split=split))\n",
" dat = [(x['audio'], x['label'], x['speaker_id']) for x in np_generator]\n",
" audio, labels, speaker_id = zip(*dat)\n",
"\n",
" labels = np.array(labels, dtype=np.int16)\n",
" speaker_id = np.array(speaker_id)\n",
" # Sanity-check that the three parallel sequences stay aligned.\n",
" assert len(audio) == labels.size == speaker_id.size\n",
" assert labels.ndim == speaker_id.ndim == 1\n",
" print(f'Finished {split}')\n",
" return audio, labels, speaker_id\n",
"\n",
"all_data = Data(\n",
" train=SingleSplit(*_dat_from_split('train')),\n",
" validation=SingleSplit(*_dat_from_split('validation')),\n",
" test=SingleSplit(*_dat_from_split('test')))"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LhL0uSMjjpQN"
},
"outputs": [],
"source": [
"# Make the audio floats, and resample the audio if necessary.\n",
"import collections\n",
"import librosa\n",
"import numpy as np\n",
"FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])\n",
"\n",
"sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate\n",
"def _int_to_float(audio_int16, split_name):\n",
" \"\"\"Convert int16 audio to float32 in [-1, 1] at REQUIRED_SAMPLE_RATE_.\"\"\"\n",
" float_audio_16k = []\n",
" for i, samples in enumerate(audio_int16):\n",
" float_audio = samples.astype(np.float32) / np.iinfo(np.int16).max\n",
" if sample_rate != REQUIRED_SAMPLE_RATE_:\n",
" # Resample to the named constant (not a hard-coded 16000) so the\n",
" # guard above and the resample target can never diverge.\n",
" float_audio = librosa.core.resample(\n",
" float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,\n",
" res_type='kaiser_best')\n",
" float_audio_16k.append(float_audio)\n",
" if i % 50 == 0:\n",
" print(f'Finished resampling {i} / {len(audio_int16)} for {split_name}')\n",
" print(f'Finished {split_name}')\n",
" return float_audio_16k\n",
"\n",
"\n",
"float_audio_16k = FloatData(\n",
" train=_int_to_float(all_data.train.audio, 'train'),\n",
" validation=_int_to_float(all_data.validation.audio, 'validation'),\n",
" test=_int_to_float(all_data.test.audio, 'test'))"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CIb4stZ3jqsF"
},
"outputs": [],
"source": [
"tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1' #@param\n",
"output_key = 'embedding' #@param"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9SAsuvE5jrsN"
},
"outputs": [],
"source": [
"# Convert the audio to embeddings. Preaverage the embeddings across time.\n",
"import tensorflow_hub as hub\n",
"model = hub.load(tfhub_model_name)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5UtmnSGPjt1k"
},
"outputs": [],
"source": [
"import collections\n",
"Embeddings = collections.namedtuple(\n",
" 'Embeddings', ['train', 'validation', 'test'])\n",
"\n",
"def _calc_embeddings(cur_float_audio, split_name):\n",
" \"\"\"Embed each clip, averaging per-frame embeddings over time (axis 0).\"\"\"\n",
" cur_embeddings = []\n",
" for i, float_samples in enumerate(cur_float_audio):\n",
" # The audio was resampled to REQUIRED_SAMPLE_RATE_ above; pass the same\n",
" # constant to the model instead of repeating the magic number 16000.\n",
" tf_out = model(tf.constant(float_samples, tf.float32),\n",
" tf.constant(REQUIRED_SAMPLE_RATE_, tf.int32))\n",
" embedding_2d = tf_out[output_key]\n",
" assert embedding_2d.ndim == 2\n",
" embedding_1d = np.mean(embedding_2d, axis=0)\n",
" cur_embeddings.append(embedding_1d)\n",
" if i % 50 == 0:\n",
" print(f'Finished embedding {i} / {len(cur_float_audio)} for {split_name}')\n",
" print(f'Finished {split_name}')\n",
" return np.array(cur_embeddings, dtype=np.float32)\n",
"\n",
"embeddings = Embeddings(\n",
" train=_calc_embeddings(float_audio_16k.train, 'train'),\n",
" validation=_calc_embeddings(float_audio_16k.validation, 'validation'),\n",
" test=_calc_embeddings(float_audio_16k.test, 'test'))\n",
"# Sanity checks: consistent embedding width, alignment with labels/speakers,\n",
"# and no NaNs leaking out of the model.\n",
"assert embeddings.train.shape[1] == embeddings.validation.shape[1] == embeddings.test.shape[1]\n",
"assert embeddings.train.shape[0] == all_data.train.labels.shape[0] == all_data.train.speaker_id.shape[0]\n",
"assert embeddings.validation.shape[0] == all_data.validation.labels.shape[0] == all_data.validation.speaker_id.shape[0]\n",
"assert embeddings.test.shape[0] == all_data.test.labels.shape[0] == all_data.test.speaker_id.shape[0]\n",
"assert not np.isnan(embeddings.train).any()\n",
"assert not np.isnan(embeddings.validation).any()\n",
"assert not np.isnan(embeddings.test).any()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "r6BTvIGQjwt3"
},
"outputs": [],
"source": [
"model_name = 'LogisticRegression_balanced' #@param"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BVVFxPrcjyak"
},
"outputs": [],
"source": [
"from sklearn import linear_model\n",
"\n",
"def get_sklearn_model(model_name):\n",
" \"\"\"Map a model-name string to a freshly constructed sklearn estimator.\"\"\"\n",
" return {\n",
" 'LogisticRegression': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),\n",
" 'LogisticRegression_balanced': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial', class_weight='balanced'),\n",
" }[model_name]()\n",
"\n",
"def _speaker_normalization(embedding_np, speaker_id_np):\n",
" \"\"\"Normalize embedding features by per-speaker statistics.\n",
"\n",
" Works on a copy: the original mutated its input in place, so re-running\n",
" this cell would double-normalize the arrays stored in `embeddings`.\n",
" \"\"\"\n",
" embedding_np = embedding_np.copy()\n",
" all_speaker_ids = np.unique(speaker_id_np)\n",
" for speaker in all_speaker_ids:\n",
" cur_i = speaker_id_np == speaker\n",
" embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)\n",
" stds = embedding_np[cur_i].std(axis=0)\n",
" # Guard against constant features: std of 0 would divide by zero.\n",
" stds[stds == 0] = 1\n",
" embedding_np[cur_i] /= stds\n",
"\n",
" return embedding_np\n",
"\n",
"# Train models.\n",
"d = get_sklearn_model(model_name)\n",
"normalized_train = _speaker_normalization(\n",
" embeddings.train, all_data.train.speaker_id)\n",
"d.fit(normalized_train, all_data.train.labels)\n",
"\n",
"# Eval.\n",
"normalized_validation = _speaker_normalization(\n",
" embeddings.validation, all_data.validation.speaker_id)\n",
"eval_score = d.score(normalized_validation, all_data.validation.labels)\n",
"print(f'{model_name} eval score: {eval_score}')\n",
"\n",
"# Test.\n",
"normalized_test = _speaker_normalization(\n",
" embeddings.test, all_data.test.speaker_id)\n",
"test_score = d.score(normalized_test, all_data.test.labels)\n",
"print(f'{model_name} test score: {test_score}')"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Train and eval sklearn small TFDS dataset",
"provenance": [
{
"file_id": "1UE-jDSsEQ0qRvxtw_aRK2k4dJwytUdSk",
"timestamp": 1587402478951
}
]
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit c71c969

Please sign in to comment.