Skip to content

Commit

Permalink
Open source colabs that run all-in-one training section.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 307532498
  • Loading branch information
joel-shor authored and copybara-github committed Apr 21, 2020
1 parent 2f8d5e7 commit c71c969
Show file tree
Hide file tree
Showing 5 changed files with 308 additions and 1 deletion.
5 changes: 4 additions & 1 deletion non_semantic_speech_benchmark/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Towards Learning a Universal Non-Semantic Representation of Speech (WORK IN PROGRESS)
# Towards Learning a Universal Non-Semantic Representation of Speech

Paper: [Towards Learning a Universal Non-Semantic Representation of Speech](https://arxiv.org/abs/2002.12764)

Expand Down Expand Up @@ -28,6 +28,9 @@ To use this benchmark or embeddings, please cite as follows:

## Overview

![Data flowchart](images/data_flowchart.png "Data flowchart"){width="500"}

![Embedding flowchart](images/embedding_flowchat.png "Embedding flowchart"){width="400"} ![Eval model flowchart](images/eval_moodel_flowchart.png "Eval model flowchart"){width="400"}


## Detailed Instructions
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Q59kntpEjeax"
},
"source": [
"##### Copyright 2019 Google LLC.\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\");"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9fhjU0f_ja9h"
},
"outputs": [],
"source": [
"#@title Default title text\n",
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"# you may not use this file except in compliance with the License.\n",
"# You may obtain a copy of the License at\n",
"#\n",
"# https://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4RPI9UwajiPI"
},
"source": [
"##### Full-flow evaluation\n",
"\n",
"A colab for testing the full flow of calculating embeddings and train/eval using sklearn models. Since this notebook doesn't parallelize (unlike the Apache Beam tools) and computing embeddings is computationally expensive, **please use the multi-step beam-based tools if** you'd like to eval a large dataset, eval a custom dataset, or train a Keras model.\n",
"\n",
"Please be sure to use a **Python 3** kernel. **Running on GPU** significantly speeds up the process as well.\n",
"\n",
"Conceptual overview of this colab:\n",
"\n",
"1. Read `TensorFlow Dataset` data as numpy\n",
"1. Convert audio to float and resample\n",
"1. Convert audio to embeddings\n",
"1. Train and eval sklearn model"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "dKUuSLehjke7"
},
"outputs": [],
"source": [
"tfds_dataset_name = 'savee' #@param\n",
"REQUIRED_SAMPLE_RATE_ = 16000"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "gQvwTuBjjnRZ"
},
"outputs": [],
"source": [
"# Read the data into numpy arrays.\n",
"import collections\n",
"import numpy as np\n",
"\n",
"SingleSplit = collections.namedtuple(\n",
" 'SingleSplit', ['audio', 'labels', 'speaker_id'])\n",
"Data = collections.namedtuple(\n",
" 'Data', ['train', 'validation', 'test'])\n",
"\n",
"import tensorflow.compat.v2 as tf\n",
"tf.enable_v2_behavior()\n",
"assert tf.executing_eagerly()\n",
"import tensorflow_datasets as tfds\n",
"\n",
"def _dat_from_split(split):\n",
" \"\"\"Read one TFDS split into (audio tuple, labels array, speaker_id array).\"\"\"\n",
" np_generator = tfds.as_numpy(tfds.load(tfds_dataset_name, split=split))\n",
" dat = [(x['audio'], x['label'], x['speaker_id']) for x in np_generator]\n",
" audio, labels, speaker_id = zip(*dat)\n",
"\n",
" labels = np.array(labels, dtype=np.int16)\n",
" speaker_id = np.array(speaker_id)\n",
" # Sanity-check that the three parallel sequences stay aligned.\n",
" assert len(audio) == labels.size == speaker_id.size\n",
" assert labels.ndim == speaker_id.ndim == 1\n",
" print(f'Finished {split}')\n",
" return audio, labels, speaker_id\n",
"\n",
"all_data = Data(\n",
" train=SingleSplit(*_dat_from_split('train')),\n",
" validation=SingleSplit(*_dat_from_split('validation')),\n",
" test=SingleSplit(*_dat_from_split('test')))"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "LhL0uSMjjpQN"
},
"outputs": [],
"source": [
"# Make the audio floats, and resample the audio if necessary.\n",
"import collections\n",
"import librosa\n",
"import numpy as np\n",
"FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])\n",
"\n",
"sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate\n",
"def _int_to_float(audio_int16, split_name):\n",
" \"\"\"Convert int16 audio to float32 in [-1, 1] at REQUIRED_SAMPLE_RATE_.\"\"\"\n",
" float_audio_16k = []\n",
" for i, samples in enumerate(audio_int16):\n",
" float_audio = samples.astype(np.float32) / np.iinfo(np.int16).max\n",
" if sample_rate != REQUIRED_SAMPLE_RATE_:\n",
" # Resample to the named constant (not a hard-coded 16000) so the\n",
" # guard above and the resample target can never diverge.\n",
" float_audio = librosa.core.resample(\n",
" float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,\n",
" res_type='kaiser_best')\n",
" float_audio_16k.append(float_audio)\n",
" if i % 50 == 0:\n",
" print(f'Finished resampling {i} / {len(audio_int16)} for {split_name}')\n",
" print(f'Finished {split_name}')\n",
" return float_audio_16k\n",
"\n",
"\n",
"float_audio_16k = FloatData(\n",
" train=_int_to_float(all_data.train.audio, 'train'),\n",
" validation=_int_to_float(all_data.validation.audio, 'validation'),\n",
" test=_int_to_float(all_data.test.audio, 'test'))"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CIb4stZ3jqsF"
},
"outputs": [],
"source": [
"tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1' #@param\n",
"output_key = 'embedding' #@param"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "9SAsuvE5jrsN"
},
"outputs": [],
"source": [
"# Convert the audio to embeddings. Preaverage the embeddings across time.\n",
"import tensorflow_hub as hub\n",
"model = hub.load(tfhub_model_name)"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "5UtmnSGPjt1k"
},
"outputs": [],
"source": [
"import collections\n",
"Embeddings = collections.namedtuple(\n",
" 'Embeddings', ['train', 'validation', 'test'])\n",
"\n",
"def _calc_embeddings(cur_float_audio, split_name):\n",
" \"\"\"Embed each clip, averaging per-frame embeddings over time (axis 0).\"\"\"\n",
" cur_embeddings = []\n",
" for i, float_samples in enumerate(cur_float_audio):\n",
" # The audio was resampled to REQUIRED_SAMPLE_RATE_ above; pass the same\n",
" # constant to the model instead of repeating the magic number 16000.\n",
" tf_out = model(tf.constant(float_samples, tf.float32),\n",
" tf.constant(REQUIRED_SAMPLE_RATE_, tf.int32))\n",
" embedding_2d = tf_out[output_key]\n",
" assert embedding_2d.ndim == 2\n",
" embedding_1d = np.mean(embedding_2d, axis=0)\n",
" cur_embeddings.append(embedding_1d)\n",
" if i % 50 == 0:\n",
" print(f'Finished embedding {i} / {len(cur_float_audio)} for {split_name}')\n",
" print(f'Finished {split_name}')\n",
" return np.array(cur_embeddings, dtype=np.float32)\n",
"\n",
"embeddings = Embeddings(\n",
" train=_calc_embeddings(float_audio_16k.train, 'train'),\n",
" validation=_calc_embeddings(float_audio_16k.validation, 'validation'),\n",
" test=_calc_embeddings(float_audio_16k.test, 'test'))\n",
"# Sanity checks: consistent embedding width, alignment with labels/speakers,\n",
"# and no NaNs leaking out of the model.\n",
"assert embeddings.train.shape[1] == embeddings.validation.shape[1] == embeddings.test.shape[1]\n",
"assert embeddings.train.shape[0] == all_data.train.labels.shape[0] == all_data.train.speaker_id.shape[0]\n",
"assert embeddings.validation.shape[0] == all_data.validation.labels.shape[0] == all_data.validation.speaker_id.shape[0]\n",
"assert embeddings.test.shape[0] == all_data.test.labels.shape[0] == all_data.test.speaker_id.shape[0]\n",
"assert not np.isnan(embeddings.train).any()\n",
"assert not np.isnan(embeddings.validation).any()\n",
"assert not np.isnan(embeddings.test).any()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "r6BTvIGQjwt3"
},
"outputs": [],
"source": [
"model_name = 'LogisticRegression_balanced' #@param"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "BVVFxPrcjyak"
},
"outputs": [],
"source": [
"from sklearn import linear_model\n",
"\n",
"def get_sklearn_model(model_name):\n",
" \"\"\"Map a model-name string to a freshly constructed sklearn estimator.\"\"\"\n",
" return {\n",
" 'LogisticRegression': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),\n",
" 'LogisticRegression_balanced': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial', class_weight='balanced'),\n",
" }[model_name]()\n",
"\n",
"def _speaker_normalization(embedding_np, speaker_id_np):\n",
" \"\"\"Normalize embedding features by per-speaker statistics.\n",
"\n",
" Works on a copy: the original mutated its input in place, so re-running\n",
" this cell would double-normalize the arrays stored in `embeddings`.\n",
" \"\"\"\n",
" embedding_np = embedding_np.copy()\n",
" all_speaker_ids = np.unique(speaker_id_np)\n",
" for speaker in all_speaker_ids:\n",
" cur_i = speaker_id_np == speaker\n",
" embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)\n",
" stds = embedding_np[cur_i].std(axis=0)\n",
" # Guard against constant features: std of 0 would divide by zero.\n",
" stds[stds == 0] = 1\n",
" embedding_np[cur_i] /= stds\n",
"\n",
" return embedding_np\n",
"\n",
"# Train models.\n",
"d = get_sklearn_model(model_name)\n",
"normalized_train = _speaker_normalization(\n",
" embeddings.train, all_data.train.speaker_id)\n",
"d.fit(normalized_train, all_data.train.labels)\n",
"\n",
"# Eval.\n",
"normalized_validation = _speaker_normalization(\n",
" embeddings.validation, all_data.validation.speaker_id)\n",
"eval_score = d.score(normalized_validation, all_data.validation.labels)\n",
"print(f'{model_name} eval score: {eval_score}')\n",
"\n",
"# Test.\n",
"normalized_test = _speaker_normalization(\n",
" embeddings.test, all_data.test.speaker_id)\n",
"test_score = d.score(normalized_test, all_data.test.labels)\n",
"print(f'{model_name} test score: {test_score}')"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Train and eval sklearn small TFDS dataset",
"provenance": [
{
"file_id": "1UE-jDSsEQ0qRvxtw_aRK2k4dJwytUdSk",
"timestamp": 1587402478951
}
]
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit c71c969

Please sign in to comment.