Open-source colabs that run the all-in-one training section.
PiperOrigin-RevId: 307532498
1 parent 2f8d5e7 · commit c71c969
Showing 5 changed files with 308 additions and 1 deletion.
304 changes: 304 additions & 0 deletions
non_semantic_speech_benchmark/train_and_eval_sklearn_small_tfds_dataset.ipynb
@@ -0,0 +1,304 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text",
        "id": "Q59kntpEjeax"
      },
      "source": [
        "##### Copyright 2019 Google LLC.\n",
        "\n",
        "Licensed under the Apache License, Version 2.0 (the \"License\");"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "9fhjU0f_ja9h"
      },
      "outputs": [],
      "source": [
"#@title Default title text\n", | ||
"# Licensed under the Apache License, Version 2.0 (the \"License\");\n", | ||
"# you may not use this file except in compliance with the License.\n", | ||
"# You may obtain a copy of the License at\n", | ||
"#\n", | ||
"# https://www.apache.org/licenses/LICENSE-2.0\n", | ||
"#\n", | ||
"# Unless required by applicable law or agreed to in writing, software\n", | ||
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n", | ||
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | ||
"# See the License for the specific language governing permissions and\n", | ||
"# limitations under the License." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"colab_type": "text", | ||
"id": "4RPI9UwajiPI" | ||
}, | ||
"source": [ | ||
"##### Full-flow evaluation\n", | ||
"\n", | ||
"A colab for testing the full flow of calculating embeddings and train/eval using sklearn models. Since this notebook doesn't parallelize (like the apache beam tools do) and computing embeddings is computationally expensive, **please use the mutli-step beam-based tools if** you'd like to eval a large dataset, eval a custom dataset, or train a Keras model.\n", | ||
"\n", | ||
"Please be sure to use a **Python 3** kernel. **Running on GPU** significantly speeds up the process as well.\n", | ||
"\n", | ||
"Conceptual overview of this colab:\n", | ||
"\n", | ||
"1. Read `TensorFlow Dataset` data as numpy\n", | ||
"1. Convert audio to float and resample\n", | ||
"1. Convert audio to embeddings\n", | ||
"1. Train and eval sklearn model" | ||
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "dKUuSLehjke7"
      },
      "outputs": [],
      "source": [
        "tfds_dataset_name = 'savee' #@param\n",
        "REQUIRED_SAMPLE_RATE_ = 16000"
      ]
    },
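    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text"
      },
      "source": [
        "Optional sanity check (a minimal sketch, not part of the original flow): inspect the chosen dataset's splits and native sample rate before loading everything. It assumes the dataset exposes an `audio` feature and `train`/`validation`/`test` splits, as `savee` does."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code"
      },
      "outputs": [],
      "source": [
        "# Sketch: print the dataset's splits and the audio feature's native sample\n",
        "# rate, so we know whether resampling to REQUIRED_SAMPLE_RATE_ will be needed.\n",
        "import tensorflow_datasets as tfds\n",
        "\n",
        "builder = tfds.builder(tfds_dataset_name)\n",
        "print('Splits:', list(builder.info.splits.keys()))\n",
        "print('Native sample rate:', builder.info.features['audio'].sample_rate)"
      ]
    },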
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "gQvwTuBjjnRZ"
      },
      "outputs": [],
      "source": [
        "# Read the data into numpy arrays.\n",
        "import collections\n",
        "import numpy as np\n",
        "import tensorflow.compat.v2 as tf\n",
        "import tensorflow_datasets as tfds\n",
        "\n",
        "tf.enable_v2_behavior()\n",
        "assert tf.executing_eagerly()\n",
        "\n",
        "SingleSplit = collections.namedtuple(\n",
        "    'SingleSplit', ['audio', 'labels', 'speaker_id'])\n",
        "Data = collections.namedtuple(\n",
        "    'Data', ['train', 'validation', 'test'])\n",
        "\n",
        "def _dat_from_split(split):\n",
        "  np_generator = tfds.as_numpy(tfds.load(tfds_dataset_name, split=split))\n",
        "  dat = [(x['audio'], x['label'], x['speaker_id']) for x in np_generator]\n",
        "  audio, labels, speaker_id = zip(*dat)\n",
        "\n",
        "  labels = np.array(labels, dtype=np.int16)\n",
        "  speaker_id = np.array(speaker_id)\n",
        "  assert len(audio) == labels.size == speaker_id.size\n",
        "  assert labels.ndim == speaker_id.ndim == 1\n",
        "  print(f'Finished {split}')\n",
        "  return audio, labels, speaker_id\n",
        "\n",
        "all_data = Data(\n",
        "    train=SingleSplit(*_dat_from_split('train')),\n",
        "    validation=SingleSplit(*_dat_from_split('validation')),\n",
        "    test=SingleSplit(*_dat_from_split('test')))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "LhL0uSMjjpQN"
      },
      "outputs": [],
      "source": [
        "# Convert the audio to floats, and resample the audio if necessary.\n",
        "import collections\n",
        "import librosa\n",
        "import numpy as np\n",
        "FloatData = collections.namedtuple('FloatData', ['train', 'validation', 'test'])\n",
        "\n",
        "sample_rate = tfds.builder(tfds_dataset_name).info.features['audio'].sample_rate\n",
        "\n",
        "def _int_to_float(audio_int16, split_name):\n",
        "  float_audio_16k = []\n",
        "  for i, samples in enumerate(audio_int16):\n",
        "    float_audio = samples.astype(np.float32) / np.iinfo(np.int16).max\n",
        "    if sample_rate != REQUIRED_SAMPLE_RATE_:\n",
        "      float_audio = librosa.core.resample(\n",
        "          float_audio, orig_sr=sample_rate, target_sr=REQUIRED_SAMPLE_RATE_,\n",
        "          res_type='kaiser_best')\n",
        "    float_audio_16k.append(float_audio)\n",
        "    if i % 50 == 0:\n",
        "      print(f'Finished resampling {i} / {len(audio_int16)} for {split_name}')\n",
        "  print(f'Finished {split_name}')\n",
        "  return float_audio_16k\n",
        "\n",
        "\n",
        "float_audio_16k = FloatData(\n",
        "    train=_int_to_float(all_data.train.audio, 'train'),\n",
        "    validation=_int_to_float(all_data.validation.audio, 'validation'),\n",
        "    test=_int_to_float(all_data.test.audio, 'test'))"
      ]
    },
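    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text"
      },
      "source": [
        "Optional (a minimal sketch, not part of the original flow): listen to one resampled training clip to sanity-check the conversion and resampling. It assumes an interactive Colab/Jupyter environment where `IPython.display.Audio` can render a player."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code"
      },
      "outputs": [],
      "source": [
        "# Sketch: play back the first training clip at the target sample rate.\n",
        "from IPython.display import Audio, display\n",
        "\n",
        "display(Audio(float_audio_16k.train[0], rate=REQUIRED_SAMPLE_RATE_))"
      ]
    },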
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "CIb4stZ3jqsF"
      },
      "outputs": [],
      "source": [
"tfhub_model_name = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/1' #@param\n", | ||
"output_key = 'embedding' #@param" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 0, | ||
"metadata": { | ||
"colab": {}, | ||
"colab_type": "code", | ||
"id": "9SAsuvE5jrsN" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert the audio to embeddings. Preaverage the embeddings across time.\n", | ||
"import tensorflow_hub as hub\n", | ||
"model = hub.load(tfhub_model_name)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 0, | ||
"metadata": { | ||
"colab": {}, | ||
"colab_type": "code", | ||
"id": "5UtmnSGPjt1k" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import collections\n", | ||
"Embeddings = collections.namedtuple(\n", | ||
" 'Embeddings', ['train', 'validation', 'test'])\n", | ||
"\n", | ||
"def _calc_embeddings(cur_float_audio, split_name):\n", | ||
" cur_embeddings = []\n", | ||
" for i, float_samples in enumerate(cur_float_audio):\n", | ||
" tf_out = model(tf.constant(float_samples, tf.float32),\n", | ||
" tf.constant(16000, tf.int32))\n", | ||
" embedding_2d = tf_out[output_key]\n", | ||
" assert embedding_2d.ndim == 2\n", | ||
" embedding_1d = np.mean(embedding_2d, axis=0)\n", | ||
" cur_embeddings.append(embedding_1d)\n", | ||
" if i % 50 == 0:\n", | ||
" print(f'Finished embedding {i} / {len(cur_float_audio)} for {split_name}')\n", | ||
" print(f'Finished {split_name}')\n", | ||
" cur_embeddings = np.array(cur_embeddings, dtype=np.float32)\n", | ||
" return cur_embeddings\n", | ||
"\n", | ||
"embeddings = Embeddings(\n", | ||
" train=_calc_embeddings(float_audio_16k.train, 'train'),\n", | ||
" validation=_calc_embeddings(float_audio_16k.validation, 'validation'),\n", | ||
" test=_calc_embeddings(float_audio_16k.test, 'test'))\n", | ||
"assert embeddings.train.shape[1] == embeddings.validation.shape[1] == embeddings.test.shape[1]\n", | ||
"assert embeddings.train.shape[0] == all_data.train.labels.shape[0] == all_data.train.speaker_id.shape[0]\n", | ||
"assert embeddings.validation.shape[0] == all_data.validation.labels.shape[0] == all_data.validation.speaker_id.shape[0]\n", | ||
"assert embeddings.test.shape[0] == all_data.test.labels.shape[0] == all_data.test.speaker_id.shape[0]\n", | ||
"assert not np.isnan(embeddings.train).any()\n", | ||
"assert not np.isnan(embeddings.validation).any()\n", | ||
"assert not np.isnan(embeddings.test).any()" | ||
] | ||
}, | ||
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "r6BTvIGQjwt3"
      },
      "outputs": [],
      "source": [
        "model_name = 'LogisticRegression_balanced' #@param"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code",
        "id": "BVVFxPrcjyak"
      },
      "outputs": [],
      "source": [
        "from sklearn import linear_model\n",
        "\n",
        "def get_sklearn_model(model_name):\n",
        "  return {\n",
        "      'LogisticRegression': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),\n",
        "      'LogisticRegression_balanced': lambda: linear_model.LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial', class_weight='balanced'),\n",
        "  }[model_name]()\n",
        "\n",
        "def _speaker_normalization(embedding_np, speaker_id_np):\n",
        "  \"\"\"Normalize embedding features by per-speaker statistics.\"\"\"\n",
        "  all_speaker_ids = np.unique(speaker_id_np)\n",
        "  for speaker in all_speaker_ids:\n",
        "    cur_i = speaker_id_np == speaker\n",
        "    embedding_np[cur_i] -= embedding_np[cur_i].mean(axis=0)\n",
        "    stds = embedding_np[cur_i].std(axis=0)\n",
        "    stds[stds == 0] = 1\n",
        "    embedding_np[cur_i] /= stds\n",
        "\n",
        "  return embedding_np\n",
        "\n",
        "# Train the model.\n",
        "clf = get_sklearn_model(model_name)\n",
        "normalized_train = _speaker_normalization(\n",
        "    embeddings.train, all_data.train.speaker_id)\n",
        "clf.fit(normalized_train, all_data.train.labels)\n",
        "\n",
        "# Evaluate on the validation split.\n",
        "normalized_validation = _speaker_normalization(\n",
        "    embeddings.validation, all_data.validation.speaker_id)\n",
        "eval_score = clf.score(normalized_validation, all_data.validation.labels)\n",
        "print(f'{model_name} eval score: {eval_score}')\n",
        "\n",
        "# Evaluate on the test split.\n",
        "normalized_test = _speaker_normalization(\n",
        "    embeddings.test, all_data.test.speaker_id)\n",
        "test_score = clf.score(normalized_test, all_data.test.labels)\n",
        "print(f'{model_name} test score: {test_score}')"
      ]
    },
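    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text"
      },
      "source": [
        "Optional extra (a minimal sketch, not part of the original flow): also report balanced accuracy (unweighted average recall) on the test split, which is less sensitive to class imbalance than plain accuracy. It reuses `clf` and `normalized_test` from the cell above and assumes `sklearn.metrics.balanced_accuracy_score` is available (scikit-learn >= 0.20)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 0,
      "metadata": {
        "colab": {},
        "colab_type": "code"
      },
      "outputs": [],
      "source": [
        "# Sketch: balanced accuracy on the test split (assumes scikit-learn >= 0.20).\n",
        "from sklearn.metrics import balanced_accuracy_score\n",
        "\n",
        "test_predictions = clf.predict(normalized_test)\n",
        "balanced_acc = balanced_accuracy_score(all_data.test.labels, test_predictions)\n",
        "print(f'{model_name} test balanced accuracy: {balanced_acc}')"
      ]
    }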
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "Train and eval sklearn small TFDS dataset",
      "provenance": [
        {
          "file_id": "1UE-jDSsEQ0qRvxtw_aRK2k4dJwytUdSk",
          "timestamp": 1587402478951
        }
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}