diff --git a/.github/workflows/basic_tests_for_aud_csv.yaml b/.github/workflows/basic_tests_for_aud_csv.yaml index ad55e24d..dd0fd84e 100644 --- a/.github/workflows/basic_tests_for_aud_csv.yaml +++ b/.github/workflows/basic_tests_for_aud_csv.yaml @@ -36,8 +36,9 @@ jobs: - name: Run csv-ravdess-praat-xgb run: | cd data/ravdess - wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip - unzip Audio_Speech_Actors_01-24.zip + # wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip + wget https://zenodo.org/records/11063852/files/Audio_Speech_Actors_01-24_16k.zip + unzip Audio_Speech_Actors_01-24_16k.zip cd ../.. python3 -m nkululeko.nkululeko --config data/ravdess/exp_praat_xgb.ini > output1.txt if grep -q "DONE" output1.txt; then diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a3c518d..5001c13f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,43 @@ Changelog ========= +Version 0.84.0 +-------------- +* added SHAP analysis +* started with finetuning + +Version 0.83.3 +-------------- +* fixed a naming error in trill features that prevented storage of experiment + +Version 0.83.2 +-------------- +* added default cuda if present and not stated + +Version 0.83.1 +-------------- +* add test module to nkuluflag + +Version 0.83.0 +-------------- +* test module now prints out reports + +Version 0.82.4 +-------------- +* fixed bug in wavlm + +Version 0.82.3 +-------------- +* fixed another audformat peculiarity to interpret time values as nanoseconds + +Version 0.82.2 +-------------- +* fixed audformat peculiarity that dataframes can have only one column + +Version 0.82.1 +-------------- +* Add more tests for GC action + Version 0.82.0 -------------- * added nkuluflag module diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..8fd6e21c --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,72 @@ +# Code of Conduct + +In the interest of fostering an open and welcoming environment, we as +contributors 
and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of +experience, education, socioeconomic status, nationality, personal appearance, +race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* using welcoming and inclusive language, +* being respectful of differing viewpoints and experiences, +* gracefully accepting constructive criticism, +* focusing on what is best for the community, and +* showing empathy towards other community members. + +Examples of unacceptable behavior by participants include: + +* the use of sexualized language or imagery and unwelcome sexual + attention or advances, +* trolling, insulting/derogatory comments, and personal or political + attacks, +* public or private harassment, +* publishing others' private information, such as a physical or + electronic address, without explicit permission, and +* other conduct which could reasonably be considered inappropriate in + a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, or to ban temporarily or permanently any +contributor for other behaviors that they deem inappropriate, threatening, +offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. 
Examples of +representing a project or community include using an official project email +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by emailing the project team. All complaints will be reviewed +and investigated and will result in a response that is deemed necessary and +appropriate to the circumstances. The project team is obligated to maintain +confidentiality with regard to the reporter of an incident. Further details of +specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][contributor_covenant] +version 1.4. 
+ +[contributor_covenant]: https://www.contributor-covenant.org/ diff --git a/data/crema-d/load_db.py b/data/crema-d/load_db.py index 86d97581..ba60852e 100644 --- a/data/crema-d/load_db.py +++ b/data/crema-d/load_db.py @@ -8,4 +8,4 @@ audb.config.CACHE_ROOT = cwd # load the latest version of the data -db = audb.load("crema-d", format="wav", sampling_rate=16000, mixdown=True) +db = audb.load("crema-d", version="1.3.0", verbose=True) diff --git a/data/ravdess/README.md b/data/ravdess/README.md index d4010dbe..2c78de41 100644 --- a/data/ravdess/README.md +++ b/data/ravdess/README.md @@ -5,10 +5,14 @@ I used the version downloadable from [Zenodo](https://zenodo.org/record/1188976) Download and unzip the file Audio_Speech_Actors_01-24.zip ```bash +# download original dataset in 48k $ wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip $ unzip Audio_Speech_Actors_01-24.zip ``` +Or, if you prefer the dataset in 16k, you can download from this link: +https://zenodo.org/records/11063852/files/Audio_Speech_Actors_01-24_16k.zip + run the file ```bash python3 process_database.py diff --git a/data/ravdess/exp_praat_xgb.ini b/data/ravdess/exp_praat_xgb.ini index ded9f0bb..bd2cc770 100644 --- a/data/ravdess/exp_praat_xgb.ini +++ b/data/ravdess/exp_praat_xgb.ini @@ -10,10 +10,12 @@ dev = ./data/ravdess/ravdess_dev.csv dev.type = csv dev.absolute_path = False dev.split_strategy = train +dev.audio_path = Audio_Speech_Actors_01-24_16k/ test = ./data/ravdess/ravdess_test.csv test.type = csv test.absolute_path = False test.split_strategy = test +test.audio_path = Audio_Speech_Actors_01-24_16k/ target = emotion labels = ['angry', 'happy', 'neutral', 'sad'] [FEATS] diff --git a/ini_file.md b/ini_file.md index 5047912b..37226779 100644 --- a/ini_file.md +++ b/ini_file.md @@ -330,7 +330,9 @@ * **dist_type**: type of plot for value counts, either histogram or density estimation (kde) * dist_type = hist * **spotlight**: open a web-browser window to inspect the data 
with the [spotlight software](https://github.com/Renumics/spotlight). Needs package *renumics-spotlight* to be installed! - * spotlight = False + * spotlight = False +* **shap**: compute [SHAP](https://shap.readthedocs.io/en/latest/) values + * shap = False ### [PREDICT](#predict) * **targets**: Speaker/speech characteristics to be predicted by some models * targets = ['gender', 'age', 'snr', 'arousal', 'valence', 'dominance', 'pesq', 'mos'] diff --git a/meta/demos/multiple_exeriments/do_experiments.py b/meta/demos/multiple_experiments/do_experiments.py similarity index 85% rename from meta/demos/multiple_exeriments/do_experiments.py rename to meta/demos/multiple_experiments/do_experiments.py index 70fcd3c0..d75b7834 100644 --- a/meta/demos/multiple_exeriments/do_experiments.py +++ b/meta/demos/multiple_experiments/do_experiments.py @@ -19,13 +19,13 @@ # {'--feat': 'os', # '--set': 'ComParE_2016', # }, - {"--feat": "audmodel"}, + {"--feat": "praat"}, ] for c in classifiers: for f in features: - cmd = "python -m nkululeko.nkuluflag --config exp.ini " + cmd = "python -m nkululeko.nkuluflag --config meta/demos/multiple_exeriments/exp.ini " for item in c: cmd += f"{item} {c[item]} " for item in f: diff --git a/meta/demos/multiple_exeriments/exp.ini b/meta/demos/multiple_experiments/exp.ini similarity index 96% rename from meta/demos/multiple_exeriments/exp.ini rename to meta/demos/multiple_experiments/exp.ini index a50cf430..fcf76dc1 100644 --- a/meta/demos/multiple_exeriments/exp.ini +++ b/meta/demos/multiple_experiments/exp.ini @@ -11,6 +11,7 @@ emodb.train_tables = ['emotion.categories.train.gold_standard'] emodb.test_tables = ['emotion.categories.test.gold_standard'] target = emotion labels = ['anger', 'happiness'] +tests = ['emodb'] [FEATS] [MODEL] C_val = .001 diff --git a/meta/demos/multiple_experiments/tmp.ini b/meta/demos/multiple_experiments/tmp.ini new file mode 100644 index 00000000..de887a4d --- /dev/null +++ b/meta/demos/multiple_experiments/tmp.ini @@ 
-0,0 +1,28 @@ +[EXP] +root = ./ +name = results +runs = 1 +epochs = 1 + +[DATA] +databases = ['emodb'] +emodb = ../../../data/emodb/emodb +emodb.split_strategy = specified +emodb.train_tables = ['emotion.categories.train.gold_standard'] +emodb.test_tables = ['emotion.categories.test.gold_standard'] +target = emotion +labels = ['anger', 'happiness'] + +[FEATS] +type = ['praat'] + +[MODEL] +c_val = .001 +learning_rate = 0.0001 +store = True +patience = 5 +type = svm + +[PLOT] +best_model = True + diff --git a/meta/demos/result_visualization/plot_multiple_experiment_results.ipynb b/meta/demos/result_visualization/plot_multiple_experiment_results.ipynb new file mode 100644 index 00000000..4fc86f27 --- /dev/null +++ b/meta/demos/result_visualization/plot_multiple_experiment_results.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def barplot(df, title=\"Results\", \\\n", + " xlab='Data', metric=\"UAR\", \\\n", + " ylim_low=.4, ylim_up=1, bar_width = 0.12, \\\n", + " fontsize=10):\n", + " \"\"\"A bar plot for 2 dimensional results: e.g. models vs. 
features\n", + " \n", + " \"\"\"\n", + " colors = ['tab:red',\n", + " 'tab:blue',\n", + " 'tab:orange',\n", + " 'tab:green',\n", + " 'tab:purple',\n", + " 'tab:cyan',\n", + " 'tab:brown',\n", + " 'tab:olive',\n", + " 'tab:pink',\n", + " ]\n", + " bar_width = bar_width\n", + "\n", + " fig, ax = plt.subplots(figsize=(12, 8))\n", + " br = []\n", + " br.append(np.arange(df.shape[1]))\n", + " for m in range(df.shape[0]):\n", + " br.append([x + bar_width for x in br[-1]])\n", + "\n", + " for im in range(df.shape[0]):\n", + " bars = plt.bar(br[im],\n", + " df.iloc[im],\n", + " color=colors[im],\n", + " width=bar_width,\n", + " edgecolor='k',\n", + " label=df.index[im])\n", + " # Add labels to the bars\n", + " for bar, value in zip(bars, df.iloc[im]):\n", + " value = f'.{(int)(100*value)}'\n", + " plt.text(bar.get_x() + bar.get_width() / 2,\n", + " bar.get_height() + 0.01,\n", + " f'{value}', # Format the value to 2 decimal places\n", + " ha='center',\n", + " va='bottom',\n", + " fontsize=fontsize)\n", + "\n", + " plt.xlabel(xlab, fontweight='bold', fontsize=15)\n", + " plt.ylabel(metric, fontweight='bold', fontsize=15)\n", + " plt.xticks([r + (df.shape[0]-1)//2*bar_width for r in range(df.shape[1])],\n", + " df.columns,\n", + " fontsize=15)\n", + " plt.title(title.upper(), fontweight='bold', fontsize=15)\n", + " plt.grid()\n", + " plt.legend(fontsize=14)\n", + " plt.ylim([ylim_low, ylim_up])\n", + " plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# hacky code to collect the results from storage\n", + "\n", + "feats = [\n", + " \"os\",\n", + " \"praat\",\n", + " \"audmodel\",\n", + " \"hubert-large-ll60k\",\n", + " \"trill\",\n", + " \"whisper-medium\",\n", + " \"wavlm-large\",\n", + " \"wav2vec2\",\n", + "]\n", + "#result_key = \"dev\"\n", + "result_key = \"test\"\n", + "\n", + "models = [\"mlp\",\"xgb\",\"svm\"]\n", + "result_arrays = {}\n", + "for m in models:\n", + " result_arrays[m] 
= []\n", + "for f in feats:\n", + " for m in models:\n", + " if result_key == \"dev\":\n", + " if m == \"mlp\":\n", + " fn = f\"results/results/run_0/train_dev_{m}_{f}_16-64_C_val-10_drop-3_scale-standard_conf.txt\"\n", + " else:\n", + " fn = f\"results/results/run_0/train_dev_{m}_{f}_C_val-10_drop-3_scale-standard_conf.txt\"\n", + " elif result_key == \"test\":\n", + " if m == \"mlp\":\n", + " fn = f\"results/results/run_0/train_dev_{m}_{f}_16-64_C_val-10_drop-3_scale-standard_test-test_conf.txt\"\n", + " else:\n", + " fn = f\"results/results/run_0/train_dev_{m}_{f}_C_val-10_drop-3_scale-standard_test-test_conf.txt\"\n", + " file_in = open(fn, \"r\")\n", + " line = file_in.read()\n", + " y = line.split(\"\\n\")[0].split(\" \")[3].replace(\",\", \"\")\n", + " e = line.split(\"\\n\")[0].split(\" \")[4]\n", + " y = int(float(y) * 1000) / 1000.0\n", + " print(f, y, e)\n", + " result_arrays[m].append(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make a dataframe from the results\n", + "db_df = pd.DataFrame(result_arrays, index = feats)\n", + "db_df['mean'] = db_df.mean(numeric_only=True, axis=1)\n", + "db_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ... 
and plot it\n", + "barplot(db_df,title = f'results {result_key}',ylim_low =.25, ylim_up=0.8, bar_width=.1, fontsize=15)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# should look like this:\n", + "from IPython.display import Image\n", + "Image(filename='../../images/results_dev.png') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/meta/images/results_dev.png b/meta/images/results_dev.png new file mode 100644 index 00000000..002a8109 Binary files /dev/null and b/meta/images/results_dev.png differ diff --git a/nkululeko/constants.py b/nkululeko/constants.py index 94325f65..40c84b1e 100644 --- a/nkululeko/constants.py +++ b/nkululeko/constants.py @@ -1,2 +1,2 @@ -VERSION="0.82.0" +VERSION="0.84.0" SAMPLING_RATE = 16000 diff --git a/nkululeko/data/dataset_csv.py b/nkululeko/data/dataset_csv.py index 71b73b91..60c434eb 100644 --- a/nkululeko/data/dataset_csv.py +++ b/nkululeko/data/dataset_csv.py @@ -22,7 +22,18 @@ def load(self): # data_file = os.path.join(exp_root, data_file) root = os.path.dirname(data_file) audio_path = self.util.config_val_data(self.name, "audio_path", "") - df = audformat.utils.read_csv(data_file) + df = pd.read_csv(data_file) + # special treatment for segmented dataframes with only one column: + if "start" in df.columns and len(df.columns) 
== 4: + index = audformat.segmented_index( + df.file.values, df.start.values, df.end.values + ) + df = df.set_index(index) + df = df.drop(columns=["file", "start", "end"]) + else: + df = audformat.utils.read_csv(data_file) + if isinstance(df, pd.Series): + df = df.to_frame() rename_cols = self.util.config_val_data(self.name, "colnames", False) if rename_cols: col_dict = ast.literal_eval(rename_cols) diff --git a/nkululeko/demo.py b/nkululeko/demo.py index eda595e3..d59c73f9 100644 --- a/nkululeko/demo.py +++ b/nkululeko/demo.py @@ -2,8 +2,9 @@ # Demonstration code to use the ML-experiment framework # Test the loading of a previously trained model and demo mode # needs the project config file to run before -""" -This script is used to test the loading of a previously trained model and run it in demo mode. +"""This script is used to test the loading of a previously trained model. + +And run it in demo mode. It requires the project config file to be run before. Usage: @@ -20,17 +21,15 @@ import configparser import os -import nkululeko.glob_conf as glob_conf from nkululeko.constants import VERSION from nkululeko.experiment import Experiment +import nkululeko.glob_conf as glob_conf from nkululeko.utils.util import Util def main(src_dir): - parser = argparse.ArgumentParser( - description="Call the nkululeko DEMO framework.") - parser.add_argument("--config", default="exp.ini", - help="The base configuration") + parser = argparse.ArgumentParser(description="Call the nkululeko DEMO framework.") + parser.add_argument("--config", default="exp.ini", help="The base configuration") parser.add_argument( "--file", help="A file that should be processed (16kHz mono wav)" ) diff --git a/nkululeko/demo_predictor.py b/nkululeko/demo_predictor.py index a2239ea3..d6e38cfe 100644 --- a/nkululeko/demo_predictor.py +++ b/nkululeko/demo_predictor.py @@ -1,18 +1,19 @@ # demo_predictor.py import os -import audformat -import audiofile import numpy as np import pandas as pd +import audformat 
+import audiofile + import nkululeko.glob_conf as glob_conf from nkululeko.utils.util import Util class Demo_predictor: def __init__(self, model, file, is_list, feature_extractor, label_encoder, outfile): - """Constructor setting up name and configuration""" + """Constructor setting up name and configuration.""" self.model = model self.feature_extractor = feature_extractor self.label_encoder = label_encoder diff --git a/nkululeko/experiment.py b/nkululeko/experiment.py index 5ecf041d..f8d51dd2 100644 --- a/nkululeko/experiment.py +++ b/nkululeko/experiment.py @@ -5,25 +5,27 @@ import random import time -import audeer -import audformat import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder -import nkululeko.glob_conf as glob_conf +import audeer +import audformat + from nkululeko.data.dataset import Dataset from nkululeko.data.dataset_csv import Dataset_CSV from nkululeko.demo_predictor import Demo_predictor from nkululeko.feat_extract.feats_analyser import FeatureAnalyser from nkululeko.feature_extractor import FeatureExtractor from nkululeko.file_checker import FileChecker -from nkululeko.filter_data import DataFilter, filter_min_dur +from nkululeko.filter_data import DataFilter +from nkululeko.filter_data import filter_min_dur +import nkululeko.glob_conf as glob_conf from nkululeko.plots import Plots from nkululeko.reporting.report import Report from nkululeko.runmanager import Runmanager from nkululeko.scaler import Scaler -from nkululeko.test_predictor import Test_predictor +from nkululeko.test_predictor import TestPredictor from nkululeko.utils.util import Util @@ -101,6 +103,7 @@ def load_datasets(self): self.got_speaker = True self.datasets.update({d: data}) self.target = self.util.config_val("DATA", "target", "emotion") + glob_conf.set_target(self.target) # print target via debug self.util.debug(f"target: {self.target}") # print keys/column @@ -487,11 +490,7 @@ def random_splice(self): return df_ret def analyse_features(self, 
needs_feats): - """ - Do a feature exploration - - """ - + """Do a feature exploration.""" plot_feats = eval( self.util.config_val("EXPL", "feature_distributions", "False") ) @@ -511,7 +510,7 @@ def analyse_features(self, needs_feats): f"unknown sample selection specifier {sample_selection}, should" " be [all | train | test]" ) - + self.util.debug(f"sampling selection: {sample_selection}") if self.util.config_val("EXPL", "value_counts", False): self.plot_distribution(df_labels) @@ -537,9 +536,13 @@ def analyse_features(self, needs_feats): f"unknown sample selection specifier {sample_selection}, should" " be [all | train | test]" ) + feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats) + # check if SHAP features should be analysed + shap = eval(self.util.config_val("EXPL", "shap", "False")) + if shap: + feat_analyser.analyse_shap(self.runmgr.get_best_model()) if plot_feats: - feat_analyser = FeatureAnalyser(sample_selection, df_labels, df_feats) feat_analyser.analyse() # check if a scatterplot should be done @@ -672,15 +675,19 @@ def demo(self, file, is_list, outfile): def predict_test_and_save(self, result_name): model = self.runmgr.get_best_model() model.set_testdata(self.df_test, self.feats_test) - test_predictor = Test_predictor( + test_predictor = TestPredictor( model, self.df_test, self.label_encoder, result_name ) - test_predictor.predict_and_store() + result = test_predictor.predict_and_store() + return result def load(self, filename): - f = open(filename, "rb") - tmp_dict = pickle.load(f) - f.close() + try: + f = open(filename, "rb") + tmp_dict = pickle.load(f) + f.close() + except EOFError as eof: + self.util.error(f"can't open file {filename}: {eof}") self.__dict__.update(tmp_dict) glob_conf.set_labels(self.labels) @@ -688,22 +695,26 @@ def save(self, filename): if self.runmgr.modelrunner.model.is_ann(): self.runmgr.modelrunner.model = None self.util.warn( - f"Save experiment: Can't pickle the learning model so saving without it." 
+ "Save experiment: Can't pickle the trained model so saving without it. (it should be stored anyway)" ) try: f = open(filename, "wb") pickle.dump(self.__dict__, f) f.close() - except TypeError: + except (TypeError, AttributeError) as error: self.feature_extractor.feat_extractor.model = None f = open(filename, "wb") pickle.dump(self.__dict__, f) f.close() self.util.warn( - f"Save experiment: Can't pickle the feature extraction model so saving without it." + "Save experiment: Can't pickle the feature extraction model so saving without it." + + f"{type(error).__name__} {error}" + ) + except RuntimeError as error: + self.util.warn( + "Save experiment: Can't pickle local object, NOT saving: " + + f"{type(error).__name__} {error}" ) - except (AttributeError, RuntimeError) as error: - self.util.warn(f"Save experiment: Can't pickle local object: {error}") def save_onnx(self, filename): # export the model to onnx diff --git a/nkululeko/explore.py b/nkululeko/explore.py index e03ca9d6..42178efe 100644 --- a/nkululeko/explore.py +++ b/nkululeko/explore.py @@ -12,9 +12,9 @@ def main(src_dir): parser = argparse.ArgumentParser( - description="Call the nkululeko EXPLORE framework.") - parser.add_argument("--config", default="exp.ini", - help="The base configuration") + description="Call the nkululeko EXPLORE framework." 
+ ) + parser.add_argument("--config", default="exp.ini", help="The base configuration") args = parser.parse_args() if args.config is not None: config_file = args.config @@ -43,28 +43,34 @@ def main(src_dir): import warnings warnings.filterwarnings("ignore") - - # load the data - expr.load_datasets() - - # split into train and test - expr.fill_train_and_tests() - util.debug( - f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}") - - plot_feats = eval(util.config_val( - "EXPL", "feature_distributions", "False")) - tsne = eval(util.config_val("EXPL", "tsne", "False")) - scatter = eval(util.config_val("EXPL", "scatter", "False")) - spotlight = eval(util.config_val("EXPL", "spotlight", "False")) - model_type = util.config_val("EXPL", "model", False) - plot_tree = eval(util.config_val("EXPL", "plot_tree", "False")) needs_feats = False - if plot_feats or tsne or scatter or model_type or plot_tree: - # these investigations need features to explore - expr.extract_feats() + try: + # load the experiment + expr.load(f"{util.get_save_name()}") needs_feats = True - # explore + except FileNotFoundError: + # first time: load the data + expr.load_datasets() + + # split into train and test + expr.fill_train_and_tests() + util.debug( + f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}" + ) + + plot_feats = eval(util.config_val("EXPL", "feature_distributions", "False")) + tsne = eval(util.config_val("EXPL", "tsne", "False")) + scatter = eval(util.config_val("EXPL", "scatter", "False")) + spotlight = eval(util.config_val("EXPL", "spotlight", "False")) + shap = eval(util.config_val("EXPL", "shap", "False")) + model_type = util.config_val("EXPL", "model", False) + plot_tree = eval(util.config_val("EXPL", "plot_tree", "False")) + needs_feats = False + if plot_feats or tsne or scatter or model_type or plot_tree or shap: + # these investigations need features to explore + expr.extract_feats() + needs_feats = True + # explore 
expr.analyse_features(needs_feats) expr.store_report() print("DONE") diff --git a/nkululeko/feat_extract/feats_agender_agender.py b/nkululeko/feat_extract/feats_agender_agender.py index 3ab2fa50..c070ad52 100644 --- a/nkululeko/feat_extract/feats_agender_agender.py +++ b/nkululeko/feat_extract/feats_agender_agender.py @@ -28,9 +28,11 @@ def _load_model(self): if not os.path.isdir(model_root): cache_root = audeer.mkdir("cache") model_root = audeer.mkdir(model_root) - archive_path = audeer.download_url(model_url, cache_root, verbose=True) + archive_path = audeer.download_url( + model_url, cache_root, verbose=True) audeer.extract_archive(archive_path, model_root) - device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + device = self.util.config_val("MODEL", "device", cuda) self.model = audonnx.load(model_root, device=device) # pytorch_total_params = sum(p.numel() for p in self.model.parameters()) # self.util.debug( diff --git a/nkululeko/feat_extract/feats_analyser.py b/nkululeko/feat_extract/feats_analyser.py index 4b3a291d..e7d93459 100644 --- a/nkululeko/feat_extract/feats_analyser.py +++ b/nkululeko/feat_extract/feats_analyser.py @@ -40,6 +40,39 @@ def _get_importance(self, model, permutation): importance = model.feature_importances_ return importance + def analyse_shap(self, model): + """Shap analysis. + + Use the best model from a previous run and analyse feature importance with SHAP. + https://m.mage.ai/how-to-interpret-and-explain-your-machine-learning-models-using-shap-values-471c2635b78e. 
+ """ + import shap + + name = "my_shap_values" + if not self.util.exist_pickle(name): + + explainer = shap.Explainer( + model.predict_shap, + self.features, + output_names=glob_conf.labels, + algorithm="permutation", + npermutations=5, + ) + self.util.debug("computing SHAP values...") + shap_values = explainer(self.features) + self.util.to_pickle(shap_values, name) + else: + shap_values = self.util.from_pickle(name) + plt.tight_layout() + shap.plots.bar(shap_values) + fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + exp_name = self.util.get_exp_name(only_data=True) + format = self.util.config_val("PLOT", "format", "png") + filename = f"_SHAP_{model.name}" + filename = f"{fig_dir}{exp_name}{filename}.{format}" + plt.savefig(filename) + self.util.debug(f"plotted SHAP feature importance tp {filename}") + def analyse(self): models = ast.literal_eval(self.util.config_val("EXPL", "model", "['log_reg']")) model_name = "_".join(models) diff --git a/nkululeko/feat_extract/feats_hubert.py b/nkululeko/feat_extract/feats_hubert.py index 4a63dbe9..99c07b94 100644 --- a/nkululeko/feat_extract/feats_hubert.py +++ b/nkululeko/feat_extract/feats_hubert.py @@ -6,23 +6,26 @@ import os -import audeer -import nkululeko.glob_conf as glob_conf import pandas as pd import torch import torchaudio -from audformat.utils import map_file_path -from nkululeko.feat_extract.featureset import Featureset from tqdm import tqdm -from transformers import HubertModel, Wav2Vec2FeatureExtractor +from transformers import HubertModel +from transformers import Wav2Vec2FeatureExtractor + +from nkululeko.feat_extract.featureset import Featureset +import nkululeko.glob_conf as glob_conf class Hubert(Featureset): - """Class to extract HuBERT embedding)""" + """Class to extract HuBERT embedding).""" def __init__(self, name, data_df, feat_type): - """Constructor. 
is_train is needed to distinguish from test/dev sets, - because they use the codebook from the training""" + """Constructor. + + Is_train is needed to distinguish from test/dev sets, + because they use the codebook from the training. + """ super().__init__(name, data_df, feat_type) # check if device is not set, use cuda if available cuda = "cuda" if torch.cuda.is_available() else "cpu" @@ -61,16 +64,12 @@ def extract(self): """Extract the features or load them from disk if present.""" store = self.util.get_path("store") storage = f"{store}{self.name}.pkl" - extract = self.util.config_val( - "FEATS", "needs_feature_extraction", False - ) + extract = self.util.config_val("FEATS", "needs_feature_extraction", False) no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False")) if extract or no_reuse or not os.path.isfile(storage): if not self.model_initialized: self.init_model() - self.util.debug( - "extracting Hubert embeddings, this might take a while..." - ) + self.util.debug("extracting Hubert embeddings, this might take a while...") emb_series = pd.Series(index=self.data_df.index, dtype=object) length = len(self.data_df.index) for idx, (file, start, end) in enumerate( @@ -84,9 +83,7 @@ def extract(self): assert sampling_rate == 16000 emb = self.get_embeddings(signal, sampling_rate, file) emb_series.iloc[idx] = emb - self.df = pd.DataFrame( - emb_series.values.tolist(), index=self.data_df.index - ) + self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index) self.df.to_pickle(storage) try: glob_conf.config["DATA"]["needs_feature_extraction"] = "false" diff --git a/nkululeko/feat_extract/feats_squim.py b/nkululeko/feat_extract/feats_squim.py index 93baaf30..b10cb1e4 100644 --- a/nkululeko/feat_extract/feats_squim.py +++ b/nkululeko/feat_extract/feats_squim.py @@ -28,12 +28,17 @@ class SquimSet(Featureset): - """Class to predict SQUIM features""" + """Class to predict SQUIM features.""" def __init__(self, name, data_df, feats_type): - 
"""Constructor. is_train is needed to distinguish from test/dev sets, because they use the codebook from the training""" + """Constructor. + + Is_train is needed to distinguish from test/dev sets, + because they use the codebook from the training. + """ super().__init__(name, data_df, feats_type) - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) self.model_initialized = False def init_model(self): diff --git a/nkululeko/feat_extract/feats_trill.py b/nkululeko/feat_extract/feats_trill.py index b3c42b4d..9f3cc1e4 100644 --- a/nkululeko/feat_extract/feats_trill.py +++ b/nkululeko/feat_extract/feats_trill.py @@ -1,35 +1,39 @@ # feats_trill.py -import tensorflow_hub as hub import os + +import pandas as pd import tensorflow as tf -from numpy.core.numeric import tensordot +import tensorflow_hub as hub from tqdm import tqdm -import pandas as pd + import audiofile as af -from nkululeko.utils.util import Util -import nkululeko.glob_conf as glob_conf + from nkululeko.feat_extract.featureset import Featureset +import nkululeko.glob_conf as glob_conf +from nkululeko.utils.util import Util + # Import TF 2.X and make sure we're running eager. assert tf.executing_eagerly() class TRILLset(Featureset): - """A feature extractor for the Google TRILL embeddings""" + """A feature extractor for the Google TRILL embeddings. - """https://ai.googleblog.com/2020/06/improving-speech-representations-and.html""" + See https://ai.googleblog.com/2020/06/improving-speech-representations-and.html. + """ # Initialization of the class def __init__(self, name, data_df, feats_type): - """ - Initialize the class with name, data and Util instance - Also loads the model from hub + """Initialize the class with name, data and Util instance. 
- :param name: Name of the class - :type name: str - :param data_df: Data of the class - :type data_df: DataFrame - :return: None + Also loads the model from hub + Args: + :param name: Name of the class + :type name: str + :param data_df: Data of the class + :type data_df: DataFrame + :return: None """ super().__init__(name, data_df, feats_type) # Load the model from the configured path @@ -38,25 +42,21 @@ def __init__(self, name, data_df, feats_type): "trill.model", "https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3", ) - self.module = hub.load(model_path) + self.model = hub.load(model_path) self.feats_type = feats_type def extract(self): store = self.util.get_path("store") storage = f"{store}{self.name}.pkl" - extract = self.util.config_val( - "FEATS", "needs_feature_extraction", False) + extract = self.util.config_val("FEATS", "needs_feature_extraction", False) no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False")) if extract or no_reuse or not os.path.isfile(storage): - self.util.debug( - "extracting TRILL embeddings, this might take a while...") + self.util.debug("extracting TRILL embeddings, this might take a while...") emb_series = pd.Series(index=self.data_df.index, dtype=object) - length = len(self.data_df.index) for idx, file in enumerate(tqdm(self.data_df.index.get_level_values(0))): - emb = self.getEmbeddings(file) - emb_series[idx] = emb - self.df = pd.DataFrame( - emb_series.values.tolist(), index=self.data_df.index) + emb = self.get_embeddings(file) + emb_series.iloc[idx] = emb + self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index) self.df.to_pickle(storage) try: glob_conf.config["DATA"]["needs_feature_extraction"] = "false" @@ -70,15 +70,15 @@ def embed_wav(self, wav): if len(wav.shape) > 1: wav = tf.reduce_mean(wav, axis=0) - emb_dict = self.module(samples=wav, sample_rate=tf.constant(16000)) + emb_dict = self.model(samples=wav, sample_rate=tf.constant(16000)) return emb_dict["embedding"] - def 
getEmbeddings(self, file): + def get_embeddings(self, file): wav = af.read(file)[0] - emb_short = self.getEmbeddings_signal(wav, 16000) + emb_short = self.get_embeddings_signal(wav, 16000) return emb_short - def getEmbeddings_signal(self, signal, sr): + def get_embeddings_signal(self, signal, sr): wav = tf.convert_to_tensor(signal) emb_short = self.embed_wav(wav) # you get one embedding per frame, we use the mean for all the frames @@ -86,7 +86,7 @@ def getEmbeddings_signal(self, signal, sr): return emb_short def extract_sample(self, signal, sr): - if self.module == None: + if self.model == None: self.__init__("na", None) - feats = self.getEmbeddings_signal(signal, sr) + feats = self.get_embeddings_signal(signal, sr) return feats diff --git a/nkululeko/feat_extract/feats_wav2vec2.py b/nkululeko/feat_extract/feats_wav2vec2.py index 6ace2f73..46888e20 100644 --- a/nkululeko/feat_extract/feats_wav2vec2.py +++ b/nkululeko/feat_extract/feats_wav2vec2.py @@ -21,7 +21,11 @@ class Wav2vec2(Featureset): """Class to extract wav2vec2 embeddings""" def __init__(self, name, data_df, feat_type): - """Constructor. is_train is needed to distinguish from test/dev sets, because they use the codebook from the training""" + """Constructor. 
+ + If_train is needed to distinguish from test/dev sets, + because they use the codebook from the training + """ super().__init__(name, data_df, feat_type) cuda = "cuda" if torch.cuda.is_available() else "cpu" self.device = self.util.config_val("MODEL", "device", cuda) @@ -39,8 +43,7 @@ def init_model(self): ) config = transformers.AutoConfig.from_pretrained(model_path) layer_num = config.num_hidden_layers - hidden_layer = int(self.util.config_val( - "FEATS", "wav2vec2.layer", "0")) + hidden_layer = int(self.util.config_val("FEATS", "wav2vec2.layer", "0")) config.num_hidden_layers = layer_num - hidden_layer self.util.debug(f"using hidden layer #{config.num_hidden_layers}") self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model_path) @@ -55,8 +58,7 @@ def extract(self): """Extract the features or load them from disk if present.""" store = self.util.get_path("store") storage = f"{store}{self.name}.pkl" - extract = self.util.config_val( - "FEATS", "needs_feature_extraction", False) + extract = self.util.config_val("FEATS", "needs_feature_extraction", False) no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False")) if extract or no_reuse or not os.path.isfile(storage): if not self.model_initialized: @@ -77,8 +79,7 @@ def extract(self): emb = self.get_embeddings(signal, sampling_rate, file) emb_series[idx] = emb # print(f"emb_series shape: {emb_series.shape}") - self.df = pd.DataFrame( - emb_series.values.tolist(), index=self.data_df.index) + self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index) # print(f"df shape: {self.df.shape}") self.df.to_pickle(storage) try: diff --git a/nkululeko/feat_extract/feats_wavlm.py b/nkululeko/feat_extract/feats_wavlm.py index 748791e1..a21b0860 100644 --- a/nkululeko/feat_extract/feats_wavlm.py +++ b/nkululeko/feat_extract/feats_wavlm.py @@ -4,27 +4,32 @@ import os -import nkululeko.glob_conf as glob_conf import pandas as pd import torch import torchaudio -from nkululeko.feat_extract.featureset 
import Featureset from tqdm import tqdm -from transformers import Wav2Vec2FeatureExtractor, WavLMModel +from transformers import Wav2Vec2FeatureExtractor +from transformers import WavLMModel + +from nkululeko.feat_extract.featureset import Featureset +import nkululeko.glob_conf as glob_conf class Wavlm(Featureset): - """Class to extract WavLM embedding)""" + """Class to extract WavLM embedding).""" + + def __init__(self, name, data_df, feats_type): + """Constructor. - def __init__(self, name, data_df, feat_type): - """Constructor. is_train is needed to distinguish from test/dev sets, - because they use the codebook from the training""" - super().__init__(name, data_df) + Is_train is needed to distinguish from test/dev sets, + because they use the codebook from the training. + """ + super().__init__(name, data_df, feats_type) # check if device is not set, use cuda if available cuda = "cuda" if torch.cuda.is_available() else "cpu" self.device = self.util.config_val("MODEL", "device", cuda) self.model_initialized = False - self.feat_type = feat_type + self.feat_type = feats_type def init_model(self): # load model @@ -59,7 +64,9 @@ def extract(self): frame_offset=int(start.total_seconds() * 16000), num_frames=int((end - start).total_seconds() * 16000), ) - assert sampling_rate == 16000, f"sampling rate should be 16000 but is {sampling_rate}" + assert ( + sampling_rate == 16000 + ), f"sampling rate should be 16000 but is {sampling_rate}" emb = self.get_embeddings(signal, sampling_rate, file) emb_series.iloc[idx] = emb self.df = pd.DataFrame(emb_series.values.tolist(), index=self.data_df.index) diff --git a/nkululeko/feat_extract/feats_whisper.py b/nkululeko/feat_extract/feats_whisper.py index b0333bec..f6b6e94b 100644 --- a/nkululeko/feat_extract/feats_whisper.py +++ b/nkululeko/feat_extract/feats_whisper.py @@ -32,19 +32,22 @@ def init_model(self): model_name = f"openai/{self.feat_type}" self.model = WhisperModel.from_pretrained(model_name).to(self.device) 
print(f"intialized Whisper model on {self.device}") - self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) + self.feature_extractor = AutoFeatureExtractor.from_pretrained( + model_name) self.model_initialized = True def extract(self): """Extract the features or load them from disk if present.""" store = self.util.get_path("store") storage = f"{store}{self.name}.pkl" - extract = self.util.config_val("FEATS", "needs_feature_extraction", False) + extract = self.util.config_val( + "FEATS", "needs_feature_extraction", False) no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False")) if extract or no_reuse or not os.path.isfile(storage): if not self.model_initialized: self.init_model() - self.util.debug("extracting whisper embeddings, this might take a while...") + self.util.debug( + "extracting whisper embeddings, this might take a while...") emb_series = [] for (file, start, end), _ in audeer.progress_bar( self.data_df.iterrows(), diff --git a/nkululeko/glob_conf.py b/nkululeko/glob_conf.py index c74d4ceb..c630367d 100644 --- a/nkululeko/glob_conf.py +++ b/nkululeko/glob_conf.py @@ -29,3 +29,8 @@ def set_report(report_obj): def set_labels(labels_obj): global labels labels = labels_obj + + +def set_target(target_obj): + global target + target = target_obj diff --git a/nkululeko/modelrunner.py b/nkululeko/modelrunner.py index b36d990f..af63adc3 100644 --- a/nkululeko/modelrunner.py +++ b/nkululeko/modelrunner.py @@ -2,18 +2,16 @@ import pandas as pd -from nkululeko.utils.util import Util from nkululeko import glob_conf -import nkululeko.glob_conf as glob_conf +from nkululeko.utils.util import Util class Modelrunner: - """ - Class to model one run - """ + """Class to model one run.""" def __init__(self, df_train, df_test, feats_train, feats_test, run): - """Constructor setting up the dataframes + """Constructor setting up the dataframes. 
+ Args: df_train: train dataframe df_test: test dataframe diff --git a/nkululeko/models/model.py b/nkululeko/models/model.py index fce9c0e6..903c663c 100644 --- a/nkululeko/models/model.py +++ b/nkululeko/models/model.py @@ -20,6 +20,7 @@ class Model: def __init__(self, df_train, df_test, feats_train, feats_test): """Constructor taking the configuration and all dataframes.""" + self.name = "undefined" self.df_train, self.df_test, self.feats_train, self.feats_test = ( df_train, df_test, diff --git a/nkululeko/models/model_bayes.py b/nkululeko/models/model_bayes.py index d54dd76f..dbddbb8b 100644 --- a/nkululeko/models/model_bayes.py +++ b/nkululeko/models/model_bayes.py @@ -12,3 +12,4 @@ class Bayes_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) self.clf = GaussianNB() # set up the classifier + self.name = "bayes" diff --git a/nkululeko/models/model_cnn.py b/nkululeko/models/model_cnn.py index d5f321a6..4bc983eb 100644 --- a/nkululeko/models/model_cnn.py +++ b/nkululeko/models/model_cnn.py @@ -16,6 +16,7 @@ from sklearn.metrics import recall_score from collections import OrderedDict from PIL import Image +from traitlets import default from nkululeko.utils.util import Util import nkululeko.glob_conf as glob_conf @@ -33,7 +34,8 @@ def __init__(self, df_train, df_test, feats_train, feats_test): """Constructor taking the configuration and all dataframes""" super().__init__(df_train, df_test, feats_train, feats_test) super().set_model_type("ann") - self.target = glob_conf.config["DATA"]["target"] + self.name = "cnn" + self.target = glob_conf.target labels = glob_conf.labels self.class_num = len(labels) # set up loss criterion @@ -48,6 +50,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test): self.util.error(f"unknown loss function: {criterion}") self.util.debug(f"using model with cross entropy loss function") # set up the model + # cuda = "cuda" if 
torch.cuda.is_available() else "cpu" self.device = self.util.config_val("MODEL", "device", "cpu") try: layers_string = glob_conf.config["MODEL"]["layers"] @@ -209,7 +212,8 @@ def load(self, run, epoch): dir = self.util.get_path("model_dir") # name = f'{self.util.get_exp_name()}_{run}_{epoch:03d}.model' name = f"{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model" - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) layers = ast.literal_eval(glob_conf.config["MODEL"]["layers"]) self.store_path = dir + name drop = self.util.config_val("MODEL", "drop", False) @@ -222,7 +226,8 @@ def load(self, run, epoch): def load_path(self, path, run, epoch): self.set_id(run, epoch) with open(path, "rb") as handle: - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) layers = ast.literal_eval(glob_conf.config["MODEL"]["layers"]) self.store_path = path drop = self.util.config_val("MODEL", "drop", False) diff --git a/nkululeko/models/model_gmm.py b/nkululeko/models/model_gmm.py index ed635e83..f8d2bb7c 100644 --- a/nkululeko/models/model_gmm.py +++ b/nkululeko/models/model_gmm.py @@ -11,10 +11,9 @@ class GMM_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "gmm" n_components = int(self.util.config_val("MODEL", "GMM_components", "4")) - covariance_type = self.util.config_val( - "MODEL", "GMM_covariance_type", "full" - ) + covariance_type = self.util.config_val("MODEL", "GMM_covariance_type", "full") self.clf = mixture.GaussianMixture( n_components=n_components, covariance_type=covariance_type ) diff --git a/nkululeko/models/model_knn.py b/nkululeko/models/model_knn.py index 05e170b0..4c77fbbe 100644 --- 
a/nkululeko/models/model_knn.py +++ b/nkululeko/models/model_knn.py @@ -11,6 +11,7 @@ class KNN_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "knn" method = self.util.config_val("MODEL", "KNN_weights", "uniform") k = int(self.util.config_val("MODEL", "K_val", "5")) self.clf = KNeighborsClassifier( diff --git a/nkululeko/models/model_knn_reg.py b/nkululeko/models/model_knn_reg.py index 875f981a..b728679f 100644 --- a/nkululeko/models/model_knn_reg.py +++ b/nkululeko/models/model_knn_reg.py @@ -11,6 +11,7 @@ class KNN_reg_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "knn_reg" method = self.util.config_val("MODEL", "KNN_weights", "uniform") k = int(self.util.config_val("MODEL", "K_val", "5")) self.clf = KNeighborsRegressor( diff --git a/nkululeko/models/model_lin_reg.py b/nkululeko/models/model_lin_reg.py index 5b4eb422..dc5b7491 100644 --- a/nkululeko/models/model_lin_reg.py +++ b/nkululeko/models/model_lin_reg.py @@ -11,4 +11,5 @@ class Lin_reg_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "lin_reg" self.clf = LinearRegression() # set up the classifier diff --git a/nkululeko/models/model_mlp.py b/nkululeko/models/model_mlp.py index 9af7c595..2ba71285 100644 --- a/nkululeko/models/model_mlp.py +++ b/nkululeko/models/model_mlp.py @@ -1,4 +1,6 @@ # model_mlp.py +import pandas as pd + from nkululeko.utils.util import Util import nkululeko.glob_conf as glob_conf from nkululeko.models.model import Model @@ -20,6 +22,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test): """Constructor taking the configuration and all dataframes""" super().__init__(df_train, df_test, feats_train, feats_test) super().set_model_type("ann") + self.name = "mlp" 
self.target = glob_conf.config["DATA"]["target"] labels = glob_conf.labels self.class_num = len(labels) @@ -34,8 +37,9 @@ def __init__(self, df_train, df_test, feats_train, feats_test): else: self.util.error(f"unknown loss function: {criterion}") self.util.debug(f"using model with cross entropy loss function") - # set up the model - self.device = self.util.config_val("MODEL", "device", "cpu") + # set up the model, use GPU if availabe + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) try: layers_string = glob_conf.config["MODEL"]["layers"] except KeyError as ke: @@ -172,13 +176,26 @@ def forward(self, x): x = x.squeeze(dim=1).float() return self.linear(x) + def predict_shap(self, features): + # predict outputs for all samples in SHAP format (pd. dataframe) + results = [] + for index, row in features.iterrows(): + feats = row.values + res_dict = self.predict_sample(feats) + class_key = max(res_dict, key=res_dict.get) + results.append(class_key) + return results + def predict_sample(self, features): - """Predict one sample""" + """Predict one sample.""" with torch.no_grad(): features = torch.from_numpy(features) features = np.reshape(features, (-1, 1)).T logits = self.model(features.to(self.device)) # logits = self.model(features) + # if tensor conver to cpu + if isinstance(logits, torch.Tensor): + logits = logits.cpu() a = logits.numpy() res = {} for i in range(len(a[0])): @@ -196,7 +213,8 @@ def load(self, run, epoch): dir = self.util.get_path("model_dir") # name = f'{self.util.get_exp_name()}_{run}_{epoch:03d}.model' name = f"{self.util.get_exp_name(only_train=True)}_{self.run}_{self.epoch:03d}.model" - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) layers = ast.literal_eval(glob_conf.config["MODEL"]["layers"]) self.store_path = dir + name drop = 
self.util.config_val("MODEL", "drop", False) @@ -211,7 +229,8 @@ def load(self, run, epoch): def load_path(self, path, run, epoch): self.set_id(run, epoch) with open(path, "rb") as handle: - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) layers = ast.literal_eval(glob_conf.config["MODEL"]["layers"]) self.store_path = path drop = self.util.config_val("MODEL", "drop", False) diff --git a/nkululeko/models/model_mlp_regression.py b/nkululeko/models/model_mlp_regression.py index 42949670..d2f798b9 100644 --- a/nkululeko/models/model_mlp_regression.py +++ b/nkululeko/models/model_mlp_regression.py @@ -9,6 +9,7 @@ from audmetric import concordance_cc from audmetric import mean_absolute_error from audmetric import mean_squared_error +from traitlets import default import nkululeko.glob_conf as glob_conf from nkululeko.losses.loss_ccc import ConcordanceCorCoeff @@ -24,6 +25,7 @@ class MLP_Reg_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): """Constructor taking the configuration and all dataframes""" super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "mlp_reg" super().set_model_type("ann") self.target = glob_conf.config["DATA"]["target"] labels = glob_conf.labels @@ -40,7 +42,8 @@ def __init__(self, df_train, df_test, feats_train, feats_test): self.util.error(f"unknown loss function: {criterion}") self.util.debug(f"training model with {criterion} loss function") # set up the model - self.device = self.util.config_val("MODEL", "device", "cpu") + cuda = "cuda" if torch.cuda.is_available() else "cpu" + self.device = self.util.config_val("MODEL", "device", cuda) layers_string = glob_conf.config["MODEL"]["layers"] self.util.debug(f"using layers {layers_string}") try: diff --git a/nkululeko/models/model_svm.py b/nkululeko/models/model_svm.py index 6ad4ac74..1d53a95b 100644 --- 
a/nkululeko/models/model_svm.py +++ b/nkululeko/models/model_svm.py @@ -11,6 +11,7 @@ class SVM_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "svm" c = float(self.util.config_val("MODEL", "C_val", "0.001")) if eval(self.util.config_val("MODEL", "class_weight", "False")): class_weight = "balanced" diff --git a/nkululeko/models/model_svr.py b/nkululeko/models/model_svr.py index 71dd950a..ee6d4240 100644 --- a/nkululeko/models/model_svr.py +++ b/nkululeko/models/model_svr.py @@ -11,6 +11,7 @@ class SVR_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "svr" c = float(self.util.config_val("MODEL", "C_val", "0.001")) # kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’ kernel = self.util.config_val("MODEL", "kernel", "rbf") diff --git a/nkululeko/models/model_tree.py b/nkululeko/models/model_tree.py index a536b1d9..afa30d46 100644 --- a/nkululeko/models/model_tree.py +++ b/nkululeko/models/model_tree.py @@ -11,4 +11,5 @@ class Tree_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "tree" self.clf = DecisionTreeClassifier() # set up the classifier diff --git a/nkululeko/models/model_tree_reg.py b/nkululeko/models/model_tree_reg.py index 0d5648c7..f5ad2309 100644 --- a/nkululeko/models/model_tree_reg.py +++ b/nkululeko/models/model_tree_reg.py @@ -11,4 +11,5 @@ class Tree_reg_model(Model): def __init__(self, df_train, df_test, feats_train, feats_test): super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "tree_reg" self.clf = DecisionTreeRegressor() # set up the classifier diff --git a/nkululeko/models/model_xgb.py b/nkululeko/models/model_xgb.py index b5a78469..681ec37a 100644 --- 
a/nkululeko/models/model_xgb.py +++ b/nkululeko/models/model_xgb.py @@ -7,9 +7,11 @@ class XGB_model(Model): """An XGBoost model""" - is_classifier = True - - clf = XGBClassifier() # set up the classifier + def __init__(self, df_train, df_test, feats_train, feats_test): + super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "xgb" + self.is_classifier = True + self.clf = XGBClassifier() # set up the classifier def get_type(self): return "xgb" diff --git a/nkululeko/models/model_xgr.py b/nkululeko/models/model_xgr.py index 3cdcae1b..f78bfebb 100644 --- a/nkululeko/models/model_xgr.py +++ b/nkululeko/models/model_xgr.py @@ -5,8 +5,10 @@ class XGR_model(Model): - """An XGBoost model""" + """An XGBoost regression model""" - is_classifier = False - - clf = XGBRegressor() # set up the regressor + def __init__(self, df_train, df_test, feats_train, feats_test): + super().__init__(df_train, df_test, feats_train, feats_test) + self.name = "xgr" + self.is_classifier = False + self.clf = XGBRegressor() # set up the regressor diff --git a/nkululeko/nkuluflag.py b/nkululeko/nkuluflag.py index 5603bcff..a827b3e6 100644 --- a/nkululeko/nkuluflag.py +++ b/nkululeko/nkuluflag.py @@ -2,13 +2,16 @@ import configparser import os import os.path +import sys from nkululeko.nkululeko import doit as nkulu +from nkululeko.test import do_it as test_mod -def do_it(src_dir): +def doit(cla): parser = argparse.ArgumentParser(description="Call the nkululeko framework.") parser.add_argument("--config", help="The base configuration") + parser.add_argument("--mod", default="nkulu", help="Which nkululeko module to call") parser.add_argument("--data", help="The databases", nargs="*", action="append") parser.add_argument( "--label", nargs="*", help="The labels for the target", action="append" @@ -25,20 +28,23 @@ def do_it(src_dir): parser.add_argument("--model", default="xgb", help="The model type") parser.add_argument("--feat", default="['os']", help="The feature type") 
parser.add_argument("--set", help="The opensmile set") - parser.add_argument("--with_os", help="To add os features") parser.add_argument("--target", help="The target designation") parser.add_argument("--epochs", help="The number of epochs") parser.add_argument("--runs", help="The number of runs") parser.add_argument("--learning_rate", help="The learning rate") parser.add_argument("--drop", help="The dropout rate [0:1]") - args = parser.parse_args() + args = parser.parse_args(cla) if args.config is not None: config_file = args.config else: print("ERROR: need config file") quit(-1) + + if args.mod is not None: + nkulu_mod = args.mod + # test if config is there if not os.path.isfile(config_file): print(f"ERROR: no such file {config_file}") @@ -86,10 +92,17 @@ def do_it(src_dir): with open(tmp_config, "w") as tmp_file: config.write(tmp_file) - result, last_epoch = nkulu(tmp_config) + result, last_epoch = 0, 0 + if nkulu_mod == "nkulu": + result, last_epoch = nkulu(tmp_config) + elif nkulu_mod == "test": + result, last_epoch = test_mod(tmp_config, "test_results.csv") + else: + print(f"ERROR: unknown module: {nkulu_mod}, should be [nkulu | test]") return result, last_epoch if __name__ == "__main__": - cwd = os.path.dirname(os.path.abspath(__file__)) - do_it(cwd) # sys.argv[1]) + cla = sys.argv + cla.pop(0) + doit(cla) # sys.argv[1]) diff --git a/nkululeko/plots.py b/nkululeko/plots.py index f04a494f..0f2baa98 100644 --- a/nkululeko/plots.py +++ b/nkululeko/plots.py @@ -28,7 +28,8 @@ def plot_distributions_speaker(self, df): df_speaker["samplenum"] = df_speaker.shape[0] df_speakers = pd.concat([df_speakers, df_speaker.head(1)]) # plot the distribution of samples per speaker - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" self.util.debug(f"plotting samples per speaker") if "gender" in df_speakers: filename = f"samples_value_counts" @@ -137,7 +138,8 @@ def 
plot_distributions(self, df, type_s="samples"): df, att1, class_label, att1, type_s ) else: - ax, caption = self._plot2cont(df, class_label, att1, type_s) + ax, caption = self._plot2cont( + df, class_label, att1, type_s) self._save_plot( ax, caption, @@ -150,7 +152,8 @@ def plot_distributions(self, df, type_s="samples"): att1 = att[0] att2 = att[1] if att1 == self.target or att2 == self.target: - self.util.debug(f"no need to correlate {self.target} with itself") + self.util.debug( + f"no need to correlate {self.target} with itself") return if att1 not in df: self.util.error(f"unknown feature: {att1}") @@ -165,7 +168,8 @@ def plot_distributions(self, df, type_s="samples"): if self.util.is_categorical(df[att1]): if self.util.is_categorical(df[att2]): # class_label = cat, att1 = cat, att2 = cat - ax, caption = self._plot2cat(df, att1, att2, att1, type_s) + ax, caption = self._plot2cat( + df, att1, att2, att1, type_s) else: # class_label = cat, att1 = cat, att2 = cont ax, caption = self._plotcatcont( @@ -186,7 +190,8 @@ def plot_distributions(self, df, type_s="samples"): if self.util.is_categorical(df[att1]): if self.util.is_categorical(df[att2]): # class_label = cont, att1 = cat, att2 = cat - ax, caption = self._plot2cat(df, att1, att2, att1, type_s) + ax, caption = self._plot2cat( + df, att1, att2, att1, type_s) else: # class_label = cont, att1 = cat, att2 = cont ax, caption = self._plot2cont_cat( @@ -200,7 +205,8 @@ def plot_distributions(self, df, type_s="samples"): ) else: # class_label = cont, att1 = cont, att2 = cont - ax, caption = self._plot2cont(df, att1, att2, type_s) + ax, caption = self._plot2cont( + df, att1, att2, type_s) self._save_plot( ax, caption, f"Correlation of {att1} and {att2}", filename, type_s @@ -213,7 +219,8 @@ def plot_distributions(self, df, type_s="samples"): ) def _save_plot(self, ax, caption, header, filename, type_s): - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + # one up because of the runs + fig_dir 
= self.util.get_path("fig_dir") + "../" fig = ax.figure # avoid warning # plt.tight_layout() @@ -231,7 +238,8 @@ def _save_plot(self, ax, caption, header, filename, type_s): ) def _check_binning(self, att, df): - bin_reals_att = eval(self.util.config_val("EXPL", f"{att}.bin_reals", "False")) + bin_reals_att = eval(self.util.config_val( + "EXPL", f"{att}.bin_reals", "False")) if bin_reals_att: self.util.debug(f"binning continuous variable {att} to categories") att_new = f"{att}_binned" @@ -305,7 +313,8 @@ def _plot2cat(self, df, col1, col2, xlab, ylab): return ax, caption def plot_durations(self, df, filename, sample_selection, caption=""): - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" try: ax = sns.histplot(df, x="duration", hue="class_label", kde=True) except AttributeError as ae: @@ -333,7 +342,8 @@ def plot_durations(self, df, filename, sample_selection, caption=""): def describe_df(self, name, df, target, filename): """Make a stacked barplot of samples and speakers per sex and target values. 
speaker, gender and target columns must be present""" - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + fig_dir = self.util.get_path( + "fig_dir") + "../" # one up because of the runs sampl_num = df.shape[0] sex_col = "gender" if target == "gender": @@ -380,8 +390,10 @@ def describe_df(self, name, df, target, filename): def scatter_plot(self, feats, label_df, label, dimred_type): dim_num = int(self.util.config_val("EXPL", "scatter.dim", 2)) - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs - sample_selection = self.util.config_val("EXPL", "sample_selection", "all") + # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" + sample_selection = self.util.config_val( + "EXPL", "sample_selection", "all") filename = f"{label}_{self.util.get_feattype_name()}_{sample_selection}_{dimred_type}_{str(dim_num)}d" filename = f"{fig_dir}{filename}.{self.format}" self.util.debug(f"computing {dimred_type}, this might take a while...") @@ -423,7 +435,8 @@ def scatter_plot(self, feats, label_df, label, dimred_type): if dim_num == 2: plot_data = np.vstack((data.T, labels)).T - plot_df = pd.DataFrame(data=plot_data, columns=("Dim_1", "Dim_2", "label")) + plot_df = pd.DataFrame( + data=plot_data, columns=("Dim_1", "Dim_2", "label")) # plt.tight_layout() ax = ( sns.FacetGrid(plot_df, hue="label", height=6) @@ -515,7 +528,8 @@ def getTsne(self, feats, dim_num, perplexity=30, learning_rate=200): def plot_feature(self, title, feature, label, df_labels, df_features): # remove fullstops in the name feature_name = feature.replace(".", "-") - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" filename = f"{fig_dir}feat_dist_{title}_{feature_name}.{self.format}" if self.util.is_categorical(df_labels[label]): df_plot = pd.DataFrame( @@ -554,7 +568,8 @@ def plot_tree(self, model, features): 
tree.plot_tree(model, feature_names=list(features.columns), ax=ax) # plt.tight_layout() # print(ax) - fig_dir = self.util.get_path("fig_dir") + "../" # one up because of the runs + # one up because of the runs + fig_dir = self.util.get_path("fig_dir") + "../" exp_name = self.util.get_exp_name(only_data=True) format = self.util.config_val("PLOT", "format", "png") filename = f"{fig_dir}{exp_name}EXPL_tree-plot.{format}" diff --git a/nkululeko/reporter.py b/nkululeko/reporter.py deleted file mode 100644 index 41228d98..00000000 --- a/nkululeko/reporter.py +++ /dev/null @@ -1,332 +0,0 @@ -"""Reporter module. - -This module contains the Reporter class which is responsible for generating reports. -""" - - -import ast -import glob -import json -import math - -import matplotlib.pyplot as plt -import numpy as np -from scipy.stats import pearsonr -from sklearn.metrics import ( - ConfusionMatrixDisplay, - accuracy_score, - classification_report, - confusion_matrix, - mean_absolute_error, - mean_squared_error, - r2_score, - recall_score, -) -from sklearn.utils import resample - -import nkululeko.glob_conf as glob_conf -from nkululeko.reporting.defines import Header -from nkululeko.reporting.report_item import ReportItem -from nkululeko.result import Result -from nkululeko.utils.util import Util - - -class Reporter: - def __set_measure(self): - if self.util.exp_is_classification(): - self.MEASURE = "UAR" - self.result.measure = self.MEASURE - self.is_classification = True - else: - self.is_classification = False - self.measure = self.util.config_val("MODEL", "measure", "mse") - if self.measure == "mse": - self.MEASURE = "MSE" - self.result.measure = self.MEASURE - elif self.measure == "mae": - self.MEASURE = "MAE" - self.result.measure = self.MEASURE - elif self.measure == "ccc": - self.MEASURE = "CCC" - self.result.measure = self.MEASURE - - def __init__(self, truths, preds, run, epoch): - """Initialization with ground truth und predictions vector""" - self.util = 
Util("reporter") - self.format = self.util.config_val("PLOT", "format", "png") - self.truths = truths - self.preds = preds - self.result = Result(0, 0, 0, 0, "unknown") - self.run = run - self.epoch = epoch - self.__set_measure() - self.cont_to_cat = False - if len(self.truths) > 0 and len(self.preds) > 0: - if self.util.exp_is_classification(): - self.result.test = recall_score( - self.truths, self.preds, average="macro" - ) - self.result.loss = 1 - accuracy_score(self.truths, self.preds) - else: - # regression experiment - if self.measure == "mse": - self.result.test = mean_squared_error( - self.truths, self.preds) - elif self.measure == "mae": - self.result.test = mean_absolute_error( - self.truths, self.preds) - elif self.measure == "ccc": - self.result.test = self.ccc(self.truths, self.preds) - if math.isnan(self.result.test): - self.util.debug(f"Truth: {self.truths}") - self.util.debug(f"Predict.: {self.preds}") - self.util.debug(f"Result is NAN: setting to -1") - self.result.test = -1 - else: - self.util.error(f"unknown measure: {self.measure}") - - # train and loss are being set by the model - - def set_id(self, run, epoch): - """Make the report identifiable with run and epoch index""" - self.run = run - self.epoch = epoch - - def continuous_to_categorical(self): - if self.cont_to_cat: - return - self.cont_to_cat = True - bins = ast.literal_eval(glob_conf.config["DATA"]["bins"]) - self.truths = np.digitize(self.truths, bins) - 1 - self.preds = np.digitize(self.preds, bins) - 1 - - def plot_confmatrix(self, plot_name, epoch): - if not self.util.exp_is_classification(): - self.continuous_to_categorical() - self._plot_confmat(self.truths, self.preds, plot_name, epoch) - - -def plot_per_speaker(self, result_df, plot_name, function): - """Plot a confusion matrix with the mode category per speakers. - - This function creates a confusion matrix for each speaker in the result_df. - The result_df should contain the columns: preds, truths and speaker. 
- - Args: - * result_df: a pandas dataframe with columns: preds, truths and speaker - * plot_name: a string with the name of the plot - * function: a string with the function to use for each speaker, - can be 'mode' or 'mean' - - Returns: - * None - """ - # Initialize empty arrays for predictions and truths - pred = np.zeros(0) - truth = np.zeros(0) - - # Iterate over each speaker - for s in result_df.speaker.unique(): - # Filter the dataframe for the current speaker - s_df = result_df[result_df.speaker == s] - - # Get the mode or mean prediction for the current speaker - mode = s_df.pred.mode().iloc[-1] - mean = s_df.pred.mean() - if function == "mode": - s_df.pred = mode - elif function == "mean": - s_df.pred = mean - else: - self.util.error(f"unknown function {function}") - - # Append the current speaker's predictions and truths to the arrays - pred = np.append(pred, s_df.pred.values) - truth = np.append(truth, s_df["truth"].values) - - # If the experiment is not a classification or continuous to categorical conversion was performed, - # convert the truths and predictions to categorical - if not (self.is_classification or self.cont_to_cat): - bins = ast.literal_eval(glob_conf.config["DATA"]["bins"]) - truth = np.digitize(truth, bins) - 1 - pred = np.digitize(pred, bins) - 1 - - # Plot the confusion matrix for the speakers - self._plot_confmat(truth, pred.astype("int"), plot_name, 0) - - def _plot_confmat(self, truths, preds, plot_name, epoch): - # print(truths) - # print(preds) - fig_dir = self.util.get_path("fig_dir") - labels = glob_conf.labels - fig = plt.figure() # figsize=[5, 5] - uar = recall_score(truths, preds, average="macro") - acc = accuracy_score(truths, preds) - cm = confusion_matrix( - truths, preds, normalize=None - ) # normalize must be one of {'true', 'pred', 'all', None} - if cm.shape[0] != len(labels): - self.util.error( - f"mismatch between confmatrix dim ({cm.shape[0]}) and labels" - f" length ({len(labels)}: {labels})" - ) - try: - disp = 
ConfusionMatrixDisplay( - confusion_matrix=cm, display_labels=labels - ).plot(cmap="Blues") - except ValueError: - disp = ConfusionMatrixDisplay( - confusion_matrix=cm, - display_labels=list(labels).remove("neutral"), - ).plot(cmap="Blues") - - reg_res = "" - if not self.is_classification: - reg_res = f", {self.MEASURE}: {self.result.test:.3f}" - - if epoch != 0: - plt.title( - f"Confusion Matrix, UAR: {uar:.3f}{reg_res}, Epoch: {epoch}") - else: - plt.title(f"Confusion Matrix, UAR: {uar:.3f}{reg_res}") - img_path = f"{fig_dir}{plot_name}.{self.format}" - plt.savefig(img_path) - fig.clear() - plt.close(fig) - plt.savefig(img_path) - plt.close(fig) - glob_conf.report.add_item( - ReportItem( - Header.HEADER_RESULTS, - self.util.get_model_description(), - "Confusion matrix", - img_path, - ) - ) - - res_dir = self.util.get_path("res_dir") - uar = int(uar * 1000) / 1000.0 - acc = int(acc * 1000) / 1000.0 - rpt = f"epoch: {epoch}, UAR: {uar}, ACC: {acc}" - # print(rpt) - self.util.debug(rpt) - file_name = f"{res_dir}{self.util.get_exp_name()}_conf.txt" - with open(file_name, "w") as text_file: - text_file.write(rpt) - - def print_results(self, epoch): - """Print all evaluation values to text file""" - res_dir = self.util.get_path("res_dir") - file_name = f"{res_dir}{self.util.get_exp_name()}_{epoch}.txt" - if self.util.exp_is_classification(): - labels = glob_conf.labels - try: - rpt = classification_report( - self.truths, - self.preds, - target_names=labels, - output_dict=True, - ) - except ValueError as e: - self.util.debug( - "Reporter: caught a ValueError when trying to get" - " classification_report: " + e - ) - rpt = self.result.to_string() - with open(file_name, "w") as text_file: - c_ress = list(range(len(labels))) - for i, l in enumerate(labels): - c_res = rpt[l]["f1-score"] - c_ress[i] = float(f"{c_res:.3f}") - self.util.debug(f"labels: {labels}") - f1_per_class = f"result per class (F1 score): {c_ress}" - self.util.debug(f1_per_class) - rpt_str = 
f"{json.dumps(rpt)}\n{f1_per_class}" - text_file.write(rpt_str) - glob_conf.report.add_item( - ReportItem( - Header.HEADER_RESULTS, - f"Classification result {self.util.get_model_description()}", - rpt_str, - ) - ) - - else: # regression - result = self.result.test - r2 = r2_score(self.truths, self.preds) - pcc = pearsonr(self.truths, self.preds)[0] - measure = self.util.config_val("MODEL", "measure", "mse") - with open(file_name, "w") as text_file: - text_file.write( - f"{measure}: {result:.3f}, r_2: {r2:.3f}, pcc {pcc:.3f}" - ) - - def make_conf_animation(self, out_name): - import imageio - - fig_dir = self.util.get_path("fig_dir") - filenames = glob.glob( - fig_dir + f"{self.util.get_plot_name()}*_?_???_cnf.png") - images = [] - for filename in filenames: - images.append(imageio.imread(filename)) - fps = self.util.config_val("PLOT", "fps", "1") - try: - imageio.mimsave(fig_dir + out_name, images, fps=int(fps)) - except RuntimeError as e: - self.util.error("error writing anim gif: " + e) - - def get_result(self): - return self.result - - def plot_epoch_progression(self, reports, out_name): - fig_dir = self.util.get_path("fig_dir") - results, losses, train_results, losses_eval = [], [], [], [] - for r in reports: - results.append(r.get_result().test) - losses.append(r.get_result().loss) - train_results.append(r.get_result().train) - losses_eval.append(r.get_result().loss_eval) - - # do a plot per run - # scale the losses so they fit on the picture - losses, results, train_results, losses_eval = ( - np.asarray(losses), - np.asarray(results), - np.asarray(train_results), - np.asarray(losses_eval), - ) - - if np.all((results > 1)): - # scale down values - results = results / 100.0 - train_results = train_results / 100.0 - # if np.all((losses < 1)): - # scale up values - plt.figure(dpi=200) - plt.plot(train_results, "green", label="train set") - plt.plot(results, "red", label="dev set") - plt.plot(losses, "black", label="losses") - plt.plot(losses_eval, "grey", 
label="losses_eval") - plt.xlabel("epochs") - plt.ylabel(f"{self.MEASURE}") - plt.legend() - plt.savefig(f"{fig_dir}{out_name}.{self.format}") - plt.close() - - @staticmethod - def ccc(ground_truth, prediction): - mean_gt = np.mean(ground_truth, 0) - mean_pred = np.mean(prediction, 0) - var_gt = np.var(ground_truth, 0) - var_pred = np.var(prediction, 0) - v_pred = prediction - mean_pred - v_gt = ground_truth - mean_gt - cor = sum(v_pred * v_gt) / \ - (np.sqrt(sum(v_pred**2)) * np.sqrt(sum(v_gt**2))) - sd_gt = np.std(ground_truth) - sd_pred = np.std(prediction) - numerator = 2 * cor * sd_gt * sd_pred - denominator = var_gt + var_pred + (mean_gt - mean_pred) ** 2 - ccc = numerator / denominator - return ccc diff --git a/nkululeko/reporting/reporter.py b/nkululeko/reporting/reporter.py index f279fb3b..411b44b4 100644 --- a/nkululeko/reporting/reporter.py +++ b/nkululeko/reporting/reporter.py @@ -55,6 +55,7 @@ def __init__(self, truths, preds, run, epoch): self.run = run self.epoch = epoch self.__set_measure() + self.filenameadd = "" self.cont_to_cat = False if len(self.truths) > 0 and len(self.preds) > 0: if self.util.exp_is_classification(): @@ -206,7 +207,7 @@ def _plot_confmat(self, truths, preds, plot_name, epoch): f"Confusion Matrix, UAR: {uar_str} " + f"(+-{up_str}/{low_str}) {reg_res}" ) - img_path = f"{fig_dir}{plot_name}.{self.format}" + img_path = f"{fig_dir}{plot_name}{self.filenameadd}.{self.format}" plt.savefig(img_path) fig.clear() plt.close(fig) @@ -228,14 +229,17 @@ def _plot_confmat(self, truths, preds, plot_name, epoch): ) # print(rpt) self.util.debug(rpt) - file_name = f"{res_dir}{self.util.get_exp_name()}_conf.txt" + file_name = f"{res_dir}{self.util.get_exp_name()}{self.filenameadd}_conf.txt" with open(file_name, "w") as text_file: text_file.write(rpt) + def set_filename_add(self, my_string): + self.filenameadd = f"_{my_string}" + def print_results(self, epoch): """Print all evaluation values to text file.""" res_dir = 
self.util.get_path("res_dir") - file_name = f"{res_dir}{self.util.get_exp_name()}_{epoch}.txt" + file_name = f"{res_dir}{self.util.get_exp_name()}_{epoch}{self.filenameadd}.txt" if self.util.exp_is_classification(): labels = glob_conf.labels try: diff --git a/nkululeko/test.py b/nkululeko/test.py index ac1a781c..06462d77 100644 --- a/nkululeko/test.py +++ b/nkululeko/test.py @@ -10,20 +10,7 @@ from nkululeko.utils.util import Util -def main(src_dir): - parser = argparse.ArgumentParser( - description="Call the nkululeko TEST framework.") - parser.add_argument("--config", default="exp.ini", - help="The base configuration") - parser.add_argument( - "--outfile", - default="my_results.csv", - help="File name to store the predictions", - ) - - args = parser.parse_args() - - config_file = args.config +def do_it(config_file, outfile): # test if the configuration file exists if not os.path.isfile(config_file): @@ -48,10 +35,28 @@ def main(src_dir): expr.load(f"{util.get_save_name()}") expr.fill_tests() expr.extract_test_feats() - expr.predict_test_and_save(args.outfile) + result = expr.predict_test_and_save(outfile) print("DONE") + return result, 0 + + +def main(src_dir): + parser = argparse.ArgumentParser(description="Call the nkululeko TEST framework.") + parser.add_argument("--config", default="exp.ini", help="The base configuration") + parser.add_argument( + "--outfile", + default="my_results.csv", + help="File name to store the predictions", + ) + args = parser.parse_args() + if args.config is not None: + config_file = args.config + else: + config_file = f"{src_dir}/exp.ini" + do_it(config_file, args.outfile) + if __name__ == "__main__": cwd = os.path.dirname(os.path.abspath(__file__)) diff --git a/nkululeko/test_predictor.py b/nkululeko/test_predictor.py index dc9a88f2..0cfb68a5 100644 --- a/nkululeko/test_predictor.py +++ b/nkululeko/test_predictor.py @@ -1,21 +1,25 @@ -""" test_predictor.py +"""test_predictor.py. + Predict targets from a model and save as csv file. 
""" -import nkululeko.glob_conf as glob_conf -from nkululeko.utils.util import Util +import ast + +import numpy as np import pandas as pd +from sklearn.preprocessing import LabelEncoder + from nkululeko.data.dataset import Dataset from nkululeko.feature_extractor import FeatureExtractor +import nkululeko.glob_conf as glob_conf from nkululeko.scaler import Scaler -import numpy as np -from sklearn.preprocessing import LabelEncoder +from nkululeko.utils.util import Util -class Test_predictor: +class TestPredictor: def __init__(self, model, orig_df, labenc, name): - """Constructor setting up name and configuration""" + """Constructor setting up name and configuration.""" self.model = model self.orig_df = orig_df self.label_encoder = labenc @@ -25,6 +29,7 @@ def __init__(self, model, orig_df, labenc, name): def predict_and_store(self): label_data = self.util.config_val("DATA", "label_data", False) + result = 0 if label_data: data = Dataset(label_data) data.load() @@ -49,7 +54,15 @@ def predict_and_store(self): df[self.target] = labelenc.inverse_transform(predictions.tolist()) df.to_csv(self.name) else: + test_dbs = ast.literal_eval(glob_conf.config["DATA"]["tests"]) + test_dbs_string = "_".join(test_dbs) predictions = self.model.get_predictions() + report = self.model.predict() + result = report.result.get_result() + report.set_filename_add(f"test-{test_dbs_string}") + self.util.print_best_results([report]) + report.plot_confmatrix(self.util.get_plot_name(), 0) + report.print_results(0) # print(predictions) # df = pd.DataFrame(index=self.orig_df.index) # df["speaker"] = self.orig_df["speaker"] @@ -63,3 +76,4 @@ def predict_and_store(self): df = df.rename(columns={"class_label": target}) df.to_csv(self.name) self.util.debug(f"results stored in {self.name}") + return result diff --git a/nkululeko/test_pretrain.py b/nkululeko/test_pretrain.py new file mode 100644 index 00000000..0b94840e --- /dev/null +++ b/nkululeko/test_pretrain.py @@ -0,0 +1,117 @@ +# test_pretrain.py 
+import argparse +import configparser +import os.path + +import datasets +import numpy as np +import pandas as pd +import torch +import transformers + +import audeer +import audiofile + +from nkululeko.constants import VERSION +import nkululeko.experiment as exp +import nkululeko.glob_conf as glob_conf +from nkululeko.utils.util import Util + + +def doit(config_file): + # test if the configuration file exists + if not os.path.isfile(config_file): + print(f"ERROR: no such file: {config_file}") + exit() + + # load one configuration per experiment + config = configparser.ConfigParser() + config.read(config_file) + + # create a new experiment + expr = exp.Experiment(config) + module = "test_pretrain" + expr.set_module(module) + util = Util(module) + util.debug( + f"running {expr.name} from config {config_file}, nkululeko version" + f" {VERSION}" + ) + + if util.config_val("EXP", "no_warnings", False): + import warnings + + warnings.filterwarnings("ignore") + + # load the data + expr.load_datasets() + + # split into train and test + expr.fill_train_and_tests() + util.debug(f"train shape : {expr.df_train.shape}, test shape:{expr.df_test.shape}") + + sampling_rate = 16000 + max_duration_sec = 8.0 + + model_path = "facebook/wav2vec2-large-robust-ft-swbd-300h" + num_layers = None + + batch_size = 16 + accumulation_steps = 4 + + # create dataset + + dataset = {} + data_sources = { + "train": pd.DataFrame(expr.df_train[glob_conf.target]), + "dev": pd.DataFrame(expr.df_test[glob_conf.target]), + } + + for split in ["train", "dev"]: + + y = pd.Series( + data=data_sources[split].itertuples(index=False, name=None), + index=data_sources[split].index, + dtype=object, + name="labels", + ) + + y.name = "targets" + df = y.reset_index() + df.start = df.start.dt.total_seconds() + df.end = df.end.dt.total_seconds() + print(f"{split}: {len(df)}") + ds = datasets.Dataset.from_pandas(df) + dataset[split] = ds + + dataset = datasets.DatasetDict(dataset) + + config = 
transformers.AutoConfig.from_pretrained( + model_path, + num_labels=len(util.la), + label2id=data.gender_mapping, + id2label=data.gender_mapping_reverse, + finetuning_task="age-gender", + ) + if num_layers is not None: + config.num_hidden_layers = num_layers + setattr(config, "sampling_rate", sampling_rate) + setattr(config, "data", ",".join(sources)) + + print("DONE") + + +def main(src_dir): + parser = argparse.ArgumentParser(description="Call the nkululeko framework.") + parser.add_argument("--config", default="exp.ini", help="The base configuration") + args = parser.parse_args() + if args.config is not None: + config_file = args.config + else: + config_file = f"{src_dir}/exp.ini" + doit(config_file) + + +if __name__ == "__main__": + cwd = os.path.dirname(os.path.abspath(__file__)) + main(cwd) # use this if you want to state the config file path on command line diff --git a/nkululeko/utils/util.py b/nkululeko/utils/util.py index 5400f594..8815f55d 100644 --- a/nkululeko/utils/util.py +++ b/nkululeko/utils/util.py @@ -1,10 +1,13 @@ # util.py -import pandas as pd import ast +import configparser +import os.path +import pickle import sys + import numpy as np -import os.path -import configparser +import pandas as pd + import audeer import audformat @@ -295,6 +298,28 @@ def print_best_results(self, best_reports): f" {vals.argmax()}" ) + def exist_pickle(self, name): + store = self.get_path("store") + name = "/".join([store, name]) + ".pkl" + if os.path.isfile(name): + return True + return False + + def to_pickle(self, anyobject, name): + store = self.get_path("store") + name = "/".join([store, name]) + ".pkl" + self.debug(f"saving {name}") + with open(name, "wb") as handle: + pickle.dump(anyobject, handle) + + def from_pickle(self, name): + store = self.get_path("store") + name = "/".join([store, name]) + ".pkl" + self.debug(f"loading {name}") + with open(name, "rb") as handle: + any_opject = pickle.load(handle) + return any_opject + def write_store(self, df, storage, 
format): if format == "pkl": df.to_pickle(storage) diff --git a/requirements.txt b/requirements.txt index 2f92cd3d..d5a52a66 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,9 +19,12 @@ scikit_learn scipy seaborn sounddevice +splitutils # tensorflow # tensorflow_hub torch +torchaudio +torchvision # transformers xgboost umap-learn diff --git a/test_runs.sh b/test_runs.sh index cf56051c..f5ac9b16 100755 --- a/test_runs.sh +++ b/test_runs.sh @@ -39,6 +39,8 @@ function Nkulu { # test features function Feat { python -m nkululeko.nkululeko --config tests/exp_emodb_hubert_xgb.ini + python -m nkululeko.nkululeko --config tests/exp_emodb_audmodel_xgb.ini + python -m nkululeko.nkululeko --config tests/exp_emodb_wavlm_xgb.ini python -m nkululeko.nkululeko --config tests/exp_emodb_whisper_xgb.ini } # test augmentation @@ -73,6 +75,10 @@ function Demo { function Test { python -m nkululeko.nkululeko --config tests/exp_emodb_os_xgb_test.ini python -m nkululeko.test --config tests/exp_emodb_os_xgb_test.ini + python -m nkululeko.nkululeko --config tests/exp_emodb_trill_test.ini + python -m nkululeko.test --config tests/exp_emodb_trill_test.ini + python -m nkululeko.nkululeko --config tests/exp_emodb_wav2vec2_test.ini + python -m nkululeko.test --config tests/exp_emodb_wav2vec2_test.ini } # test multidb function Multi { @@ -82,8 +88,9 @@ function Multi { function Spot { python -m nkululeko.explore --config tests/exp_explore.ini } - -Help +if [ $# -eq 0 ] || [ "$1" == "--help" ]; then + Help +fi for arg in "$@"; do if [[ "$arg" = --Explore ]] || [[ "$arg" = --all ]]; then Explore diff --git a/tests/data_roots.ini b/tests/data_roots.ini index b1cf39d5..c20341c0 100644 --- a/tests/data_roots.ini +++ b/tests/data_roots.ini @@ -4,7 +4,8 @@ emodb.split_strategy = specified emodb.test_tables = ['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 
'neutral':'neutral'} -crema-d = ./data/crema-d/crema-d/1.3.0/fe182b91/ +; crema-d = ./data/crema-d/crema-d/1.3.0/fe182b91/ +crema-d = ./data/crema-d/crema-d/1.3.0/d3b62a9b/ crema-d.split_strategy = specified crema-d.colnames = {'sex':'gender'} crema-d.files_table = ['files'] diff --git a/tests/emodb_demo.ini b/tests/emodb_demo.ini index 451b41c0..a60c2a37 100644 --- a/tests/emodb_demo.ini +++ b/tests/emodb_demo.ini @@ -6,7 +6,7 @@ epochs = 100 save = True [DATA] databases = ['emodb'] -root_folders = data_roots.ini +root_folders = ./tests/data_roots.ini tests = ['testdb'] testdb = data/test/samples.csv testdb.type = csv diff --git a/tests/exp_emodb_audmodel_xgb.ini b/tests/exp_emodb_audmodel_xgb.ini index a4cd5009..e4ae2a9c 100644 --- a/tests/exp_emodb_audmodel_xgb.ini +++ b/tests/exp_emodb_audmodel_xgb.ini @@ -1,6 +1,6 @@ [EXP] root = ./tests/results/ -name = exp_emodb_audmodel +name = exp_emodb_audmodel_xgb runs = 1 epochs = 1 save = True @@ -8,7 +8,7 @@ save = True databases = ['emodb'] emodb = ./data/emodb/emodb emodb.split_strategy = random -emodb.limit_samples = 50 +emodb.limit_samples = 200 emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 'neutral':'neutral'} labels = ['angry', 'happy', 'neutral', 'sad'] target = emotion diff --git a/tests/exp_emodb_os_knn.ini b/tests/exp_emodb_os_knn.ini index b36a7c57..c57cd0f0 100644 --- a/tests/exp_emodb_os_knn.ini +++ b/tests/exp_emodb_os_knn.ini @@ -11,9 +11,11 @@ emodb.split_strategy = specified emodb.test_tables = ['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] target = emotion +labels = ['anger', 'happiness'] [FEATS] type = ['os'] store_format = csv scale = standard [MODEL] type = knn +save = True diff --git a/tests/exp_emodb_os_mlp.ini b/tests/exp_emodb_os_mlp.ini index ffc349d5..8eede178 100644 --- a/tests/exp_emodb_os_mlp.ini +++ b/tests/exp_emodb_os_mlp.ini @@ -11,6 +11,7 @@ emodb.split_strategy = specified emodb.test_tables = 
['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] target = emotion +labels = ['anger', 'happiness'] [FEATS] type = ['os'] scale = standard @@ -19,6 +20,7 @@ type = mlp layers = {'l1':128, 'l2':16} drop = .4 patience = 5 +save = True [PLOT] best_model = True epoch_progression = True diff --git a/tests/exp_emodb_os_svm.ini b/tests/exp_emodb_os_svm.ini index c24a1720..d9877ab3 100644 --- a/tests/exp_emodb_os_svm.ini +++ b/tests/exp_emodb_os_svm.ini @@ -1,8 +1,6 @@ [EXP] root = ./tests/results/ name = exp_emodb_classifiers -runs = 1 -epochs = 10 save = True [DATA] databases = ['emodb'] @@ -11,9 +9,12 @@ emodb.split_strategy = specified emodb.test_tables = ['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] target = emotion +labels = ['anger', 'happiness'] [FEATS] type = ['os'] -store_format = csv scale = standard [MODEL] type = svm +tuning_params = ['C'] +scoring = recall_macro +C = [10, 1, 0.1, 0.01, 0.001, 0.0001] diff --git a/tests/exp_emodb_os_xgb.ini b/tests/exp_emodb_os_xgb.ini index c2ac7053..7031d940 100644 --- a/tests/exp_emodb_os_xgb.ini +++ b/tests/exp_emodb_os_xgb.ini @@ -1,6 +1,6 @@ [EXP] root = ./tests/results/ -name = exp_emodb +name = exp_emodb_classifiers runs = 1 epochs = 10 save = True @@ -10,8 +10,7 @@ emodb = ./data/emodb/emodb emodb.split_strategy = specified emodb.test_tables = ['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] -emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 'neutral':'neutral'} -labels = ['angry', 'happy', 'neutral', 'sad'] +labels = ['anger', 'happiness'] target = emotion [FEATS] type = ['os'] diff --git a/tests/exp_emodb_os_xgb_logo.ini b/tests/exp_emodb_os_xgb_logo.ini index 464e4602..aa84fec0 100644 --- a/tests/exp_emodb_os_xgb_logo.ini +++ b/tests/exp_emodb_os_xgb_logo.ini @@ -1,6 +1,6 @@ [EXP] root = ./tests/results/ -name = exp_emodb 
+name = exp_emodb_logo runs = 1 epochs = 1 save = True @@ -10,9 +10,8 @@ emodb = ./data/emodb/emodb emodb.split_strategy = specified emodb.test_tables = ['emotion.categories.test.gold_standard'] emodb.train_tables = ['emotion.categories.train.gold_standard'] -emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 'neutral':'neutral'} -labels = ['angry', 'happy', 'neutral', 'sad'] target = emotion +labels = ['anger', 'happiness'] [FEATS] type = ['os'] store_format = csv diff --git a/tests/exp_emodb_os_xgb_test.ini b/tests/exp_emodb_os_xgb_test.ini index e0c86f7e..b597d160 100644 --- a/tests/exp_emodb_os_xgb_test.ini +++ b/tests/exp_emodb_os_xgb_test.ini @@ -7,7 +7,7 @@ databases = ['emodb'] root_folders = tests/data_roots.ini target = emotion tests = ['crema-d'] -labels = ['angry', 'happy', 'neutral', 'sad'] +labels = ['angry', 'happy'] no_reuse = True [FEATS] type = ['os'] diff --git a/tests/exp_emodb_shap.ini b/tests/exp_emodb_shap.ini new file mode 100644 index 00000000..50f6bb12 --- /dev/null +++ b/tests/exp_emodb_shap.ini @@ -0,0 +1,28 @@ +[EXP] +root = ./tests/results/ +name = exp_emodb_shap +runs = 1 +epochs = 500 +save = True +[DATA] +databases = ['emodb'] +emodb = ./data/emodb/emodb +emodb.split_strategy = specified +emodb.test_tables = ['emotion.categories.test.gold_standard'] +emodb.train_tables = ['emotion.categories.train.gold_standard'] +target = emotion +labels = ['anger', 'happiness'] +[FEATS] +type = ['os'] +scale = standard +[MODEL] +type = mlp +layers = {'l1':128, 'l2':16} +drop = .4 +patience = 5 +[EXPL] +shap = True +sample_selection = test +[PLOT] +best_model = True +epoch_progression = True diff --git a/tests/exp_emodb_trill_test.ini b/tests/exp_emodb_trill_test.ini new file mode 100644 index 00000000..38a2ce51 --- /dev/null +++ b/tests/exp_emodb_trill_test.ini @@ -0,0 +1,16 @@ +[EXP] +root = ./tests/results/ +name = exp_testmodule +save = True +[DATA] +databases = ['emodb'] +root_folders = tests/data_roots.ini +target = 
emotion +tests = ['crema-d'] +labels = ['angry', 'happy'] +no_reuse = True +[FEATS] +type = ['trill'] +[MODEL] +type = xgb +save = True diff --git a/tests/exp_emodb_wav2vec2_test.ini b/tests/exp_emodb_wav2vec2_test.ini new file mode 100644 index 00000000..f2051b51 --- /dev/null +++ b/tests/exp_emodb_wav2vec2_test.ini @@ -0,0 +1,16 @@ +[EXP] +root = ./tests/results/ +name = exp_testmodule +save = True +[DATA] +databases = ['emodb'] +root_folders = tests/data_roots.ini +target = emotion +tests = ['crema-d'] +labels = ['angry', 'happy'] +no_reuse = True +[FEATS] +type = ['wav2vec2'] +[MODEL] +type = xgb +save = True diff --git a/tests/exp_emodb_wav2vec_xgb.ini b/tests/exp_emodb_wav2vec_xgb.ini new file mode 100644 index 00000000..82906e32 --- /dev/null +++ b/tests/exp_emodb_wav2vec_xgb.ini @@ -0,0 +1,18 @@ +[EXP] +root = ./tests/results/ +name = exp_emodb_feats +runs = 1 +epochs = 1 +save = True +[DATA] +databases = ['emodb'] +emodb = ./data/emodb/emodb +emodb.test_tables = ['emotion.categories.test.gold_standard'] +emodb.train_tables = ['emotion.categories.train.gold_standard'] +emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 'neutral':'neutral'} +labels = ['angry', 'happy'] +target = emotion +[FEATS] +type = ['wav2vec2'] +[MODEL] +type = xgb diff --git a/tests/exp_emodb_wavlm_xgb.ini b/tests/exp_emodb_wavlm_xgb.ini new file mode 100644 index 00000000..4db1a5c3 --- /dev/null +++ b/tests/exp_emodb_wavlm_xgb.ini @@ -0,0 +1,18 @@ +[EXP] +root = ./tests/results/ +name = exp_emodb_feats +runs = 1 +epochs = 1 +save = True +[DATA] +databases = ['emodb'] +emodb = ./data/emodb/emodb +emodb.test_tables = ['emotion.categories.test.gold_standard'] +emodb.train_tables = ['emotion.categories.train.gold_standard'] +emodb.mapping = {'anger':'angry', 'happiness':'happy', 'sadness':'sad', 'neutral':'neutral'} +labels = ['angry', 'happy'] +target = emotion +[FEATS] +type = ['wavlm-base-plus'] +[MODEL] +type = xgb