add EAED dataset

bagustris · May 15, 2024 · 8cdb6dd · 8cdb6dd
1 parent 330093e
commit 8cdb6dd
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 1 deletion.
diff --git a/data/eaed/README.md b/data/eaed/README.md
@@ -0,0 +1,24 @@
+# Nkululeko pre-processing for EAED dataset
+
+## Dataset description
+
+EAED is an Egyptian-Arabic emotional speech dataset containing 3,614 audio files. The dataset is semi-natural, as it was collected from five well-known Egyptian TV series. Each audio file ranges in length from 1 to 8 seconds, depending on the time needed to complete the given sentence. The dataset contains six different emotions: happy, sad, angry, neutral, surprised, and fearful. All audio files were recorded using the open-source Audacity software at a sampling rate of 44.1 kHz. Four different human labelers were assigned to listen to the recorded audio files in order to annotate/label them. Then, a fifth labeler was assigned the task of tie-breaking. The number of speakers in the dataset is 79, including 37 males and 42 females.
+
+Instructions:
+The audio files for each series are grouped in a separate folder. Each folder consists of multiple folders, one for each actor/actress in the series. In each actor/actress folder, the audio files are named in the following convention: AA_BB_ CC.wav
+
+AA : Actor unique ID
+
+BB : the emotion label
+
+CC : unique number inside this folder
+
+Example: NellyKarim_happy_ 01.wav is a file in a folder that belongs to an actress whose name is Nelly Karim and the emotion being conveyed is happy.
+
+## Pre-processing command
+
+```bash
+python3 process_database.py --data_dir ./EAED/ --output_dir ./
+```
+
+Reference:  
+[1]  
diff --git a/data/eaed/exp.ini b/data/eaed/exp.ini
@@ -0,0 +1,30 @@
+[EXP]
+root = /tmp/results/
+name = exp_eaed_hubert_knn_big4
+[DATA]
+databases = ['train', 'dev', 'test']
+train = ./data/eaed/eaed_train.csv
+train.type = csv
+train.absolute_path = False
+train.split_strategy = train
+dev = ./data/eaed/eaed_dev.csv
+dev.type = csv
+dev.absolute_path = False
+dev.split_strategy = train
+test = ./data/eaed/eaed_test.csv
+test.type = csv
+test.absolute_path = False
+test.split_strategy = test
+target = emotion
+labels = ['angry', 'neutral', 'sad', 'happy']
+; get the number of classes from the target column automatically
+[FEATS]
+; type = ['os']
+type = ['hubert-xlarge-ll60k']
+; no_reuse = False
+scale = standard
+[MODEL]
+type = knn
+; save = True
+[RESAMPLE]
+replace = True
diff --git a/data/eaed/process_database.py b/data/eaed/process_database.py
@@ -0,0 +1,64 @@
+""" process_database.py for EAED dataset
+
+file name format: <speaker>_<emotion>_<(number)>.wav
+"""
+
+import pandas as pd
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+import argparse
+from nkululeko.utils.files import find_files
+
+
+def process_database(data_dir, output_dir):
+    # check if data_dir exists
+    if not Path(data_dir).is_dir():
+        print(f"ERROR: no such directory {data_dir}")
+        return
+    # create output dir if not exist
+    if not Path(output_dir).is_dir():
+        Path(output_dir).mkdir()
+
+    # read all wav files
+    wavs = find_files(data_dir, relative=True, ext=["wav"])
+    print(f"Found {len(wavs)} wav files.")
+
+    # building dataframe from wavs list
+    df = pd.DataFrame(wavs, columns=["file"])
+    df["file"] = df["file"].apply(lambda x: str(x))
+
+    # get emotion from file basename, make all smallcase
+    df["emotion"] = df["file"].apply(lambda x: x.split("_")[1].lower())
+
+    # get speaker from file basename, firs string before _
+    df["speaker"] = df["file"].apply(lambda x: Path(x).name.split("_")[0])
+
+    # add language = arabic
+    df["language"] = "arabic"
+
+    # make speaker independent partition
+    speakers = df["speaker"].unique()
+    train_speakers, dev_speakers = train_test_split(speakers, test_size=0.2)
+    dev_speakers, test_speakers = train_test_split(dev_speakers, test_size=0.5)
+
+    # loop over train, dev, and test and save as csv
+    for set_name in ["train", "dev", "test"]:
+        df_set = df[df["speaker"].isin(eval(f"{set_name}_speakers"))]
+        df_set.to_csv(Path(output_dir, f"eaed_{set_name}.csv"), index=False)
+        print(
+            f"Saved {len(df_set)} samples to {Path(output_dir, f'eaed_{set_name}.csv')}"
+        )
+
+    print("DONE")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process EAED dataset")
+    parser.add_argument(
+        "--data_dir", type=str, default="./EAED/", help="Path to the EAED dataset"
+    )
+    parser.add_argument(
+        "--output_dir", type=str, default="./", help="Path to the output directory"
+    )
+    args = parser.parse_args()
+    process_database(args.data_dir, args.output_dir)
diff --git a/data/nemo/exp.ini b/data/nemo/exp.ini
@@ -25,7 +25,7 @@ labels = ['anger', 'neutral', 'sadness', 'happiness']
 type = ['audmodel']
 ; type = ['hubert-xlarge-ll60k']
 ; no_reuse = False
-; scale = standard
+scale = standard
 [MODEL]
 type = knn
 ; save = True