From 8cdb6dd3d56f53a6135c94d56f8f128211f716b5 Mon Sep 17 00:00:00 2001
From: Bagus Tris Atmaja <bagustris@outlook.com>
Date: Wed, 15 May 2024 16:59:31 +0900
Subject: [PATCH] add EAED dataset

---
 data/eaed/README.md           | 24 +++++++++++++
 data/eaed/exp.ini             | 30 ++++++++++++++++
 data/eaed/process_database.py | 64 +++++++++++++++++++++++++++++++++++
 data/nemo/exp.ini             |  2 +-
 4 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 data/eaed/README.md
 create mode 100644 data/eaed/exp.ini
 create mode 100644 data/eaed/process_database.py
diff --git a/data/eaed/README.md b/data/eaed/README.md
new file mode 100644
index 00000000..c4c95581
--- /dev/null
+++ b/data/eaed/README.md
@@ -0,0 +1,24 @@
+# Nkululeko pre-processing for EAED dataset
+
+## Dataset description
+
+EAED is an Egyptian-Arabic emotional speech dataset containing 3,614 audio files. The dataset is semi-natural, as it was collected from five well-known Egyptian TV series. Each audio file ranges in length from 1 to 8 seconds, depending on the completion time of the given sentence. The dataset contains six different emotions: happy, sad, angry, neutral, surprised, and fearful. All audio files were recorded using the open-source Audacity software at a sampling rate of 44.1 kHz. Four different human labelers listened to the recorded audio files in order to annotate/label them. Then, a fifth labeler was assigned the task of tie-breaking. The number of speakers in the dataset is 79, including 37 males and 42 females.
+
+Instructions:
+The audio files for each series are grouped in a separate folder. Each folder consists of multiple folders, one for each actor/actress in the series. In each actor/actress folder, the audio files are named in the following convention: AA_BB_ CC.wav
+
+AA : Actor unique ID
+
+BB : the emotion label
+
+CC : unique number inside this folder
+
+Example: NellyKarim_happy_ 01.wav is a file in a folder that belongs to an actress whose name is Nelly Karim and the emotion being conveyed is happy.
+
+## Pre-processing command
+
+```bash
+```
+
+Reference:  
+[1]  
diff --git a/data/eaed/exp.ini b/data/eaed/exp.ini
new file mode 100644
index 00000000..83558752
--- /dev/null
+++ b/data/eaed/exp.ini
@@ -0,0 +1,30 @@
+[EXP]
+root = /tmp/results/
+name = exp_eaed_hubert_knn_big4
+[DATA]
+databases = ['train', 'dev', 'test']
+train = ./data/eaed/eaed_train.csv
+train.type = csv
+train.absolute_path = False
+train.split_strategy = train
+dev = ./data/eaed/eaed_dev.csv
+dev.type = csv
+dev.absolute_path = False
+dev.split_strategy = train
+test = ./data/eaed/eaed_test.csv
+test.type = csv
+test.absolute_path = False
+test.split_strategy = test
+target = emotion
+labels = ['angry', 'neutral', 'sad', 'happy']
+; get the number of classes from the target column automatically
+[FEATS]
+; type = ['os']
+type = ['hubert-xlarge-ll60k']
+; no_reuse = False
+scale = standard
+[MODEL]
+type = knn
+; save = True
+[RESAMPLE]
+replace = True
\ No newline at end of file
diff --git a/data/eaed/process_database.py b/data/eaed/process_database.py
new file mode 100644
index 00000000..4238a1e2
--- /dev/null
+++ b/data/eaed/process_database.py
@@ -0,0 +1,64 @@
+""" process_database.py for EAED dataset
+
+file name format: <speaker>_<emotion>_<(number)>.wav
+"""
+
+import pandas as pd
+from pathlib import Path
+from sklearn.model_selection import train_test_split
+import argparse
+from nkululeko.utils.files import find_files
+
+
+def process_database(data_dir, output_dir):
+    """Write speaker-independent train/dev/test CSV splits for EAED.
+
+    Args:
+        data_dir: dataset root, passed to find_files to collect wav files.
+        output_dir: directory receiving eaed_{train,dev,test}.csv;
+            created (including parents) when it does not exist.
+    """
+    if not Path(data_dir).is_dir():
+        print(f"ERROR: no such directory {data_dir}")
+        return
+    # also create missing parent directories of a nested output path
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # collect all wav files and build the dataframe from them
+    wavs = find_files(data_dir, relative=True, ext=["wav"])
+    print(f"Found {len(wavs)} wav files.")
+    df = pd.DataFrame(wavs, columns=["file"])
+    df["file"] = df["file"].apply(str)
+
+    # parse <speaker>_<emotion>_<number>.wav from the basename only, so
+    # underscores in series/actor folder names cannot shift the fields
+    df["emotion"] = df["file"].apply(lambda x: Path(x).name.split("_")[1].lower())
+    df["speaker"] = df["file"].apply(lambda x: Path(x).name.split("_")[0])
+    df["language"] = "arabic"
+
+    # speaker-independent partition: roughly 80/10/10 % of the speakers
+    speakers = df["speaker"].unique()
+    train_speakers, rest = train_test_split(speakers, test_size=0.2)
+    dev_speakers, test_speakers = train_test_split(rest, test_size=0.5)
+    splits = {"train": train_speakers, "dev": dev_speakers, "test": test_speakers}
+
+    # explicit mapping instead of eval() on constructed variable names
+    for set_name, set_speakers in splits.items():
+        out_file = Path(output_dir, f"eaed_{set_name}.csv")
+        df_set = df[df["speaker"].isin(set_speakers)]
+        df_set.to_csv(out_file, index=False)
+        print(f"Saved {len(df_set)} samples to {out_file}")
+
+    print("DONE")
+
+
+if __name__ == "__main__":
+    # Command-line entry point; both options have local-directory defaults.
+    cli = argparse.ArgumentParser(description="Process EAED dataset")
+    cli.add_argument("--data_dir", type=str, default="./EAED/",
+                     help="Path to the EAED dataset")
+    cli.add_argument("--output_dir", type=str, default="./",
+                     help="Path to the output directory")
+    # Hand the parsed locations to the converter by keyword.
+    options = cli.parse_args()
+    process_database(data_dir=options.data_dir, output_dir=options.output_dir)
diff --git a/data/nemo/exp.ini b/data/nemo/exp.ini
index 4e01a3ce..3daef684 100644
--- a/data/nemo/exp.ini
+++ b/data/nemo/exp.ini
@@ -25,7 +25,7 @@ labels = ['anger', 'neutral', 'sadness', 'happiness']
 type = ['audmodel']
 ; type = ['hubert-xlarge-ll60k']
 ; no_reuse = False
-; scale = standard
+scale = standard
 [MODEL]
 type = knn
 ; save = True