Commit
add test and update finetuning
ostix360 committed May 24, 2024
1 parent a951ed7 commit cab12ac
Showing 12 changed files with 367 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -167,6 +167,6 @@ test.py

# remove training dir
training/
model_fintuned/
model_finetuned/

.idea/
2 changes: 1 addition & 1 deletion dataset/dataset.py
@@ -73,7 +73,7 @@ def load(self) -> datasets.dataset_dict.DatasetDict:
}
)

test_size = min(int(len(dataset) * 0.2), 2000)
test_size = min(int(len(dataset) * 0.1), 2000)
dataset = dataset.train_test_split(test_size=test_size)

return dataset
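For context, a quick sketch (not part of the commit) of what the new 10% split with its 2,000-example cap works out to at a few assumed dataset sizes:

for n in (1_000, 5_000, 50_000):          # assumed dataset sizes, for illustration only
    test_size = min(int(n * 0.1), 2000)
    print(n, "->", test_size)             # 1000 -> 100, 5000 -> 500, 50000 -> 2000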
39 changes: 32 additions & 7 deletions dataset/llama3_dataset.py
@@ -23,7 +23,6 @@
A module that contains the Llama3 dataset class.
"""


import os
import random
import typing
@@ -36,8 +35,23 @@


def _setup_model() -> typing.Tuple[AutoModelForCausalLM, AutoTokenizer]:
model = AutoModelForCausalLM.from_pretrained(
"unsloth/llama-3-8b-Instruct-bnb-4bit"
# model = AutoModelForCausalLM.from_pretrained(
# "unsloth/llama-3-8b-Instruct-bnb-4bit"
# )
#
# tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")
# model.generation_config.pad_token_ids = tokenizer.pad_token_id
llm = Llama.from_pretrained(
repo_id="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
filename="*Q4_K_M.gguf",
flash_attn=True,
n_gpu_layers=-1,
n_ctx=2000,
use_mlock=False,
verbose=False,
# draft_model=LlamaPromptLookupDecoding(
# max_ngram_size=3, num_pred_tokens=5
# ), # boost?
)

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")
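For reference, a minimal sketch (not from this commit) of how a llama_cpp.Llama handle like the one built in _setup_model is typically driven; the chat messages below are placeholders:

from llama_cpp import Llama  # llama-cpp-python

llm = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,  # offload every layer to the GPU when one is available
    n_ctx=2000,
    verbose=False,
)

# create_chat_completion applies the model's chat template to the messages
out = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You write song lyrics."},          # placeholder prompt
        {"role": "user", "content": "Write a short chorus about rain."},  # placeholder prompt
    ],
    max_tokens=512,
    temperature=0.9,
)
print(out["choices"][0]["message"]["content"])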
@@ -62,7 +76,11 @@ class Llama3Dataset(LyricsDataset):
The lyrics should be generated based on the word "{WORD}" and have to contain the word at the 5th word of the lyrics.
The number of chorus to generate is {NB_CHORUS}.
The number of verse to generate is {NB_VERSE}.
The number of bridge to generate is {NB_BRIDGE}.
The number of bridges to generate is {NB_BRIDGE}; place them wherever you want.
You start with a {START}.
The number of lines for the VERSES is {VERSE_LINES}.
The number of lines for the CHORUS is {CHORUS_LINES}.
{ONOMATOPOEIA}
"""

def __init__(self, path: str) -> None:
@@ -111,6 +129,9 @@ def generate_dataset(self, nb_gen: int = 100) -> None:
for i in range(nb_gen):
# add word to obtain different lyrics each time
word = " ".join(self._word_generator.get_random_word() for _ in range(1))
start = "CHORUS" if random.random() >= 0.5 else "VERSE"
onomatopoeia = ("Add some (not too much) onomatopoeia to the lyrics (in verse, chorus, and out verse, "
"chorus and bridge).") if random.random() >= 0.75 else ""
messages = [
{
"role": "system",
@@ -121,9 +142,13 @@
"content": self.PROMPT.format(
TAGS=tags,
WORD=word,
NB_CHORUS=random.randint(2, 4),
NB_VERSE=random.randint(2, 4),
NB_BRIDGE=random.randint(0, 1)
NB_CHORUS=random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0],
NB_VERSE=random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0],
NB_BRIDGE=random.randint(0, 1),
START=start,
VERSE_LINES=random.randint(2, 12),
CHORUS_LINES=random.randint(2, 12),
ONOMATOPOEIA=onomatopoeia,
),
},
]
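As an aside, a tiny self-contained check (not part of the commit) of what the weighted draw used for NB_CHORUS and NB_VERSE produces; note that random.choices returns a list, hence the [0]:

import random

counts = {k: 0 for k in (1, 2, 3, 4)}
for _ in range(10_000):
    counts[random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0]] += 1
print(counts)  # roughly 10% / 20% / 30% / 40% of the draws land on 1 / 2 / 3 / 4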
7 changes: 7 additions & 0 deletions evaluate_script.py
@@ -0,0 +1,7 @@
"""
This script is used to evaluate the model.
"""

from model.eval import evaluator

print(evaluator.evaluate())
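For what it is worth, Trainer.evaluate() returns a plain dict, so the script could also consume it field by field; a hedged sketch (any metric names beyond eval_loss depend on utils.compute_metrics, which is not part of this diff):

metrics = evaluator.evaluate()
print(metrics["eval_loss"])          # reported whenever the eval set has labels
for name, value in metrics.items():  # extra keys come from compute_metrics
    print(f"{name}: {value}")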
5 changes: 3 additions & 2 deletions finetune.py
@@ -2,7 +2,8 @@
This script is used to fine-tune the model on the dataset.
"""

from model.finetune import trainer
from model.finetune import trainer, tokenizer

trainer.train()
trainer.save_model("model_fintuned") # Save the model
trainer.save_model("model_finetuned") # Save the model
tokenizer.save_pretrained("model_finetuned") # Save the tokenizer
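A minimal sketch (not in the repo) of reloading the artifacts saved above; it assumes the flan-t5 seq2seq setup from model/finetune.py and uses a placeholder input string:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("model_finetuned")
tokenizer = AutoTokenizer.from_pretrained("model_finetuned")

inputs = tokenizer("some lyrics without structure tags", return_tensors="pt")  # placeholder input
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))  # keep the [CHORUS n]/[VERSE n] tags visible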
2 changes: 1 addition & 1 deletion generate_lyric.py
@@ -5,7 +5,7 @@
from dataset.llama3_dataset import Llama3Dataset

dataset = Llama3Dataset("data/")
# dataset.generate_dataset()
dataset.generate_dataset()
data = dataset.load()

for d in data["test"]:
48 changes: 48 additions & 0 deletions model/eval.py
@@ -0,0 +1,48 @@
"""
This script is used to evaluate the model on the test set.
"""
import torch
from transformers import TrainingArguments, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer

import dataset.llama3_dataset

from model import utils


data = dataset.llama3_dataset.Llama3Dataset("data/").load()

t5model = AutoModelForSeq2SeqLM.from_pretrained("./model_finetuned")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
def _tokenize_function(examples):
    return tokenizer(
        examples["lyrics_no_tags"],
        text_target=examples["lyrics"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

ds = data.map(_tokenize_function)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5model)
utils.TOKENIZER = tokenizer

BATCH_SIZE = 8

training_args = TrainingArguments(
    output_dir="eval",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    bf16_full_eval=torch.cuda.is_bf16_supported(),
    fp16_full_eval=not torch.cuda.is_bf16_supported(),
)

evaluator = Trainer(
    model=t5model,
    args=training_args,
    eval_dataset=ds["test"],
    data_collator=data_collator,
    compute_metrics=utils.compute_metrics,
)
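model/utils.py is not shown in this diff, so the exact metric is unknown; purely as an illustration, a compute_metrics with the right shape for this Trainer could look like the following (assuming a ROUGE-style text metric via the evaluate library):

import evaluate
import numpy as np

rouge = evaluate.load("rouge")  # assumed metric; needs the rouge_score package

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # A plain Trainer passes logits, not generated ids, so take the argmax first
    predictions = np.argmax(predictions, axis=-1)
    # Labels use -100 for padding; swap it for the pad token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)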
27 changes: 25 additions & 2 deletions model/finetune.py
@@ -29,16 +29,38 @@
Trainer,
TrainingArguments, DataCollatorForSeq2Seq,
)
from peft import (
get_peft_model,
LoraConfig,
TaskType,
)

import dataset.llama3_dataset
from model import utils
from model.patch import patch

data = dataset.llama3_dataset.Llama3Dataset("data/").load()

patch()

data = dataset.llama3_dataset.Llama3Dataset("data/").load()

peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=32, lora_alpha=64, lora_dropout=0.1
)
t5model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
special_tokens_to_add = []
for i in range(1, 5):
special_tokens_to_add.append(f"[CHORUS {i}]")
special_tokens_to_add.append(f"[VERSE {i}]")
special_tokens_to_add.append("[BRIDGE 1]")
special_tokens_to_add.append("\n")
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
t5model.resize_token_embeddings(len(tokenizer))
# t5model = get_peft_model(t5model, peft_config)
# t5model.print_trainable_parameters()


def _tokenize_function(examples):
return tokenizer(
examples["lyrics_no_tags"],
@@ -48,6 +70,7 @@ def _tokenize_function(examples):
max_length=512,
)


ds = data.map(_tokenize_function)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5model)
utils.TOKENIZER = tokenizer
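As a quick illustration (not from the commit) of why resize_token_embeddings follows add_special_tokens in model/finetune.py: the vocabulary grows, and each structure tag maps to a single new id:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

before = len(tokenizer)
tokenizer.add_special_tokens({"additional_special_tokens": ["[CHORUS 1]", "[VERSE 1]"]})
model.resize_token_embeddings(len(tokenizer))  # the embedding matrix must match the larger vocab
print(before, "->", len(tokenizer))            # grew by the two added tags
print(tokenizer("[CHORUS 1]").input_ids)       # a single new id for the tag, plus T5's EOS token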
