Commit
add test and update finetuning
ostix360 committed May 24, 2024
1 parent a951ed7 commit cab12ac
Showing 12 changed files with 367 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -167,6 +167,6 @@ test.py

# remove training dir
training/
model_fintuned/
model_finetuned/

.idea/
2 changes: 1 addition & 1 deletion dataset/dataset.py
@@ -73,7 +73,7 @@ def load(self) -> datasets.dataset_dict.DatasetDict:
}
)

test_size = min(int(len(dataset) * 0.2), 2000)
test_size = min(int(len(dataset) * 0.1), 2000)
dataset = dataset.train_test_split(test_size=test_size)

return dataset
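For context, a quick sketch (not part of the commit) of what the new 10% split with its 2,000-example cap works out to at a few assumed dataset sizes:

for n in (1_000, 5_000, 50_000):          # assumed dataset sizes, for illustration only
    test_size = min(int(n * 0.1), 2000)
    print(n, "->", test_size)             # 1000 -> 100, 5000 -> 500, 50000 -> 2000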
39 changes: 32 additions & 7 deletions dataset/llama3_dataset.py
@@ -23,7 +23,6 @@
A module that contains the Llama3 dataset class.
"""


import os
import random
import typing
@@ -36,8 +35,23 @@


def _setup_model() -> typing.Tuple[AutoModelForCausalLM, AutoTokenizer]:
model = AutoModelForCausalLM.from_pretrained(
"unsloth/llama-3-8b-Instruct-bnb-4bit"
# model = AutoModelForCausalLM.from_pretrained(
# "unsloth/llama-3-8b-Instruct-bnb-4bit"
# )
#
# tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")
# model.generation_config.pad_token_ids = tokenizer.pad_token_id
llm = Llama.from_pretrained(
repo_id="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
filename="*Q4_K_M.gguf",
flash_attn=True,
n_gpu_layers=-1,
n_ctx=2000,
use_mlock=False,
verbose=False,
# draft_model=LlamaPromptLookupDecoding(
# max_ngram_size=3, num_pred_tokens=5
# ), # boost?
)

tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b-Instruct-bnb-4bit")
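For reference, a minimal sketch (not from this commit) of how a llama_cpp.Llama handle like the one built in _setup_model is typically driven; the chat messages below are placeholders:

from llama_cpp import Llama  # llama-cpp-python

llm = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
    filename="*Q4_K_M.gguf",
    n_gpu_layers=-1,  # offload every layer to the GPU when one is available
    n_ctx=2000,
    verbose=False,
)

# create_chat_completion applies the model's chat template to the messages
out = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You write song lyrics."},          # placeholder prompt
        {"role": "user", "content": "Write a short chorus about rain."},  # placeholder prompt
    ],
    max_tokens=512,
    temperature=0.9,
)
print(out["choices"][0]["message"]["content"])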
@@ -62,7 +76,11 @@ class Llama3Dataset(LyricsDataset):
The lyrics should be generated based on the word "{WORD}" and have to contain the word at the 5th word of the lyrics.
The number of chorus to generate is {NB_CHORUS}.
The number of verse to generate is {NB_VERSE}.
The number of bridge to generate is {NB_BRIDGE}.
The number of bridges to generate is {NB_BRIDGE}; place them wherever you want.
You start with a {START}.
The number of lines for the VERSES is {VERSE_LINES}.
The number of lines for the CHORUS is {CHORUS_LINES}.
{ONOMATOPOEIA}
"""

def __init__(self, path: str) -> None:
@@ -111,6 +129,9 @@ def generate_dataset(self, nb_gen: int = 100) -> None:
for i in range(nb_gen):
# add word to obtain different lyrics each time
word = " ".join(self._word_generator.get_random_word() for _ in range(1))
start = "CHORUS" if random.random() >= 0.5 else "VERSE"
onomatopoeia = ("Add some (not too much) onomatopoeia to the lyrics (in verse, chorus, and out verse, "
"chorus and bridge).") if random.random() >= 0.75 else ""
messages = [
{
"role": "system",
@@ -121,9 +142,13 @@
"content": self.PROMPT.format(
TAGS=tags,
WORD=word,
NB_CHORUS=random.randint(2, 4),
NB_VERSE=random.randint(2, 4),
NB_BRIDGE=random.randint(0, 1)
NB_CHORUS=random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0],
NB_VERSE=random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0],
NB_BRIDGE=random.randint(0, 1),
START=start,
VERSE_LINES=random.randint(2, 12),
CHORUS_LINES=random.randint(2, 12),
ONOMATOPOEIA=onomatopoeia,
),
},
]
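As an aside, a tiny self-contained check (not part of the commit) of what the weighted draw used for NB_CHORUS and NB_VERSE produces; note that random.choices returns a list, hence the [0]:

import random

counts = {k: 0 for k in (1, 2, 3, 4)}
for _ in range(10_000):
    counts[random.choices(population=[1, 2, 3, 4], weights=[0.1, 0.2, 0.3, 0.4])[0]] += 1
print(counts)  # roughly 10% / 20% / 30% / 40% of the draws land on 1 / 2 / 3 / 4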
7 changes: 7 additions & 0 deletions evaluate_script.py
@@ -0,0 +1,7 @@
"""
This script is used to evaluate the model.
"""

from model.eval import evaluator

print(evaluator.evaluate())
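For what it is worth, Trainer.evaluate() returns a plain dict, so the script could also consume it field by field; a hedged sketch (any metric names beyond eval_loss depend on utils.compute_metrics, which is not part of this diff):

metrics = evaluator.evaluate()
print(metrics["eval_loss"])          # reported whenever the eval set has labels
for name, value in metrics.items():  # extra keys come from compute_metrics
    print(f"{name}: {value}")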
5 changes: 3 additions & 2 deletions finetune.py
@@ -2,7 +2,8 @@
This script is used to fine-tune the model on the dataset.
"""

from model.finetune import trainer
from model.finetune import trainer, tokenizer

trainer.train()
trainer.save_model("model_fintuned") # Save the model
trainer.save_model("model_finetuned") # Save the model
tokenizer.save_pretrained("model_finetuned") # Save the tokenizer
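A minimal sketch (not in the repo) of reloading the artifacts saved above; it assumes the flan-t5 seq2seq setup from model/finetune.py and uses a placeholder input string:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("model_finetuned")
tokenizer = AutoTokenizer.from_pretrained("model_finetuned")

inputs = tokenizer("some lyrics without structure tags", return_tensors="pt")  # placeholder input
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))  # keep the [CHORUS n]/[VERSE n] tags visible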
2 changes: 1 addition & 1 deletion generate_lyric.py
@@ -5,7 +5,7 @@
from dataset.llama3_dataset import Llama3Dataset

dataset = Llama3Dataset("data/")
# dataset.generate_dataset()
dataset.generate_dataset()
data = dataset.load()

for d in data["test"]:
48 changes: 48 additions & 0 deletions model/eval.py
@@ -0,0 +1,48 @@
"""
This script is used to evaluate the model on the test set.
"""
import torch
from transformers import TrainingArguments, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer

import dataset.llama3_dataset

from model import utils


data = dataset.llama3_dataset.Llama3Dataset("data/").load()

t5model = AutoModelForSeq2SeqLM.from_pretrained("./model_finetuned")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
def _tokenize_function(examples):
    return tokenizer(
        examples["lyrics_no_tags"],
        text_target=examples["lyrics"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

ds = data.map(_tokenize_function)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5model)
utils.TOKENIZER = tokenizer

BATCH_SIZE = 8

training_args = TrainingArguments(
    output_dir="eval",
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    bf16_full_eval=torch.cuda.is_bf16_supported(),
    fp16_full_eval=not torch.cuda.is_bf16_supported(),
)

evaluator = Trainer(
    model=t5model,
    args=training_args,
    eval_dataset=ds["test"],
    data_collator=data_collator,
    compute_metrics=utils.compute_metrics,
)
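model/utils.py is not shown in this diff, so the exact metric is unknown; purely as an illustration, a compute_metrics with the right shape for this Trainer could look like the following (assuming a ROUGE-style text metric via the evaluate library):

import evaluate
import numpy as np

rouge = evaluate.load("rouge")  # assumed metric; needs the rouge_score package

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # A plain Trainer passes logits, not generated ids, so take the argmax first
    predictions = np.argmax(predictions, axis=-1)
    # Labels use -100 for padding; swap it for the pad token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)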
27 changes: 25 additions & 2 deletions model/finetune.py
@@ -29,16 +29,38 @@
Trainer,
TrainingArguments, DataCollatorForSeq2Seq,
)
from peft import (
get_peft_model,
LoraConfig,
TaskType,
)

import dataset.llama3_dataset
from model import utils
from model.patch import patch

data = dataset.llama3_dataset.Llama3Dataset("data/").load()

patch()

data = dataset.llama3_dataset.Llama3Dataset("data/").load()

peft_config = LoraConfig(
task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=32, lora_alpha=64, lora_dropout=0.1
)
t5model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
special_tokens_to_add = []
for i in range(1, 5):
special_tokens_to_add.append(f"[CHORUS {i}]")
special_tokens_to_add.append(f"[VERSE {i}]")
special_tokens_to_add.append("[BRIDGE 1]")
special_tokens_to_add.append("\n")
tokenizer.add_special_tokens({"additional_special_tokens": special_tokens_to_add})
t5model.resize_token_embeddings(len(tokenizer))
# t5model = get_peft_model(t5model, peft_config)
# t5model.print_trainable_parameters()


def _tokenize_function(examples):
return tokenizer(
examples["lyrics_no_tags"],
@@ -48,6 +70,7 @@ def _tokenize_function(examples):
max_length=512,
)


ds = data.map(_tokenize_function)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=t5model)
utils.TOKENIZER = tokenizer
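As a quick illustration (not from the commit) of why resize_token_embeddings follows add_special_tokens in model/finetune.py: the vocabulary grows, and each structure tag maps to a single new id:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

before = len(tokenizer)
tokenizer.add_special_tokens({"additional_special_tokens": ["[CHORUS 1]", "[VERSE 1]"]})
model.resize_token_embeddings(len(tokenizer))  # the embedding matrix must match the larger vocab
print(before, "->", len(tokenizer))            # grew by the two added tags
print(tokenizer("[CHORUS 1]").input_ids)       # a single new id for the tag, plus T5's EOS token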
