From 80ee172c46f2a25543688152995ccb0c2ffd9c2f Mon Sep 17 00:00:00 2001
From: jzonthemtn
Date: Wed, 24 Aug 2022 10:18:17 -0400
Subject: [PATCH] Initial commit.

---
 .gitignore |  7 ++++++
 README.md  | 17 +++++++++++++
 train.py   | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 train.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14aae33
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+venv/
+exported-to-onnx/
+results/
+.ipynb_checkpoints/
+finetune-imdb/
+checkpoints/
+distilbert-imdb/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5de8c10
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+# Fine Tune
+
+This repository contains code to fine-tune a classifier on the aclImdb_v1 (IMDB) dataset. The resulting model can then be converted to ONNX (and used by Apache OpenNLP).
+
+## Requirements
+
+```
+python3 -m pip install transformers datasets onnxruntime torch scikit-learn
+```
+
+## Train
+
+`python3 train.py`
+
+## Convert to ONNX
+
+`python3 -m transformers.onnx --model=distilbert-imdb/ --feature sequence-classification exported-to-onnx`
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..4cb55b1
--- /dev/null
+++ b/train.py
@@ -0,0 +1,78 @@
+from datasets import load_dataset
+imdb = load_dataset("imdb")
+
+small_train_dataset = imdb["train"].shuffle(seed=42) #.select([i for i in list(range(3000))])
+small_test_dataset = imdb["test"].shuffle(seed=42) #.select([i for i in list(range(300))])
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+
+# Tokenize the review text; padding is deferred to the data collator below.
+def preprocess_function(examples):
+    return tokenizer(examples["text"], truncation=True)
+
+tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
+tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
+
+# Dynamically pad each batch to its longest sequence.
+from transformers import DataCollatorWithPadding
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+# DistilBERT with a two-label classification head (0 = negative, 1 = positive).
+from transformers import AutoModelForSequenceClassification
+model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+
+
+import numpy as np
+from datasets import load_metric
+
+# Report accuracy and F1 on the evaluation split.
+def compute_metrics(eval_pred):
+    load_accuracy = load_metric("accuracy")
+    load_f1 = load_metric("f1")
+
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
+    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
+    return {"accuracy": accuracy, "f1": f1}
+
+
+from transformers import TrainingArguments, Trainer
+
+import torch
+# Select the first GPU; this line fails on machines without CUDA.
+torch.cuda.set_device(0)
+
+training_args = TrainingArguments(
+    output_dir="checkpoints",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    save_strategy="epoch",
+    push_to_hub=False,
+)
+
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_train,
+    eval_dataset=tokenized_test,
+    tokenizer=tokenizer,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+
+trainer.train()
+
+results = trainer.evaluate()
+print(results)
+
+# Save the fine-tuned model (and tokenizer) for the ONNX export step in the README.
+trainer.save_model("distilbert-imdb")
+
+#from transformers import pipeline
+#sentiment_model = pipeline(model="distilbert-imdb")
+#sentiment_model(["I love this movie", "This movie sucks!"])
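
Not part of the patch above: a minimal sketch of how the exported model could be sanity-checked with onnxruntime before handing it to Apache OpenNLP. It assumes the README's steps were run as-is, so the tokenizer lives in distilbert-imdb/ and the exported graph at exported-to-onnx/model.onnx; the "input_ids"/"attention_mask" inputs and "logits" output are the tensor names transformers.onnx uses for the sequence-classification feature.

```python
# Quick inference check of the exported ONNX model (paths assume the README's
# commands were run unmodified).
import numpy as np
import onnxruntime
from transformers import AutoTokenizer

# Trainer.save_model() also wrote the tokenizer into distilbert-imdb/.
tokenizer = AutoTokenizer.from_pretrained("distilbert-imdb")
session = onnxruntime.InferenceSession("exported-to-onnx/model.onnx")

encoded = tokenizer(
    ["I love this movie", "This movie sucks!"],
    padding=True,
    truncation=True,
    return_tensors="np",
)

# The exported graph expects int64 token ids and returns raw logits.
logits = session.run(
    ["logits"],
    {
        "input_ids": encoded["input_ids"].astype(np.int64),
        "attention_mask": encoded["attention_mask"].astype(np.int64),
    },
)[0]

print(logits.argmax(axis=-1))  # expect [1 0]: 1 = positive, 0 = negative
```

If this prints sensible labels, the same exported-to-onnx/model.onnx file is the artifact to load from OpenNLP.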