
Commit

Initial commit.
jzonthemtn committed Aug 24, 2022
0 parents commit 80ee172
Showing 3 changed files with 96 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
venv/
exported-to-onnx/
results/
.ipynb_checkpoints/
finetune-imdb/
checkpoints/
distilbert-imdb/
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# Fine Tune

This repository contains code to fine-tune a DistilBERT classifier on the aclImdb_v1 (IMDB) dataset. The resulting model can then be converted to ONNX (and used by Apache OpenNLP).

## Requirements

```
python3 -m pip install transformers datasets onnxruntime torch scikit-learn
```

## Train

`python3 train.py`

## Convert to ONNX

`python3 -m transformers.onnx --model=local-pt-checkpoint/ --feature sequence-classification exported-to-onnx`

Here `local-pt-checkpoint/` is the directory containing the fine-tuned model; `train.py` saves it to `distilbert-imdb/`, so point `--model` at that directory (or copy the saved model into `local-pt-checkpoint/` first).
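
After exporting, the model can be sanity-checked from Python with `onnxruntime`. The sketch below is illustrative and assumes the export wrote `exported-to-onnx/model.onnx` and that label 1 corresponds to a positive review (the IMDB convention):

```
import onnxruntime as ort
from transformers import AutoTokenizer

# The same tokenizer used during fine-tuning.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
session = ort.InferenceSession("exported-to-onnx/model.onnx")

inputs = tokenizer("I love this movie", return_tensors="np")

# The sequence-classification export takes input_ids and attention_mask
# and returns a single logits output of shape (batch_size, 2).
(logits,) = session.run(None, {
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"],
})

print(logits.argmax(axis=-1))  # e.g. [1] for a positive prediction
```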
72 changes: 72 additions & 0 deletions train.py
@@ -0,0 +1,72 @@
from datasets import load_dataset

# IMDB sentiment dataset; uncomment the .select(...) calls below to train/evaluate on a subsample.
imdb = load_dataset("imdb")

small_train_dataset = imdb["train"].shuffle(seed=42) #.select([i for i in list(range(3000))])
small_test_dataset = imdb["test"].shuffle(seed=42) #.select([i for i in list(range(300))])

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}


from transformers import TrainingArguments, Trainer

import torch
torch.cuda.set_device(0)  # train on the first GPU

training_args = TrainingArguments(
    output_dir="checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

results = trainer.evaluate()
print(results)

trainer.save_model("distilbert-imdb")

#from transformers import pipeline
#sentiment_model = pipeline(model="distilbert-imdb")
#sentiment_model(["I love this movie", "This movie sucks!"])
