You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I am trying to create a PEFT model from a DistilBERT model and run a training loop. However, trainer.train() raises this error: ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
Here is my code:
Steps to reproduce the bug
#Creating a PEFT Config
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model
# Convert a Transformers model into a PEFT model: load DistilBERT with a
# two-class head (binary classification, 1 = positive, 0 = negative) and
# wrap it using the LoRA configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
lora_model = get_peft_model(model, lora_config)
# Show the structure of the wrapped model.
print(lora_model)
Tokenize data set
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Work on a smaller, fixed-size sample of the train and test splits.
subset_size = 5000
subset_indices = range(subset_size)
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(subset_indices)
small_test_dataset = shuffled_test.select(subset_indices)
#Tokenize data
def tokenize_function(example):
    """Tokenize the ``content`` field of ``example``.

    The text is passed to the module-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``; the tokenizer's result is
    returned as-is.
    """
    review_text = example["content"]
    encoded = tokenizer(review_text, padding="max_length", truncation=True)
    return encoded
import numpy as np
from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
Describe the bug
I am trying to create a PEFT model from a DistilBERT model and run a training loop. However, trainer.train() raises this error: ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
Here is my code:
Steps to reproduce the bug
#Creating a PEFT Config
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model
# LoRA adapter configuration for fine-tuning DistilBERT as a classifier.
lora_config = LoraConfig(
    # PEFT spells the sequence-classification task type "SEQ_CLS"
    # (TaskType.SEQ_CLS); "SEQ_CLASS" is not a recognised value.
    # NOTE(review): with an unrecognised task type, get_peft_model appears to
    # fall back to the generic PeftModel wrapper, whose forward signature
    # hides `input_ids` from the Trainer's unused-column removal -- which
    # matches the "you provided ['label']" error. Confirm on your PEFT version.
    task_type="SEQ_CLS",
    r=8,
    lora_alpha=32,
    # Attention projection layers that receive LoRA adapters.
    target_modules=["q_lin", "k_lin", "v_lin"],
    lora_dropout=0.01,
)
# Convert a Transformers model into a PEFT model: load DistilBERT with a
# two-class head (binary classification, 1 = positive, 0 = negative) and
# wrap it using the LoRA configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)
lora_model = get_peft_model(model, lora_config)
# Show the structure of the wrapped model.
print(lora_model)
Tokenize data set
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the tokenizer published under the "distilbert-base-uncased" name,
# the same checkpoint name used for the classification model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
Load the train and test splits of the dataset
# Fetch the Amazon polarity dataset, which provides "train" and "test" splits.
dataset = load_dataset("fancyzhx/amazon_polarity")

# Work on a smaller, fixed-size sample of the train and test splits.
subset_size = 5000
subset_indices = range(subset_size)
shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(subset_indices)
small_test_dataset = shuffled_test.select(subset_indices)
#Tokenize data
def tokenize_function(example):
    """Tokenize the ``content`` field of ``example``.

    The text is passed to the module-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``; the tokenizer's result is
    returned as-is.
    """
    review_text = example["content"]
    encoded = tokenizer(review_text, padding="max_length", truncation=True)
    return encoded
# Tokenize each subset in batches, then derive a copy of each whose
# "label" column is named "labels".
tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
train_lora = tokenized_train_dataset.rename_column('label', 'labels')

tokenized_test_dataset = small_test_dataset.map(tokenize_function, batched=True)
test_lora = tokenized_test_dataset.rename_column('label', 'labels')

# These are the columns of the tokenized datasets *before* the rename.
for tokenized_split in (tokenized_train_dataset, tokenized_test_dataset):
    print(tokenized_split.column_names)
#Train the PEFT model
import numpy as np
from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class dimension last; it may also be
            a tuple whose first element holds those scores.

    Returns:
        A dict ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the label, as a plain
        Python float.
    """
    predictions, labels = eval_pred
    # NOTE(review): predictions may arrive as a tuple when the model returns
    # extra outputs besides the logits; the logits are assumed to come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=-1)
    matches = predicted_classes == np.asarray(labels)
    # Convert the NumPy scalar to a built-in float so the metric logs and
    # serialises cleanly.
    return {"accuracy": float(matches.mean())}
# Hyperparameters plus the evaluation/checkpoint schedule for the run.
training_args = TrainingArguments(
    output_dir=".",
    learning_rate=2e-3,
    # Reduce the batch size if you don't have enough memory
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and save once per epoch; the two strategies are kept
    # identical because load_best_model_at_end is enabled.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    # Use the datasets whose label column was renamed to "labels"
    # (train_lora / test_lora); they were previously built but never used.
    train_dataset=train_lora,
    eval_dataset=test_lora,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt"),
    compute_metrics=compute_metrics,
)
trainer.train()
Expected behavior
Example of output:
[558/558 01:04, Epoch XX]
Environment info
Using Python and Jupyter Notebook
The text was updated successfully, but these errors were encountered: