flan_t5.py
#### FLAN-T5 (QLoRA fine-tuning on Spider) ####
import torch
import nltk
import os
import json
import logging
import transformers
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    set_seed,
    Seq2SeqTrainer,
    BitsAndBytesConfig
)
from datasets.arrow_dataset import Dataset
import pandas as pd
import numpy as np
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_config,
    PeftModelForSeq2SeqLM
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from typing import Optional, Dict, Sequence
import sys
sys.path.append('./Evaluation_metric/spider/')
from Evaluation_self import evaluate, evaluate_test
import re
##### START #####
##### Load model #####
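# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
# This is the standard QLoRA base-model setup: the pretrained weights are stored
# in 4 bits and stay frozen, and only the LoRA adapters (added below) are trained.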
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_id = "google/flan-t5-base"
# flan-t5-base is a T5 checkpoint, so it is loaded with T5ForConditionalGeneration
# (not the mT5 class).
model = T5ForConditionalGeneration.from_pretrained(model_id, quantization_config=nf4_config, device_map="auto")
##### Load tokenizer #####
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: this is the unoptimized version, which may leave the embedding size
    not divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data
        # Initialize the new token embeddings to the mean of the existing ones.
        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side="right",
    use_fast=False,
)
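# Note: smart_tokenizer_and_embedding_resize above is defined but never called in
# this script; call it before training if you add special tokens to the tokenizer.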
##### Load model as Qlora setup #####
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8,  # in theory, higher is better; 8 is a reasonable cut-off
    lora_alpha=32,  # this parameter behaves similarly to a learning rate
    target_modules=["q", "v"],  # the layers to adapt
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"  # Flan-T5 is an encoder-decoder model, not a causal LM
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
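# With r=8 on the q/v attention projections, only a small fraction of the
# parameters is trainable; the quantized base weights stay frozen.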
#### load data ####
path_to_Spider = "./Data/spider"
Output_path = "./Outputs/spider"
DATASET_SCHEMA = path_to_Spider + "/tables.json"
DATASET_TRAIN = path_to_Spider + "/train_spider.json"
DATASET_DEV = path_to_Spider + "/dev.json"
OUTPUT_FILE_1 = Output_path + "/predicted_sql.txt"
OUTPUT_FILE_2 = Output_path + "/gold_sql.txt"
DATABASE_PATH = path_to_Spider + "/database"
gold_file = path_to_Spider + "/gold_eval.txt"
def load_data(DATASET):
    return pd.read_json(DATASET)
#### Preprocess ####
def find_foreign_keys_MYSQL_like(db_name):
    df = spider_foreign[spider_foreign['Database name'] == db_name]
    output = "["
    for index, row in df.iterrows():
        output += row['First Table Name'] + '.' + row['First Table Foreign Key'] + " = " + row['Second Table Name'] + '.' + row['Second Table Foreign Key'] + ','
    output = output[:-1] + "]"
    return output
def find_fields_MYSQL_like(db_name):
    df = spider_schema[spider_schema['Database name'] == db_name]
    df = df.groupby(' Table Name')
    output = ""
    for name, group in df:
        output += "Table " + name + ', columns = ['
        for index, row in group.iterrows():
            output += row[" Field Name"] + ','
        output = output[:-1]
        output += "]\n"
    return output
def find_primary_keys_MYSQL_like(db_name):
    df = spider_primary[spider_primary['Database name'] == db_name]
    output = "["
    for index, row in df.iterrows():
        output += row['Table Name'] + '.' + row['Primary Key'] + ','
    output = output[:-1]
    output += "]\n"
    return output
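# The three helpers above serialize a database's tables/columns, foreign keys,
# and primary keys into compact MySQL-like strings used in the prompt.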
def creating_schema(DATASET_JSON):
    schema_df = pd.read_json(DATASET_JSON)
    schema_df = schema_df.drop(['column_names', 'table_names'], axis=1)
    schema = []
    f_keys = []
    p_keys = []
    for index, row in schema_df.iterrows():
        tables = row['table_names_original']
        col_names = row['column_names_original']
        col_types = row['column_types']
        foreign_keys = row['foreign_keys']
        primary_keys = row['primary_keys']
        for col, col_type in zip(col_names, col_types):
            index, col_name = col
            if index == -1:
                for table in tables:
                    schema.append([row['db_id'], table, '*', 'text'])
            else:
                schema.append([row['db_id'], tables[index], col_name, col_type])
        for primary_key in primary_keys:
            index, column = col_names[primary_key]
            p_keys.append([row['db_id'], tables[index], column])
        for foreign_key in foreign_keys:
            first, second = foreign_key
            first_index, first_column = col_names[first]
            second_index, second_column = col_names[second]
            f_keys.append([row['db_id'], tables[first_index], tables[second_index], first_column, second_column])
    spider_schema = pd.DataFrame(schema, columns=['Database name', ' Table Name', ' Field Name', ' Type'])
    spider_primary = pd.DataFrame(p_keys, columns=['Database name', 'Table Name', 'Primary Key'])
    spider_foreign = pd.DataFrame(f_keys,
                                  columns=['Database name', 'First Table Name', 'Second Table Name',
                                           'First Table Foreign Key', 'Second Table Foreign Key'])
    return spider_schema, spider_primary, spider_foreign
print('Creating schema linking...\n')
spider_schema, spider_primary, spider_foreign = creating_schema(DATASET_SCHEMA)
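# The find_*_MYSQL_like helpers above read these three module-level DataFrames.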
train_data = load_data(DATASET_TRAIN)
eval_data = load_data(DATASET_DEV)
test_questions = []
def preprocess_function(example, tokenizer):
    questions = []
    # NOTE: prompt is built here but never prepended to the questions.
    prompt = 'You are an expert in SQL. You are given a question and a database schema. You need to write a SQL query to answer the question.\n'
    for question, db_id in zip(example['question'], example['db_id']):
        schema = "db_id:" + db_id + '\n' + find_fields_MYSQL_like(db_id) + '\n' + "foreign key:" + find_foreign_keys_MYSQL_like(
            db_id) + '\n' + "primary key:" + find_primary_keys_MYSQL_like(db_id)
        question_after = question + '\n' + schema + '\n' + 'SQL:'
        questions.append(question_after)
        test_questions.append(question_after)
    queries = example['query']
    input_tokenized = tokenizer(questions, return_tensors="pt", max_length=512, truncation=True, padding="max_length", add_special_tokens=False)
    output_tokenized = tokenizer(queries, return_tensors="pt", max_length=512, truncation=True, padding="max_length", add_special_tokens=False)
    return {
        "input_ids": input_tokenized["input_ids"],
        "attention_mask": input_tokenized["attention_mask"],
        "labels": output_tokenized["input_ids"],
        "db_id": example["db_id"],
        "gold_query": example["query"]
    }
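# Note: the labels keep the tokenizer's pad ids. Seq2SeqTrainer only ignores
# label positions set to -100, so padded positions contribute to the loss here;
# replacing pad ids with -100 is the usual fix.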
db_id_train = []
query_train = []
question_train = []
for index, sample in train_data.iterrows():
    # if index == 8:
    #     break
    db_id_train.append(sample['db_id'])
    query_train.append(sample['query'])
    question_train.append(sample['question'])
dataset_train = Dataset.from_dict({
    "db_id": db_id_train,
    "query": query_train,
    "question": question_train,
})
db_id_eval = []
query_eval = []
question_eval = []
for index, sample in eval_data.iterrows():
    # if index == 8:
    #     break
    db_id_eval.append(sample['db_id'])
    query_eval.append(sample['query'])
    question_eval.append(sample['question'])
dataset_eval = Dataset.from_dict({
    "db_id": db_id_eval,
    "query": query_eval,
    "question": question_eval,
})
# Shuffle the training data and, if needed, select a subset
dataset_train = dataset_train.shuffle(seed=42)
# Preprocess the data
dataset = dataset_train.map(lambda e: preprocess_function(e, tokenizer), batched=True)
eval_dataset = dataset_eval.map(lambda e: preprocess_function(e, tokenizer), batched=True)
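# Side effect: mapping preprocess_function also appends every prompt to the
# module-level test_questions list.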
#### Custom metric ####
def compute_metric(eval_pred):
    print("Starting evaluation...\n")
    # predictions = eval_pred.predictions
    # preds = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    preds = eval_pred.predictions
    labels = eval_pred.label_ids
    # inputs = eval_pred.inputs
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # decoded_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    # db_id = []
    # for question in decoded_inputs:
    #     result = re.search(r'db_id:(.+?)\n', question)
    #     db_id.append(result.group(1).strip())
    generated_queries = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    gold_queries_and_db_ids = []
    with open(gold_file, 'r') as file:
        for line in file:
            # Split the line by the tab character '\t'
            query, db_id = line.strip().split('\t')
            # Append the query and db_id as a tuple to the list
            gold_queries_and_db_ids.append((query, db_id))
    with open(OUTPUT_FILE_1, 'w') as file:
        for query in generated_queries:
            file.write(query + '\n')
    db_dir = DATABASE_PATH
    etype = 'all'
    table = DATASET_SCHEMA
    score = evaluate(gold_queries_and_db_ids, generated_queries, db_dir, etype, table)
    print(f"Execution Accuracy: {score}")
    return {"exec": score}  # must return a dict
ds_config = {
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu"
        },
        "contiguous_gradients": True,
        "overlap_comm": True
    },
    "zero_allow_untested_optimizer": True,
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 2e-5,
            "warmup_num_steps": "auto",
            "warmup_type": "linear"
        }
    },
    "activation_checkpointing": {
        "partition_activations": True,
        "contiguous_memory_optimization": True
    },
    "wall_clock_breakdown": True
}
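# ds_config is unused unless deepspeed=ds_config is uncommented in the
# training arguments below.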
#### train ####
trainer = transformers.Seq2SeqTrainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metric,
    args=transformers.Seq2SeqTrainingArguments(
        logging_dir="./logs_for_t5_flan_base",  # directory to save logs
        logging_strategy='steps',  # log every logging_steps steps
        logging_steps=10,
        output_dir="./Checkpoints/T5_flan_base/Spider",
        num_train_epochs=10,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        warmup_ratio=0.03,
        group_by_length=False,
        lr_scheduler_type="cosine",
        evaluation_strategy="steps",  # evaluate every eval_steps steps
        save_strategy="steps",
        eval_steps=50,
        save_steps=100,  # lower the log/eval/save steps to see results sooner
        learning_rate=5e-4,
        fp16=False,
        optim="paged_adamw_8bit",
        predict_with_generate=True,
        generation_num_beams=4,
        generation_max_length=513,
        include_inputs_for_metrics=True,
        # deepspeed=ds_config,
    ),
)
model.config.use_cache = False  # silence the gradient-checkpointing warnings; re-enable for inference!
trainer.train()
# CUDA_VISIBLE_DEVICES=0 python3 flan_t5.py
# deepspeed --num_gpus 8 flan_t5.py
# CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch flan_t5.py
# torchrun --nproc_per_node 8 flan_t5.py
# tensorboard dev upload --logdir ./logs_for_t5_flan_base
#   --name yjh
#   --description yjh