helper.py

import codecs

import pandas as pd
import torch
from transformers.data.processors import InputFeatures


def write_list(path, lines):
    # Write one string per line, creating or overwriting the file at `path`.
    with codecs.open(path, 'w', encoding='utf8') as f:
        for line in lines:
            f.write(line + "\n")


def load_lines(filepath):
    # Read a text file and return a list of its stripped lines.
    with codecs.open(filepath, "r", encoding='utf8', errors='replace') as f:
        return [line.strip() for line in f]
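
# Usage sketch for the two I/O helpers above (hedged; "lines.txt" is a
# hypothetical path, not a file shipped with this repo):
#   write_list("lines.txt", ["first line", "second line"])
#   assert load_lines("lines.txt") == ["first line", "second line"]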


def load(filepath):
    # Load a CSV of word pairs and their relation labels.
    df = pd.read_csv(filepath)
    texts = []
    labels = []
    for i in range(len(df)):
        texts.append([df.iloc[i]['FirstWord'], df.iloc[i]['SecondWord']])
        labels.append(df.iloc[i]['Relation'])
    return texts, labels
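
# Usage sketch (hedged; assumes a CSV with FirstWord, SecondWord and Relation
# columns, as read above; the path below is a hypothetical example):
#   texts, labels = load("data/relations.csv")
#   # texts is a list of [first_word, second_word] pairs, labels a parallel list.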


def featurize_texts(texts, tokenizer, labels, max_length=128, add_special_tokens=True, is_text_pair=False, has_toktype_ids=True):
    # Featurize a list of texts (or text pairs) and stack the results into
    # a dict of long tensors, one row per example.
    instances = []
    for text, label in zip(texts, labels):
        feats = featurize_text(text, tokenizer, label, max_length, add_special_tokens,
                               is_text_pair=is_text_pair, has_toktype_ids=has_toktype_ids)
        instances.append(feats)
    token_ids = torch.tensor([x.input_ids for x in instances], dtype=torch.long)
    attention_mask = torch.tensor([x.attention_mask for x in instances], dtype=torch.long)
    # token_type_ids stays None when the tokenizer does not produce them.
    token_type_ids = (torch.tensor([x.token_type_ids for x in instances], dtype=torch.long)
                      if has_toktype_ids else None)
    label = torch.tensor([x.label for x in instances], dtype=torch.long)
    return {"input_ids": token_ids, "attention_mask": attention_mask,
            "token_type_ids": token_type_ids, "label": label}


def featurize_texts_siqa(texts, tokenizer, labels, max_length=128, add_special_tokens=True, is_text_pair=False, has_toktype_ids=True):
    # Same as featurize_texts, but each example is a multiple-choice instance
    # (see featurize_text_siqa), so every tensor gains a choices dimension.
    instances = []
    for text, label in zip(texts, labels):
        feats = featurize_text_siqa(text, tokenizer, label, max_length, add_special_tokens,
                                    is_text_pair=is_text_pair, has_toktype_ids=has_toktype_ids)
        instances.append(feats)
    token_ids = torch.tensor([x.input_ids for x in instances], dtype=torch.long)
    attention_mask = torch.tensor([x.attention_mask for x in instances], dtype=torch.long)
    token_type_ids = (torch.tensor([x.token_type_ids for x in instances], dtype=torch.long)
                      if has_toktype_ids else None)
    label = torch.tensor([x.label for x in instances], dtype=torch.long)
    return {"input_ids": token_ids, "attention_mask": attention_mask,
            "token_type_ids": token_type_ids, "label": label}


def featurize_text_siqa(text, tokenizer, label, max_length=128, add_special_tokens=True, is_text_pair=False, has_toktype_ids=True):
    # Encode one three-way multiple-choice example; each choice is padded to
    # max_length by the tokenizer itself.
    choices_inputs = []
    for i in range(0, 5, 2):
        if is_text_pair:
            text1, text2 = text[i], text[i + 1]
            # pad_to_max_length is deprecated in newer transformers releases;
            # padding="max_length" is the modern equivalent.
            inputs = tokenizer.encode_plus(text1, text2, add_special_tokens=add_special_tokens,
                                           max_length=max_length, pad_to_max_length=True)
        else:
            inputs = tokenizer.encode_plus(text, add_special_tokens=add_special_tokens,
                                           max_length=max_length, pad_to_max_length=True)
        choices_inputs.append(inputs)
    input_ids = [x["input_ids"] for x in choices_inputs]
    attention_mask = ([x["attention_mask"] for x in choices_inputs]
                      if "attention_mask" in choices_inputs[0] else None)
    token_type_ids = ([x["token_type_ids"] for x in choices_inputs]
                      if has_toktype_ids and "token_type_ids" in choices_inputs[0] else None)
    return InputFeatures(input_ids=input_ids, attention_mask=attention_mask,
                         token_type_ids=token_type_ids, label=label)
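
# Input layout sketch for featurize_text_siqa (hedged; inferred from the
# range(0, 5, 2) loop above). With is_text_pair=True, `text` is six strings
# forming three (premise, candidate-answer) pairs:
#   [premise, answerA, premise, answerB, premise, answerC]
# Each pair is encoded separately, giving input_ids of shape (3, max_length)
# per example.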


def featurize_text(text, tokenizer, label, max_length=128, add_special_tokens=True, is_text_pair=False, has_toktype_ids=True):
    # Encode a single text (or text pair) and pad it manually up to max_length.
    if is_text_pair:
        text1, text2 = text
        inputs = tokenizer.encode_plus(text1, text2, add_special_tokens=add_special_tokens, max_length=max_length)
    else:
        inputs = tokenizer.encode_plus(text, add_special_tokens=add_special_tokens, max_length=max_length)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"] if has_toktype_ids else None
    # Zero-pad up to the sequence length.
    pad_token = tokenizer.pad_token_id
    padding_length = max_length - len(input_ids)
    input_ids = input_ids + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    if has_toktype_ids:
        token_type_ids = token_type_ids + ([0] * padding_length)
    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
    if has_toktype_ids:
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
    return InputFeatures(input_ids=input_ids, attention_mask=attention_mask,
                         token_type_ids=token_type_ids, label=label)
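

# Minimal end-to-end sketch (an illustration, not a definitive pipeline):
# featurize a loaded word-pair dataset with a BERT tokenizer. The CSV path and
# the string-to-id label mapping below are hypothetical and only show the flow.
if __name__ == "__main__":
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    texts, labels = load("data/relations.csv")  # hypothetical path
    label2id = {name: i for i, name in enumerate(sorted(set(labels)))}
    batch = featurize_texts(texts, tokenizer, [label2id[l] for l in labels],
                            max_length=128, is_text_pair=True)
    # Each tensor has one row per example, e.g. input_ids: (num_examples, 128).
    print({k: (tuple(v.shape) if v is not None else None) for k, v in batch.items()})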