# Fine-tune BERT with DailyDialog annotations.
import os
import sys
import pickle

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

# The `official` package ships with the TensorFlow models repo checked out
# under models/, so put it on the path before importing from it.
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
# Load the class-balanced DailyDialog utterances and make stratified splits:
# 75% for training, then a small stratified slice of the remainder for validation.
df = pd.read_csv('equalized.csv')
train_df, remaining = train_test_split(df, random_state=42, train_size=0.75, stratify=df.label.values)
valid_df, _ = train_test_split(remaining, random_state=42, train_size=0.075, stratify=remaining.label.values)
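
# Illustrative sanity check (not in the original script): stratified splitting
# should preserve the label proportions in both splits.
print("train size:", len(train_df), "valid size:", len(valid_df))
print(train_df.label.value_counts(normalize=True))
print(valid_df.label.value_counts(normalize=True))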
# Efficient ingest pipeline: build the tf.data datasets on the CPU so the GPU
# stays free for training.
with tf.device('/cpu:0'):
    train_data = tf.data.Dataset.from_tensor_slices((train_df['text'].values, train_df['label'].values))
    valid_data = tf.data.Dataset.from_tensor_slices((valid_df['text'].values, valid_df['label'].values))

# Peek at one example to confirm the dataset is wired up correctly.
for text, label in train_data.take(1):
    print(text)
    print(label)
# Multiclass setup: DailyDialog annotates four dialogue acts
# (1=inform, 2=question, 3=directive, 4=commissive).
label_list = [1, 2, 3, 4]
max_seq_length = 128
train_batch_size = 32

print("Creating tokenizer...")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
# Persist the tokenizer so inference code can reuse the exact same vocabulary.
with open("tokenizer.pickle", "wb") as f:
    pickle.dump(tokenizer, f)
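
# Quick illustrative check (not in the original script): the WordPiece
# tokenizer should lower-case and split rare words into subword pieces.
print(tokenizer.tokenize("Fine-tuning BERT on dialogue acts."))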
# Convert one (text, label) pair into the input features BERT expects.
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
    example = classifier_data_lib.InputExample(guid=None,
                                               text_a=text.numpy(),
                                               text_b=None,
                                               label=label.numpy())
    feature = classifier_data_lib.convert_single_example(0, example, label_list, max_seq_length, tokenizer)
    return feature.input_ids, feature.input_mask, feature.segment_ids, feature.label_id
# Dataset.map() runs in graph mode, where tensors have no concrete values, but
# to_feature() needs .numpy(). Wrap it in tf.py_function so it executes eagerly.
def to_feature_map(text, label):
    input_ids, input_mask, segment_ids, label_id = tf.py_function(
        to_feature, inp=[text, label], Tout=[tf.int32, tf.int32, tf.int32, tf.int32])
    # tf.py_function drops shape information, so restore the static shapes.
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])
    label_id.set_shape([])
    x = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids
    }
    return x, label_id
# Complete the tf.data pipeline with per-element mappings.
print("Do per element mappings...")
with tf.device('/cpu:0'):
    train_data = (train_data.map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                  .shuffle(1000)
                  .batch(train_batch_size, drop_remainder=True)
                  .prefetch(tf.data.experimental.AUTOTUNE))
    valid_data = (valid_data.map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
                  .batch(train_batch_size, drop_remainder=True)
                  .prefetch(tf.data.experimental.AUTOTUNE))
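
# Illustrative shape check (not in the original script): each batch should be
# a dict of three [batch, max_seq_length] int tensors plus a [batch] label vector.
for x, y in train_data.take(1):
    print({k: v.shape for k, v in x.items()}, y.shape)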
def create_model():
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_type_ids")
    # pooled_output is the [CLS]-based sentence embedding; sequence_output holds
    # the per-token contextual embeddings and is unused here.
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])
    drop = tf.keras.layers.Dropout(0.4)(pooled_output)
    # One softmax unit per dialogue-act class.
    output = tf.keras.layers.Dense(len(label_list), activation='softmax', name='output')(drop)
    model = tf.keras.Model(
        inputs={
            'input_word_ids': input_word_ids,
            'input_mask': input_mask,
            'input_type_ids': input_type_ids
        },
        outputs=output)
    return model
# Compile for multiclass classification: labels are integer class ids, so use
# sparse categorical cross-entropy.
model = create_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
model.summary()
# Train the model, checkpointing the weights after every epoch.
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
epochs = 2
history = model.fit(train_data,
                    validation_data=valid_data,
                    epochs=epochs,
                    verbose=1,
                    callbacks=[cp_callback])
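
# A minimal follow-up sketch (not in the original script): report held-out
# loss and accuracy on the validation split after fine-tuning.
valid_loss, valid_acc = model.evaluate(valid_data, verbose=0)
print(f"valid loss {valid_loss:.4f}, valid accuracy {valid_acc:.4f}")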
# Predict against a target: build a dataset from a plain-text file with one
# utterance per line. The labels are dummies (prediction ignores them), but
# they must come from label_list or convert_single_example raises a KeyError.
def get_prediction_target(target_path):
    with open(target_path) as f:
        lines = [line.strip() for line in f]
    target = tf.data.Dataset.from_tensor_slices((lines, [label_list[0]] * len(lines)))
    target = target.map(to_feature_map).batch(1)
    return target
# target_path was never defined in the original script; reading it from the
# command line (an assumption) keeps the prediction step runnable.
target_path = sys.argv[1]
target = get_prediction_target(target_path)
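
# A hedged prediction sketch (not in the original script): run the fine-tuned
# model over the target file and map each argmax back to its DailyDialog act id.
preds = model.predict(target)
for line_probs in preds:
    print(label_list[line_probs.argmax()])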