train.py (forked from michael92ht/daguan_ner)
# encoding: utf-8
import os
import pickle
import warnings

import numpy as np
from keras import callbacks as kcallbacks
from keras.layers import (Input, Embedding, Dropout, Bidirectional, LSTM,
                          TimeDistributed, Dense, concatenate)
from keras.models import Model
from keras.utils import to_categorical
from keras_contrib.layers import CRF  # CRF layer from the keras-contrib package

# the star imports supply the encoder builders and the project helpers used
# below (CustomMetrics, decode, merge_results, write_results, get_split_indexs)
from model.keras_model import *
from utils.data_utils import *

np.random.seed(0)
warnings.filterwarnings('ignore')
SPLIT_NUM = 6              # number of cross-validation folds
MAX_SEQUENCE_LENGTH = 200  # sentences are padded/truncated to this length
TAGS_NUM = 7               # number of NER tag classes
EMBEDDING_DIM = 300
DROP_OUT = 0.5
LSTM_NUM = 200             # hidden units per LSTM direction
BATCH_SIZE = 128
EPOCHS = 300
MASK_ZERO = True           # note: run() recomputes mask_zero per encoder type
encoders_dict = {
    'attention': attention_encoder,
    'cnn': cnn_encoder,
    'lstm': lstm_encoder,
    'bilstm': bilstm_encoder,
    '2lstm': two_lstm_encoder,
    '2bilstm': two_bilstm_encoder,
    'lstmcnn': lstm_cnn_encoder,
    'bilstmcnn': bilstm_cnn_encoder,
    'dgcnn': dgcnn_encoder
}
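# Each encoder builder above takes the embedded sequence tensor and returns a
# same-length sequence of features for the CRF. They live in model.keras_model;
# purely as an illustrative sketch (an assumption, not the project's actual
# code), a minimal cnn_encoder might look like:
#
#     def cnn_encoder(embed):
#         # same-padded Conv1D keeps one feature vector per token, so the
#         # downstream CRF still sees the full sequence
#         return Conv1D(filters=256, kernel_size=3, padding='same',
#                       activation='relu')(embed)   # Conv1D from keras.layers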
def run(input_data, split_index, encoder_type='cnn'):
    train_data, tags, test_data, test_data_map, test_mask, raw_test_data, embed_matrix, word_2_id = input_data
    best_vali_score = {}
    model_save_folder = 'data/hub/' + encoder_type    # per-fold weight checkpoints
    test_save_folder = 'data/result/' + encoder_type  # per-fold test predictions
    for folder in [model_save_folder, test_save_folder]:
        if not os.path.exists(folder):
            os.makedirs(folder)  # makedirs: the parent directory may not exist yet
    for model_count in range(SPLIT_NUM):
        print("MODEL:", model_count)
        # split data into train/validation sets: fold `model_count` is held out
        # for validation, the remaining SPLIT_NUM - 1 folds are used for training
        idx_val = split_index[model_count]
        idx_train = []
        for i in range(SPLIT_NUM):
            if i != model_count:
                idx_train.extend(list(split_index[i]))
        train_sentences = train_data[idx_train]
        train_tags = tags[idx_train]
        train_tags = np.array([to_categorical(i, num_classes=TAGS_NUM) for i in train_tags])
        val_sentences = train_data[idx_val]
        val_tags = tags[idx_val]
        val_tags = np.array([to_categorical(i, num_classes=TAGS_NUM) for i in val_tags])
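        # after to_categorical, each (MAX_SEQUENCE_LENGTH,) row of integer tag
        # ids becomes a (MAX_SEQUENCE_LENGTH, TAGS_NUM) one-hot matrix, giving
        # targets of shape (n_samples, MAX_SEQUENCE_LENGTH, TAGS_NUM), which is
        # what the keras_contrib CRF loss expects with its defaults
        # (learn_mode='join', sparse_target=False)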
        inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
        # CNN and attention encoders cannot consume a Keras mask, so mask_zero
        # is only enabled for the pure RNN encoders
        if 'cnn' in encoder_type or 'attention' in encoder_type:
            mask_zero = False
        else:
            mask_zero = True
        embed = Embedding(embed_matrix.shape[0],
                          weights=[embed_matrix],
                          trainable=True,
                          output_dim=EMBEDDING_DIM,
                          input_length=MAX_SEQUENCE_LENGTH,
                          mask_zero=mask_zero)(inputs)
        q0 = Dropout(DROP_OUT)(embed)
        # the generic dispatch would be q = encoders_dict[encoder_type](q0),
        # but the encoder is currently hardcoded to a two-layer BiLSTM whose
        # layer outputs are concatenated before the CRF
        q1 = Bidirectional(LSTM(units=LSTM_NUM, return_sequences=True, recurrent_dropout=0.2))(q0)  # variational BiLSTM
        q2 = Bidirectional(LSTM(units=LSTM_NUM, return_sequences=True, recurrent_dropout=0.2))(q1)  # variational BiLSTM
        q = concatenate([q1, q2])
        q = TimeDistributed(Dense(100, activation="relu"))(q)  # per-token dense layer, as suggested by NeuroNER
        crf = CRF(TAGS_NUM)  # CRF output layer
        out = crf(q)
        model = Model(inputs, out)
        model.compile(optimizer="adam", loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()
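        # keras_contrib's CRF exposes its own loss_function and a
        # Viterbi-decoded accuracy metric; both are bound to this particular
        # layer instance, which is why `crf` is kept in a local variable and
        # passed to compile() above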
        best_weights_filepath = os.path.join(model_save_folder, str(model_count).zfill(3) + '.hdf5')
        # CustomMetrics (imported via the star imports above) presumably
        # checkpoints the best weights; it replaces the commented-out
        # kcallbacks.ModelCheckpoint on 'crf_viterbi_accuracy' used previously
        custom_metrics = CustomMetrics(best_weights_filepath)
        earlyStopping = kcallbacks.EarlyStopping(monitor='crf_viterbi_accuracy',
                                                 patience=15,
                                                 verbose=1,
                                                 mode='auto')
        hist = model.fit(
            train_sentences, train_tags,
            validation_data=(val_sentences, val_tags),
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            shuffle=True,
            callbacks=[earlyStopping, custom_metrics],
            verbose=1)
        # restore the best checkpoint before scoring and predicting
        model.load_weights(best_weights_filepath)
        print(model_count, "validation loss:", min(hist.history["val_loss"]))
        best_vali_score[model_count] = min(hist.history["val_loss"])
        # predict on the test set and save this fold's decoded predictions
        test_preds = model.predict(test_data, batch_size=BATCH_SIZE, verbose=1)
        test_preds = decode(test_preds)
        save_path = os.path.join(test_save_folder, str(model_count).zfill(3) + '.npy')
        np.save(save_path, test_preds)
        print("Test preds saved:", model_count)
    for index, loss in best_vali_score.items():
        print(encoder_type, index, loss)
    result_path = os.path.join('data/result', encoder_type + '_result.txt')
    test_dataset = (test_data, test_data_map, test_mask)
    test_results = merge_results(test_save_folder, test_dataset)
    write_results(test_results, raw_test_data, result_path)
if __name__ == '__main__':
    # load the preprocessed dataset
    with open('data/anns/dataset.pkl', 'rb') as f:
        data, tags, test_data, test_data_map, test_mask, raw_test_data, embeddings, word_2_id = pickle.load(f)
    print('sentences:', data.shape)
    print('tags:', tags.shape)
    print('embed_matrix:', embeddings.shape)
    # split the training set into SPLIT_NUM folds
    length = data.shape[0]
    split_index = get_split_indexs(length, SPLIT_NUM)
    encoder_type = '2bi'  # with run() commented out, this only names the result folder
    # import time
    # start = time.time()
    # input_data = (data, tags, test_data, test_data_map, test_mask, raw_test_data, embeddings, word_2_id)
    # run(input_data, split_index, encoder_type)
    # end = time.time()
    # print('Training time {0:.3f} minutes'.format((end - start) / 60))
    # training is disabled above; by default the script only merges previously
    # saved fold predictions from data/result/<encoder_type> into a result file
    result_path = os.path.join('data/result', encoder_type + '_result.txt')
    test_dataset = (test_data, test_data_map, test_mask)
    test_results = merge_results('data/result/' + encoder_type, test_dataset)
    write_results(test_results, raw_test_data, result_path)
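# For reference, get_split_indexs comes from utils.data_utils. A minimal
# sketch of what such a fold splitter could look like (an assumption, not the
# project's actual implementation):
#
#     def get_split_indexs(length, split_num):
#         idx = np.random.permutation(length)    # shuffle all sample indices
#         return np.array_split(idx, split_num)  # split_num near-equal folds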