tf_test.py
import json
import logging
import os
from os.path import join as pjoin

import numpy as np
import pandas as pd
import tensorflow as tf

from hs_model import HateSpeechSystem
from tf_data import *
from utility import hatebase_features
from utility.twitter_preprocess import clean

logging.basicConfig(level=logging.INFO)

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string("model_type", "classic", "classic / hb_embed / hb_append")
tf.app.flags.DEFINE_boolean("bidirectional", False, "Bidirectionality of LSTM")
tf.app.flags.DEFINE_string("scoring", "f1_macro", "auc / f1_macro")
tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
tf.app.flags.DEFINE_float("max_gradient_norm", 10.0, "Clip gradients to this norm.")
tf.app.flags.DEFINE_float("input_dropout", 0.0, "Fraction of input units randomly dropped.")
tf.app.flags.DEFINE_float("output_dropout", 0.0, "Fraction of output units randomly dropped.")
tf.app.flags.DEFINE_float("state_dropout", 0.0, "Fraction of units randomly dropped on recurrent connections.")
tf.app.flags.DEFINE_float("embedding_dropout", 0.0, "Fraction of embeddings randomly dropped on lookup.")
tf.app.flags.DEFINE_string("optimizer", "adam", "adam / sgd")
tf.app.flags.DEFINE_integer("batch_size", 16, "Batch size to use during training.")
tf.app.flags.DEFINE_integer("epochs", 1000, "Number of epochs to train.")
tf.app.flags.DEFINE_integer("state_size", 100, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 2, "Number of encoder layers.")
tf.app.flags.DEFINE_integer("tweet_size", 32, "The length of a tweet (example).")
tf.app.flags.DEFINE_integer("output_size", 3, "The output size, aka number of classes.")
tf.app.flags.DEFINE_integer("embedding_size", 100, "Size of the pretrained vocabulary. (default 100)")
tf.app.flags.DEFINE_integer("hatebase_size", 7, "Number of hatebase features. (default 8)")
tf.app.flags.DEFINE_string("data_dir", "data/twitter_davidson", "SQuAD directory (default ./data/twitter_davidson)")
tf.app.flags.DEFINE_string("train_dir", "train", "CHECKPOINT directory (default: ./train).")
tf.app.flags.DEFINE_string("log_dir", "log", "Path to store log and flag files (default: ./log)")
tf.app.flags.DEFINE_integer("print_every", 100, "How many iterations to do per print.")
tf.app.flags.DEFINE_integer("save_every", 1200, "How many iterations to do per save checkpoint and evaluate.")
tf.app.flags.DEFINE_string("vocab_path", "data/twitter_davidson/vocab.dat", "Path to vocab file (default: ./data/twitter_davidson/vocab.dat)")
tf.app.flags.DEFINE_string("vocab_stemmed", False, "Whether to use stemmed vocabulary (False for GloVe, True for others)")
tf.app.flags.DEFINE_string("embed_path", "", "Path to the GLoVe embedding (default: ./data/twitter_davidson/embeddings.{embedding_size}d.dat)")
tf.app.flags.DEFINE_string("embed_trainable", False, "Whether to train embeddings (False for GloVe, True for others)")
tf.app.flags.DEFINE_string("test_path", "", "Path to file with sentences to test (Default empty is REPL)")

def initialize_model(session, model, train_dir, seed=42):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    print ckpt
    v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
    logging.info(str(vars(FLAGS)))
    if ckpt and (tf.gfile.Exists(ckpt.model_checkpoint_path) or tf.gfile.Exists(v2_path)):
        logging.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logging.info("Created model with fresh parameters.")
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        session.run(init_op)
        logging.info('Num params: %d' % sum(v.get_shape().num_elements() for v in tf.trainable_variables()))
    return model
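
# Note on the '.index' probe above: with TF's V2 checkpoint format,
# model_checkpoint_path is only a filename prefix; the actual files on disk
# are '<prefix>.index' and '<prefix>.data-*', so tf.gfile.Exists(v2_path)
# is what detects a V2 checkpoint.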

def initialize_vocab(vocab_path):
    if tf.gfile.Exists(vocab_path):
        rev_vocab = []
        with tf.gfile.GFile(vocab_path, mode="rb") as f:
            rev_vocab.extend(f.readlines())
        # index to word
        rev_vocab = [line.strip('\n') for line in rev_vocab]
        # word to index
        vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
        return vocab, rev_vocab
    else:
        raise ValueError("Vocabulary file %s not found." % vocab_path)
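
# For illustration (hypothetical tokens): a vocab.dat whose first three lines
# are '<pad>', '<unk>', 'the' would yield
#   rev_vocab = ['<pad>', '<unk>', 'the']             # index -> word
#   vocab     = {'<pad>': 0, '<unk>': 1, 'the': 2}    # word -> index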

def load_dataset(x_file, y_file):
    with open(x_file, 'rb') as x_file, open(y_file, 'rb') as y_file:
        data_x = np.loadtxt(x_file, delimiter=' ', dtype=int)
        if FLAGS.output_size == 2:
            data_y = pd.read_csv(y_file, header=0, quoting=0, usecols=['hate_speech'], dtype=int)
        elif FLAGS.output_size == 3:
            data_y = pd.read_csv(y_file, header=0, quoting=0, usecols=['class'], dtype=int)
        data_y = data_y.values.ravel()
        return data_x, data_y
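
# On-disk formats implied by the loader above: x_file holds one tweet per
# line as space-separated token ids (read with np.loadtxt), and y_file is a
# headered CSV with a binary 'hate_speech' column (output_size == 2) or a
# three-way 'class' column (output_size == 3).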

def raw_to_ids(lines, vocab, tok):
    lines = [clean(l) for l in lines]
    ids = [sentence_to_token_ids(l, vocab, tokenizer=tok, pad=True) for l in lines]
    # if FLAGS.model_type == 'hb_append':
    #     hb = hatebase_features(lines, tokenizer=tok)
    #     ids.extend(hb)
    return ids
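
# Sketch of the call above (token ids are illustrative, and pad=True
# presumably pads/truncates each sentence to FLAGS.tweet_size):
#   raw_to_ids(["you are great"], vocab, basic_tokenizer)
#   -> clean() normalizes the raw text, then sentence_to_token_ids() maps
#      each token to its vocab index, e.g. [[23, 7, 411, 0, 0, ...]]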

def main(_):
    if FLAGS.embed_path:
        FLAGS.embed_path = FLAGS.embed_path.format(FLAGS.embedding_size)
    else:
        FLAGS.embed_path = pjoin("data", "twitter_davidson", "embeddings.{}d.dat".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    with open(pjoin(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    hb = 'hb.' if FLAGS.model_type == 'hb_append' else ''
    stemmed = 'stemmed.' if FLAGS.vocab_stemmed else ''
    train_x_path = pjoin(FLAGS.data_dir, "train.ids.{}d.{}{}vec".format(FLAGS.tweet_size, hb, stemmed))
    test_x_path = pjoin(FLAGS.data_dir, "test.ids.{}d.{}{}vec".format(FLAGS.tweet_size, hb, stemmed))
    print train_x_path, test_x_path
    train_y_path = pjoin(FLAGS.data_dir, "train.y")
    test_y_path = pjoin(FLAGS.data_dir, "test.y")
    train_x, train_y = load_dataset(train_x_path, train_y_path)
    test_x, test_y = load_dataset(test_x_path, test_y_path)

    # vocab maps word -> index; rev_vocab is the index -> word list.
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    FLAGS.vocab_len = len(vocab)
    tok = stem_tokenizer if FLAGS.vocab_stemmed else basic_tokenizer

    hs = HateSpeechSystem(FLAGS, train_x, train_y)
    with tf.Session() as sess:
        initialize_model(sess, hs, FLAGS.train_dir)
        if not FLAGS.test_path:
            # No test file given: drop into an interactive classification loop.
            while True:
                token_ids = raw_to_ids([raw_input("Type sentence: ")], vocab, tok)
                print "Tokens:", token_ids
                proba = hs.eval.predict_proba(sess, token_ids)[0][0]
                print "Probability dist:", proba
                print "Prediction:", np.argmax(proba)
        else:
            # Batch mode: classify every tweet in the CSV and write the
            # predictions and encodings next to the input file.
            raw_lines = pd.read_csv(FLAGS.test_path, header=0, quoting=0)['tweet']
            token_ids = raw_to_ids(raw_lines, vocab, tok)
            pred, encod = hs.eval.predict(sess, token_ids, get_encoding=True)
            pd.DataFrame(pred).to_csv(FLAGS.test_path.split('.')[0] + '.pred.csv')
            pd.DataFrame(encod).to_csv(FLAGS.test_path.split('.')[0] + '.encod.csv')


if __name__ == "__main__":
    tf.app.run()