# input_helper.py (forked from ImDahmash/Seq2Seq_NMT)
import numpy as np
import tensorflow as tf

# Special vocabulary ids shared by the encoder and decoder.
GO_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2


def tokenize_and_map(line, vocab):
    """Map a whitespace-tokenized line to vocab ids, falling back to UNK_TOKEN."""
    # strip() drops the trailing newline so the last token is not mis-mapped to UNK.
    return [vocab.get(token, UNK_TOKEN) for token in line.strip().split(' ')]
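
# For example (hedged illustration, not part of the original file):
#   vocab = {'hello': 3, 'world': 4}
#   tokenize_and_map('hello there world\n', vocab)  ->  [3, 2, 4]   # 2 == UNK_TOKEN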


def make_input_fn(
        batch_size, input_filename, output_filename, vocab,
        input_max_length, output_max_length,
        input_process=tokenize_and_map, output_process=tokenize_and_map):
    """Build an (input_fn, feed_fn) pair for feeding batches into a TF1 Estimator."""

    def input_fn():
        # Placeholders are filled at run time by feed_fn through the feed dict.
        source = tf.placeholder(tf.int64, shape=[None, None], name='input')
        target = tf.placeholder(tf.int64, shape=[None, None], name='output')
        # Expose the first example of each batch under a stable name for logging hooks.
        tf.identity(source[0], 'input_0')
        tf.identity(target[0], 'output_0')
        return {
            'input': source,
            'output': target,
        }, None

    def sampler():
        # Cycle through the parallel corpus forever, yielding one source/target
        # pair at a time, truncated to max_length - 1 and terminated with END_TOKEN.
        while True:
            with open(input_filename) as finput:
                with open(output_filename) as foutput:
                    for in_line in finput:
                        out_line = foutput.readline()
                        yield {
                            'input': input_process(in_line, vocab)[:input_max_length - 1] + [END_TOKEN],
                            'output': output_process(out_line, vocab)[:output_max_length - 1] + [END_TOKEN],
                        }

    sample_me = sampler()

    def feed_fn():
        inputs, outputs = [], []
        input_length, output_length = 0, 0
        for i in range(batch_size):
            rec = next(sample_me)
            inputs.append(rec['input'])
            outputs.append(rec['output'])
            input_length = max(input_length, len(inputs[-1]))
            output_length = max(output_length, len(outputs[-1]))
        # Right-pad every sequence with the </S> token so the batch is rectangular.
        for i in range(batch_size):
            inputs[i] += [END_TOKEN] * (input_length - len(inputs[i]))
            outputs[i] += [END_TOKEN] * (output_length - len(outputs[i]))
        # Keys are the placeholder tensor names defined in input_fn.
        return {
            'input:0': inputs,
            'output:0': outputs,
        }

    return input_fn, feed_fn
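

# Hedged usage sketch (not part of the original file): wiring input_fn and
# feed_fn into a TF1 Estimator. tf.train.FeedFnHook pushes feed_fn's batches
# into the placeholders; `my_model_fn` and the data paths are hypothetical.
def _example_train(my_model_fn, vocab):
    input_fn, feed_fn = make_input_fn(
        batch_size=32,
        input_filename='data/train.src',   # hypothetical path
        output_filename='data/train.tgt',  # hypothetical path
        vocab=vocab,
        input_max_length=30, output_max_length=30)
    estimator = tf.estimator.Estimator(model_fn=my_model_fn, model_dir='model_dir')
    estimator.train(input_fn=input_fn,
                    hooks=[tf.train.FeedFnHook(feed_fn)],
                    steps=1000)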


def predict_input_fn(input_filename, vocab, input_process=tokenize_and_map):
    """Read a file of sentences and return a right-padded id matrix for prediction."""
    # First pass: find the longest line so every row can be padded to max_len + 1.
    max_len = 0
    with open(input_filename) as finput:
        for in_line in finput:
            max_len = max(len(in_line.split(" ")), max_len)
    # Second pass: map each lowercased line to ids, pad with UNK_TOKEN and
    # terminate with a single END_TOKEN.
    rows = []
    with open(input_filename) as finput:
        for in_line in finput:
            in_line = in_line.lower()
            ids = np.array(input_process(in_line, vocab), dtype=int)
            row = np.append(ids, np.array(
                [UNK_TOKEN] * (max_len - len(ids)) + [END_TOKEN], dtype=int))
            rows.append(row)
    pred_lines = np.vstack(rows)
    return {'input': pred_lines}
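

# Hedged usage sketch: feeding the padded id matrix from predict_input_fn to a
# trained Estimator through TF1's numpy_input_fn; the test-file path is
# hypothetical.
def _example_predict(estimator, vocab):
    features = predict_input_fn('data/test.src', vocab)  # hypothetical path
    pred_input_fn = tf.estimator.inputs.numpy_input_fn(
        x=features, shuffle=False)
    return list(estimator.predict(input_fn=pred_input_fn))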


def load_vocab(filename):
    """Load a 'token=index' vocabulary file into a dict mapping token -> int id."""
    vocab = {}
    with open(filename) as f:
        for idx, line in enumerate(f):
            tmp_val = (line.strip(" \n")).split("=")
            vocab[tmp_val[0]] = int(tmp_val[1])
    return vocab
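
# The vocab file is expected to hold one 'token=index' pair per line (an
# assumption inferred from the '=' split above), e.g.:
#
#   <S>=0
#   </S>=1
#   <UNK>=2
#   hello=3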


def get_rev_vocab(vocab):
    return {idx: key for key, idx in vocab.items()}


def get_formatter(keys, vocab):
    rev_vocab = get_rev_vocab(vocab)

    def to_str(sequence):
        tokens = [rev_vocab.get(x, "<UNK>") for x in sequence]
        return ' '.join(tokens)

    def format(values):
        res = []
        for key in keys:
            res.append("%s = %s" % (key, to_str(values[key])))
        return '\n'.join(res)

    return format
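

# Hedged sketch: the formatter is typically handed to a LoggingTensorHook so a
# decoded example is printed during training; the tensor names 'input_0' and
# 'output_0' come from the tf.identity calls in input_fn above.
def _example_logging_hook(vocab):
    return tf.train.LoggingTensorHook(
        ['input_0', 'output_0'], every_n_iter=100,
        formatter=get_formatter(['input_0', 'output_0'], vocab))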


def get_out_put_from_tokens(all_sentences, vocab):
    """Convert batches of predicted token ids back into space-joined strings."""
    rev_vocab = get_rev_vocab(vocab)
    all_string_sent = []
    for each_sent in all_sentences:
        string_sent = []
        for each_word in each_sent:
            # Fall back to "<UNK>" so join() never receives None for unknown ids.
            string_sent.append(rev_vocab.get(each_word, "<UNK>"))
        all_string_sent.append(' '.join(string_sent))
    return all_string_sent


def get_out_put_from_tokens_beam_search(all_sentences, vocab):
    """Like get_out_put_from_tokens, but keeps only the top beam of each prediction."""
    rev_vocab = get_rev_vocab(vocab)
    all_string_sent = []
    for each_sent in all_sentences:
        # each_sent has shape [time, beam_width]; column 0 is the highest-scoring beam.
        each_sent = each_sent[:, 0]
        string_sent = []
        for each_word in each_sent:
            string_sent.append(rev_vocab.get(each_word, "<UNK>"))
        all_string_sent.append(' '.join(string_sent))
    return all_string_sent
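
# Hedged usage sketch: decoding beam-search output back to text. `predictions`
# is assumed to be an iterable of [time, beam_width] integer arrays, e.g. the
# per-example 'predicted_ids' produced by tf.contrib.seq2seq's
# BeamSearchDecoder; the vocab path is hypothetical.
#
#   vocab = load_vocab('data/vocab.txt')
#   sentences = get_out_put_from_tokens_beam_search(predictions, vocab)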