# training.py
import os
import string
from pickle import dump, load

import numpy as np
from PIL import Image
from tqdm import tqdm  # small library for showing the progress of loops

from tensorflow.keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model  # plot_model needs pydot and graphviz installed
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, concatenate
from tensorflow.keras.models import Model, load_model

# Optional: inside a Jupyter notebook you can use the notebook progress bar instead:
# from tqdm import tqdm_notebook as tqdm
# tqdm().pandas()
# Loading a text file into memory
def load_doc(filename):
    # Opening the file as read only
    with open(filename, 'r') as file:
        text = file.read()
    return text
# get all images with their captions
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        # img[:-2] strips the "#0".."#4" caption index from the image name
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions
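# Illustrative example of the mapping above (filename and caption are hypothetical;
# each line of Flickr8k.token.txt has the form "<image.jpg>#<n>\t<caption>"):
#   "12345.jpg#0\tA child in a pink dress is climbing up a set of stairs ."
# is stored as
#   descriptions["12345.jpg"] = ["A child in a pink dress is climbing up a set of stairs ."]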
# Data cleaning - lower casing, removing punctuation and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption = img_caption.replace("-", " ")
            desc = img_caption.split()
            # convert to lowercase
            desc = [word.lower() for word in desc]
            # remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            # remove hanging 's and a
            desc = [word for word in desc if len(word) > 1]
            # remove tokens with numbers in them
            desc = [word for word in desc if word.isalpha()]
            # convert back to string
            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
    return captions
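# Rough sketch of what the cleaning above does to a single caption (illustrative):
#   "A child in a pink dress is climbing up a set of stairs ."
# becomes
#   "child in pink dress is climbing up set of stairs"
# (lower-cased, punctuation stripped, single-character tokens such as "a" dropped).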
def text_vocabulary(descriptions):
    # build vocabulary of all unique words
    vocab = set()
    for key in descriptions.keys():
        for d in descriptions[key]:
            vocab.update(d.split())
    return vocab
# All descriptions in one file
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
    data = "\n".join(lines)
    with open(filename, "w") as file:
        file.write(data)
# Set these paths according to the project folder on your system
dataset_images = r"C:\Users\Sravan\Documents\openlab\Flicker8k_Dataset"
dataset_text = r"C:\Users\Sravan\Documents\openlab\Flickr8k_text"
# prepare our text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
# load the file that contains all captions and map them into the
# descriptions dictionary: image -> list of 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =", len(descriptions))
# cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)
# building vocabulary
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary =", len(vocabulary))
# saving each description to file
save_descriptions(clean_descriptions, "descriptions.txt")
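# descriptions.txt now holds one "<image.jpg>\t<cleaned caption>" line per caption,
# which is the format load_clean_descriptions() below expects to read back.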
def extract_features(directory):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = directory + "/" + img
        image = Image.open(filename)
        image = image.resize((299, 299))
        if image.mode != "RGB":
            # Xception expects 3-channel input
            image = image.convert("RGB")
        image = np.expand_dims(np.array(image), axis=0)
        # scale pixels to [-1, 1]; this mirrors what preprocess_input would do
        # image = preprocess_input(image)
        image = image / 127.5
        image = image - 1.0
        feature = model.predict(image)
        features[img] = feature
    return features
#2048 feature vector
features = extract_features(dataset_images)
dump(features, open("features1.p","wb"))
#to directly load the features from the pickle file.
features = load(open("features1.p","rb"))
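# Quick sanity check (assumes the features pickle was written/loaded above): each value
# should be a (1, 2048) array, since Xception with pooling='avg' yields one
# 2048-dimensional vector per image.
# sample_key = next(iter(features))
# print(sample_key, features[sample_key].shape)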
# load the data
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos
def load_clean_descriptions(filename, photos):
    # loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        image, image_caption = words[0], words[1:]
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)
    return descriptions
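# After wrapping, a stored caption looks like (illustrative):
#   "<start> child in pink dress is climbing up set of stairs <end>"
# The <start>/<end> markers tell the decoder where a caption begins and ends.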
def load_features(photos):
    # loading all features
    all_features = load(open("features1.p", "rb"))
    # selecting only the needed features
    features = {k: all_features[k] for k in photos}
    return features
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"
#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)
# converting dictionary to a clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        all_desc.extend(descriptions[key])
    return all_desc
# creating the tokenizer
# this will vectorise the text corpus
# each integer will represent a token in the dictionary
def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer
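# Illustrative use of the fitted tokenizer (the integer ids shown are made up; actual
# ids depend on word frequency in the training captions):
# tokenizer.texts_to_sequences(["start two dogs run end"])
# -> [[1, 42, 187, 203, 2]]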
# give each word an index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
print(max_length)
# data generator, consumed by model.fit()
def data_generator(descriptions, features, tokenizer, max_length):
    while True:
        for key, description_list in descriptions.items():
            # retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ([input_image, input_sequence], output_word)
def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output word
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)
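# Worked example of the split above (token ids are illustrative): for the encoded
# caption [1, 9, 36, 2] ("<start> dog runs <end>") with max_length = 4, three training
# pairs are produced:
#   X2 = [0, 0, 0, 1]  -> y = 9
#   X2 = [0, 0, 1, 9]  -> y = 36
#   X2 = [0, 1, 9, 36] -> y = 2
# with the same 2048-d image feature repeated in X1 for every pair, and y one-hot encoded.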
#You can check the shape of the input and output for your model
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
print(a.shape, b.shape, c.shape)
# define the captioning model
def define_model(vocab_size, max_length):
    # features from the CNN model, squeezed from 2048 to 512 nodes
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(512, activation='relu')(fe1)
    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 512, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(512)(se2)
    # merging both models
    decoder1 = concatenate([fe2, se3])
    decoder2 = Dense(512, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    # tie it together: [image, seq] -> [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize the model (plot_model needs pydot and graphviz installed)
    print(model.summary())
    plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
    return model
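# Shapes the model expects, per the definition above:
#   image input:    (batch, 2048)
#   sequence input: (batch, max_length)
#   output:         (batch, vocab_size) softmax over the vocabulary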
# train our model
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)
model = define_model(vocab_size, max_length)
epochs = 40
steps = len(train_descriptions)
# make a "models" directory to save our checkpoints (no error if it already exists)
os.makedirs("models", exist_ok=True)
for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")
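# Minimal sketch of reloading a saved checkpoint later for inference (the path below is
# illustrative; greedy caption generation would be built on top of this in a separate script):
# trained = load_model("models/model_39.h5")
# trained.summary()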