# classify_video.py classifies a video with eight models:
#   (1) single-frame RGB, (2) single-frame flow,
#   (3) 0.5/0.5 single-frame RGB/flow fusion, (4) 0.33/0.67 single-frame RGB/flow fusion,
#   (5) LRCN RGB, (6) LRCN flow,
#   (7) 0.5/0.5 LRCN RGB/flow fusion, (8) 0.33/0.67 LRCN RGB/flow fusion.
# Before using, change RGB_video_path and flow_video_path below.
# Usage: classify_video.py [video], where video is the video to classify.
# If no video is specified, the video "v_Archery_g01_c01" is classified.
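# Example: python classify_video.py v_Archery_g01_c01
# (per the glob patterns below, this expects RGB frames in
# frames/v_Archery_g01_c01/*.jpg and flow images in flow_images/v_Archery_g01_c01/*.jpg)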
import glob
import pickle
import sys

import numpy as np

# Make pycaffe importable.
caffe_root = '../../'
sys.path.insert(0, caffe_root + 'python')
import caffe

caffe.set_mode_gpu()
caffe.set_device(0)

RGB_video_path = 'frames/'
flow_video_path = 'flow_images/'

if len(sys.argv) > 1:
  video = sys.argv[1]
else:
  video = 'v_Archery_g01_c01'
# Initialize transformers that preprocess RGB frames and flow images.
def initialize_transformer(image_mean, is_flow):
  shape = (10*16, 3, 227, 227)  # 16 frames x 10 crops per frame
  transformer = caffe.io.Transformer({'data': shape})
  channel_mean = np.zeros((3, 227, 227))
  for channel_index, mean_val in enumerate(image_mean):
    channel_mean[channel_index, ...] = mean_val
  transformer.set_mean('data', channel_mean)
  transformer.set_raw_scale('data', 255)
  transformer.set_channel_swap('data', (2, 1, 0))
  transformer.set_transpose('data', (2, 0, 1))
  transformer.set_is_flow('data', is_flow)
  return transformer
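# Channel means. The transformer swaps channels to BGR before subtracting the
# mean, so the RGB means below are listed in BGR order; flow images use a
# constant mean of 128, the midpoint of the 0-255 flow encoding.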
ucf_mean_RGB = np.zeros((3, 1, 1))
ucf_mean_flow = np.zeros((3, 1, 1))
ucf_mean_flow[:, :, :] = 128
ucf_mean_RGB[0, :, :] = 103.939
ucf_mean_RGB[1, :, :] = 116.779
ucf_mean_RGB[2, :, :] = 128.68

transformer_RGB = initialize_transformer(ucf_mean_RGB, False)
transformer_flow = initialize_transformer(ucf_mean_flow, True)
# Extract the list of frames in the video, sorted so clips stay in temporal order
# (glob returns files in arbitrary order).
RGB_frames = sorted(glob.glob('%s%s/*.jpg' % (RGB_video_path, video)))
flow_frames = sorted(glob.glob('%s%s/*.jpg' % (flow_video_path, video)))
# Classify a video with an LRCN model.
def LRCN_classify_video(frames, net, transformer, is_flow):
  clip_length = 16
  offset = 8
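  # The video is covered with overlapping 16-frame clips that start every
  # 8 frames; each clip is classified independently and the per-frame
  # probabilities are averaged at the end.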
  input_images = []
  for im in frames:
    input_im = caffe.io.load_image(im)
    if input_im.shape[0] < 240:
      input_im = caffe.io.resize_image(input_im, (240, 320))
    input_images.append(input_im)
  vid_length = len(input_images)
  input_data = []
  for i in range(0, vid_length, offset):
    if (i + clip_length) < vid_length:
      input_data.extend(input_images[i:i+clip_length])
    else:  # video may not be divisible by clip_length
      input_data.extend(input_images[-clip_length:])
      break  # the tail clip only needs to be added once
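  # oversample() turns each frame into 10 crops (4 corners + center, plus
  # their mirrors), so a 16-frame clip becomes a 160-image, frame-major batch.
  # clip_markers are 1 within a clip and 0 on its first frame (the first 10
  # crops), which resets the LSTM hidden state at each clip boundary.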
  output_predictions = np.zeros((len(input_data), 101))
  for i in range(0, len(input_data), clip_length):
    clip_input = input_data[i:i+clip_length]
    clip_input = caffe.io.oversample(clip_input, [227, 227])
    clip_clip_markers = np.ones((clip_input.shape[0], 1, 1, 1))
    clip_clip_markers[0:10, :, :, :] = 0
    # if is_flow:  # need to negate the values when mirroring
    #   clip_input[5:, :, :, 0] = 1 - clip_input[5:, :, :, 0]
    caffe_in = np.zeros(np.array(clip_input.shape)[[0, 3, 1, 2]], dtype=np.float32)
    for ix, inputs in enumerate(clip_input):
      caffe_in[ix] = transformer.preprocess('data', inputs)
    out = net.forward_all(data=caffe_in, clip_markers=np.array(clip_clip_markers))
    output_predictions[i:i+clip_length] = np.mean(out['probs'], 1)
  return np.mean(output_predictions, 0).argmax(), output_predictions
# Classify a video with a single-frame model.
def singleFrame_classify_video(frames, net, transformer, is_flow):
  batch_size = 16
  input_images = []
  for im in frames:
    input_im = caffe.io.load_image(im)
    if input_im.shape[0] < 240:
      input_im = caffe.io.resize_image(input_im, (240, 320))
    input_images.append(input_im)
  vid_length = len(input_images)
  output_predictions = np.zeros((vid_length, 101))
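  # Frames are processed in batches of 16; oversampling turns each frame into
  # 10 crops, and the 10 crop predictions are averaged back into a single
  # prediction per frame.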
  for i in range(0, vid_length, batch_size):
    clip_input = input_images[i:min(i+batch_size, vid_length)]
    clip_input = caffe.io.oversample(clip_input, [227, 227])
    if is_flow:
      # Negate the x flow values for the mirrored crops; oversample mirrors
      # the last 5 of each frame's 10 crops.
      mirror_ix = np.arange(clip_input.shape[0]) % 10 >= 5
      clip_input[mirror_ix, :, :, 0] = 1 - clip_input[mirror_ix, :, :, 0]
    caffe_in = np.zeros(np.array(clip_input.shape)[[0, 3, 1, 2]], dtype=np.float32)
    for ix, inputs in enumerate(clip_input):
      caffe_in[ix] = transformer.preprocess('data', inputs)
    net.blobs['data'].reshape(caffe_in.shape[0], caffe_in.shape[1],
                              caffe_in.shape[2], caffe_in.shape[3])
    out = net.forward_all(data=caffe_in)
    # probs come back frame-major with 10 crops per frame; average over crops.
    output_predictions[i:i+batch_size] = np.mean(
        out['probs'].reshape(caffe_in.shape[0] // 10, 10, 101), 1)
  return np.mean(output_predictions, 0).argmax(), output_predictions
# Models and weights
singleFrame_model = 'deploy_singleFrame.prototxt'
lstm_model = 'deploy_lstm.prototxt'
RGB_singleFrame = 'single_frame_all_layers_hyb_RGB_iter_5000.caffemodel'
flow_singleFrame = 'single_frame_all_layers_hyb_flow_iter_50000.caffemodel'
RGB_lstm = 'RGB_lstm_model_iter_30000.caffemodel'
flow_lstm = 'flow_lstm_model_iter_50000.caffemodel'
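# Each net below is loaded, run over the video, and deleted before the next
# one is loaded, so only one model occupies GPU memory at a time.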
RGB_singleFrame_net = caffe.Net(singleFrame_model, RGB_singleFrame, caffe.TEST)
class_RGB_singleFrame, predictions_RGB_singleFrame = \
    singleFrame_classify_video(RGB_frames, RGB_singleFrame_net, transformer_RGB, False)
del RGB_singleFrame_net

flow_singleFrame_net = caffe.Net(singleFrame_model, flow_singleFrame, caffe.TEST)
class_flow_singleFrame, predictions_flow_singleFrame = \
    singleFrame_classify_video(flow_frames, flow_singleFrame_net, transformer_flow, True)
del flow_singleFrame_net

RGB_lstm_net = caffe.Net(lstm_model, RGB_lstm, caffe.TEST)
class_RGB_LRCN, predictions_RGB_LRCN = \
    LRCN_classify_video(RGB_frames, RGB_lstm_net, transformer_RGB, False)
del RGB_lstm_net

flow_lstm_net = caffe.Net(lstm_model, flow_lstm, caffe.TEST)
class_flow_LRCN, predictions_flow_LRCN = \
    LRCN_classify_video(flow_frames, flow_lstm_net, transformer_flow, True)
del flow_lstm_net
def compute_fusion(RGB_pred, flow_pred, p):
  return np.argmax(p*np.mean(RGB_pred, 0) + (1-p)*np.mean(flow_pred, 0))
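# Late fusion: each stream's per-frame probabilities are averaged over time,
# then combined with weight p on RGB and (1 - p) on flow; the 0.33/0.67
# setting weights the flow stream twice as heavily as RGB.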
# Load the mapping from class index to activity label.
action_hash = pickle.load(open('action_hash_rev.p', 'rb'))

print "RGB single-frame model classified video as: %s.\n" % (action_hash[class_RGB_singleFrame])
print "Flow single-frame model classified video as: %s.\n" % (action_hash[class_flow_singleFrame])
print "RGB LRCN model classified video as: %s.\n" % (action_hash[class_RGB_LRCN])
print "Flow LRCN model classified video as: %s.\n" % (action_hash[class_flow_LRCN])
print "0.5/0.5 single-frame fusion model classified video as: %s.\n" % (action_hash[compute_fusion(predictions_RGB_singleFrame, predictions_flow_singleFrame, 0.5)])
print "0.33/0.67 single-frame fusion model classified video as: %s.\n" % (action_hash[compute_fusion(predictions_RGB_singleFrame, predictions_flow_singleFrame, 0.33)])
print "0.5/0.5 LRCN fusion model classified video as: %s.\n" % (action_hash[compute_fusion(predictions_RGB_LRCN, predictions_flow_LRCN, 0.5)])
print "0.33/0.67 LRCN fusion model classified video as: %s.\n" % (action_hash[compute_fusion(predictions_RGB_LRCN, predictions_flow_LRCN, 0.33)])