-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmain.py
executable file
·234 lines (210 loc) · 7.41 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python3
import argparse
import time
from loguru import logger
from pprint import pprint
from src.input_feeder import InputFeeder
from src.model import (
Facial_Landmarks,
Face_Detection,
Head_Pose_Estimation,
Gaze_Estimation,
)
from src.mouse_controller import MouseController
def arg_parser(argv=None) -> argparse.Namespace:
    """Parse command line arguments.

    :param argv: optional list of argument strings to parse. When ``None``
        (the default) ``argparse`` reads ``sys.argv[1:]``, which is what the
        script entry point relies on.
    :return: command line arguments
    """
    parser = argparse.ArgumentParser()

    # Paths to the four models used by the pipeline (all mandatory).
    parser.add_argument(
        "-fm",
        "--face-model",
        required=True,
        type=str,
        help="Path to an xml file with a trained model.",
    )
    parser.add_argument(
        "-hp",
        "--head-pose-model",
        required=True,
        type=str,
        help="Path to an IR model representative for head-pose-model",
    )
    parser.add_argument(
        "-fl",
        "--facial-landmarks-model",
        required=True,
        type=str,
        help="Path to an IR model representative for facial-landmarks-model",
    )
    parser.add_argument(
        "-gm",
        "--gaze-model",
        required=True,
        type=str,
        help="Path to an IR model representative for gaze-model",
    )

    # Inference settings.
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        default="CPU",
        help="Specify the target device to infer on: "
        "CPU, GPU, FPGA or MYRIAD is acceptable. Sample "
        "will look for a suitable plugin for device "
        "specified (Default: CPU)",
    )
    parser.add_argument(
        "-pt",
        "--prob_threshold",
        type=float,
        default=0.8,
        # The original concatenated two literals without a space, which
        # rendered as "filtering(Default: 0.8)" in --help.
        help="Probability threshold for detections filtering (Default: 0.8)",
    )

    # Input / output.
    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=str,
        help="Path to image, video file or 'cam' for Webcam.",
    )
    parser.add_argument(
        "--out", action="store_true", help="Write video to file.",
    )

    # Mouse behaviour. nargs="?" with a const lets the flag be given without
    # a value, in which case the const (same as the default) is used.
    parser.add_argument(
        "-mp",
        "--mouse-precision",
        type=str,
        default="low",
        const="low",
        nargs="?",
        choices=["high", "low", "medium"],
        help="The precision for mouse movement (how much the mouse moves). [Default: low]",
    )
    parser.add_argument(
        "-ms",
        "--mouse-speed",
        type=str,
        default="fast",
        const="fast",
        nargs="?",
        choices=["fast", "slow", "medium"],
        help="The speed (how fast it moves) by changing [Default: fast]",
    )
    parser.add_argument(
        "--enable-mouse", action="store_true", help="Enable Mouse Movement",
    )

    # Debugging switches.
    parser.add_argument(
        "--show-bbox",
        action="store_true",
        help="Show bounding box and stats on screen [debugging].",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Show output on screen [debugging].",
    )
    parser.add_argument(
        "--stats",
        action="store_true",
        help="Verbose OpenVINO layer performance stats [debugging].",
    )

    return parser.parse_args(argv)
def main(args):
    """Run the gaze-estimation pipeline over every frame of the input.

    For each frame the face detector is run; every detected face whose crop
    is at least 20x20 pixels is passed through facial-landmark, head-pose and
    gaze estimation. Depending on the flags in ``args`` the gaze vector is
    used to move the mouse, debug text is drawn on the frame and shown, and
    per-model performance stats are printed once the input is exhausted.

    :param args: parsed command line arguments as returned by ``arg_parser``
    """
    mouse_controller = MouseController(
        precision=args.mouse_precision, speed=args.mouse_speed
    )
    video_feed = InputFeeder(input_file=args.input)

    # Everything after the feed is opened runs under try/finally so the feed
    # is closed even if model loading or inference raises.
    try:
        face_detection = Face_Detection(
            model_name=args.face_model,
            source_width=video_feed.source_width,
            source_height=video_feed.source_height,
            device=args.device,
            threshold=args.prob_threshold,
        )
        facial_landmarks = Facial_Landmarks(
            args.facial_landmarks_model, device=args.device
        )
        head_pose_estimation = Head_Pose_Estimation(
            args.head_pose_model, device=args.device
        )
        gaze_estimation = Gaze_Estimation(args.gaze_model, device=args.device)

        # NOTE(review): the division by 1000 suggests the per-model load
        # times are in milliseconds -- confirm against src.model.
        model_load_time = (
            face_detection._model_load_time
            + head_pose_estimation._model_load_time
            + facial_landmarks._model_load_time
            + gaze_estimation._model_load_time
        ) / 1000
        logger.info(
            f"Total time taken to load all the models: {model_load_time:.2f} secs."
        )

        count = 0
        for frame in video_feed.next_frame():
            count += 1
            # Reset per frame: the downstream timings only exist when at
            # least one face crop was large enough to be processed.
            face_processed = False

            predict_end_time, face_bboxes = face_detection.predict(
                frame, show_bbox=args.show_bbox
            )

            if face_bboxes:
                for face_bbox in face_bboxes:
                    # Useful resource: https://www.pyimagesearch.com/2018/09/24/opencv-face-recognition/
                    # Each bounding box from the face detection inference is
                    # `xmin, ymin, xmax, ymax`, so the face ROI is the slice
                    # frame[ymin:ymax, xmin:xmax].
                    (xmin, ymin, xmax, ymax) = face_bbox
                    face = frame[ymin:ymax, xmin:xmax]
                    (face_height, face_width) = face.shape[:2]

                    # ensure the face width and height are sufficiently large
                    if face_height < 20 or face_width < 20:
                        continue

                    facial_landmarks_pred_time, eyes_coords = facial_landmarks.predict(
                        face, show_bbox=args.show_bbox
                    )
                    hp_est_pred_time, head_pose_angles = head_pose_estimation.predict(
                        face, show_bbox=args.show_bbox
                    )
                    gaze_pred_time, gaze_vector = gaze_estimation.predict(
                        frame,
                        show_bbox=args.show_bbox,
                        face=face,
                        eyes_coords=eyes_coords,
                        head_pose_angles=head_pose_angles,
                    )
                    face_processed = True

                    if args.debug:
                        head_pose_estimation.show_text(frame, head_pose_angles)
                        gaze_estimation.show_text(frame, gaze_vector)

                    if args.enable_mouse:
                        mouse_controller.move(gaze_vector["x"], gaze_vector["y"])
            elif count % 10 == 0:
                # Only warn on every 10th frame to avoid flooding the log.
                logger.warning("Could not detect face in the frame.")

            if args.debug:
                if face_bboxes:
                    text = f"Face Detection Inference time: {predict_end_time:.3f} s"
                    face_detection.add_text(
                        text, frame, (15, video_feed.source_height - 80)
                    )
                    # Previously these three timings were formatted whenever a
                    # face was detected, which raised UnboundLocalError (or
                    # showed values from an earlier frame) when every face
                    # crop had been skipped as too small.
                    if face_processed:
                        text = (
                            f"Facial Landmarks Est. Inference time: "
                            f"{facial_landmarks_pred_time:.3f} s"
                        )
                        facial_landmarks.add_text(
                            text, frame, (15, video_feed.source_height - 60)
                        )
                        text = f"Head Pose Est. Inference time: {hp_est_pred_time:.3f} s"
                        head_pose_estimation.add_text(
                            text, frame, (15, video_feed.source_height - 40)
                        )
                        text = f"Gaze Est. Inference time: {gaze_pred_time:.3f} s"
                        gaze_estimation.add_text(
                            text, frame, (15, video_feed.source_height - 20)
                        )
                video_feed.show(video_feed.resize(frame))

        if args.stats:
            stats = {
                "face_detection": face_detection.perf_stats,
                "facial_landmarks": facial_landmarks.perf_stats,
                "head_pose_estimation": head_pose_estimation.perf_stats,
                "gaze_estimation": gaze_estimation.perf_stats,
            }
            pprint(stats)
    finally:
        video_feed.close()
if __name__ == "__main__":
    # Script entry point: parse the command line and hand it to the pipeline.
    main(arg_parser())