utils.py
import librosa
import numpy as np
from scipy.io import wavfile
import av
import ipdb  # used in trunkate_audio_and_video; was missing from the imports
import python_speech_features as psf
import torchvision
from PIL import Image
import torch


def load_wav(path, fr=0, to=10000, sample_rate=16000):
    """Loads audio samples from `path` between `fr` and `to` (in seconds).

    The sample rate returned by wavfile.read is discarded, so the file is
    assumed to already be sampled at `sample_rate`.
    """
    _, wav = wavfile.read(path)
    # Convert the start/end times from seconds to sample indices.
    fr_aud = int(np.round(fr * sample_rate))
    to_aud = int(np.round(to * sample_rate))
    wav = wav[fr_aud:to_aud]
    return wav
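
# Usage sketch (the path below is hypothetical): grab the 1 s - 3 s span of a
# 16 kHz mono wav, yielding 32000 samples.
#
#   wav = load_wav("clip.wav", fr=1, to=3, sample_rate=16000)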
def load_video(vid_path, resize=None):
    """Decodes every frame of the video into a (T, H, W, 3) float32 array."""
    container = av.open(vid_path)
    ims = [frame.to_image() for frame in container.decode(video=0)]
    if resize:
        # torchvision's functional resize operates directly on PIL images,
        # so there is no need to round-trip through numpy first.
        ims = [
            torchvision.transforms.functional.resize(im, resize)
            for im in ims
        ]
    frames = np.array([np.array(im) for im in ims])
    return frames.astype('float32')
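
# Usage sketch (the path below is hypothetical): decode a clip and resize
# every frame to 224x224.
#
#   frames = load_video("clip.mp4", resize=(224, 224))
#   # frames.shape == (num_frames, 224, 224, 3), dtype float32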
def trunkate_audio_and_video(video, aud_feats, aud_fact):
    """Truncates audio features and video frames to a common length.

    `aud_fact` is the number of audio feature frames per video frame.
    """
    # Make the audio feature length exactly divisible by aud_fact,
    # capped at the number of available video frames.
    aud_cutoff = min(video.shape[0], int(aud_feats.shape[0] / aud_fact))
    aud_feats = aud_feats[:aud_cutoff * aud_fact]
    aud_in_frames = aud_feats.shape[0] // aud_fact
    min_len = min(aud_in_frames, video.shape[0])
    # --- truncate both streams to the common minimum length
    video = video[:min_len]
    aud_feats = aud_feats[:min_len * aud_fact]
    if aud_feats.shape[0] // aud_fact != video.shape[0]:
        ipdb.set_trace(context=20)
    return aud_feats, video
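
# Alignment sketch (assuming 4 audio feature frames per video frame, e.g.
# 100 fps MFCCs against 25 fps video): trailing audio frames that do not
# fill a whole video frame are dropped.
#
#   video = np.zeros((100, 224, 224, 3))
#   aud_feats = np.zeros((407, 13))
#   aud_feats, video = trunkate_audio_and_video(video, aud_feats, aud_fact=4)
#   # video.shape[0] == 100, aud_feats.shape[0] == 400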
def logsoftmax_2d(logits):
    # torch's LogSoftmax accepts only a single dim, so flatten the last
    # two dims, apply it there, and restore the original shape.
    orig_shape = logits.shape
    logprobs = torch.nn.LogSoftmax(dim=-1)(
        logits.reshape(list(logits.shape[:-2]) + [-1])).reshape(orig_shape)
    return logprobs
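
# Sanity-check sketch: exponentiating the output and summing over the last
# two dims should give ones, since those dims now form a single softmax.
#
#   logits = torch.randn(4, 8, 8)
#   logprobs = logsoftmax_2d(logits)
#   # torch.allclose(logprobs.exp().sum(dim=(-2, -1)), torch.ones(4), atol=1e-5)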