This repository has been archived by the owner on Jan 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpreprocess.py
139 lines (109 loc) · 3.95 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import sys
# Refuse to run on interpreters older than Python 3.2.
# The whole version tuple is compared at once: testing major and minor
# separately with `and` lets e.g. 2.7 (minor 7 >= 2) and 3.1 (major 3 >= 3)
# slip through the check.
if sys.version_info < (3, 2):
    raise Exception("Must be using >= Python 3.2")
import multiprocessing as mp
from concurrent.futures import ThreadPoolExecutor, as_completed
from os import listdir, path
import numpy as np
import argparse, os, cv2, traceback, subprocess
from tqdm import tqdm
import dlib, audio
# Module-level dlib frontal-face detector, created once at import time and
# called by face_detect() for every sampled frame.
detector = dlib.get_frontal_face_detector()
def rect_to_bb(rect):
    """Convert a rectangle object into an ``(x, y, w, h)`` bounding box.

    ``rect`` must expose ``left()``, ``top()``, ``right()`` and ``bottom()``
    methods. Width and height are the differences between opposite edges.
    """
    left, top = rect.left(), rect.top()
    width = rect.right() - left
    height = rect.bottom() - top
    return (left, top, width, height)
def calcMaxArea(rects):
    """Pick the rectangle with the largest area.

    Args:
        rects: iterable of rectangle objects accepted by ``rect_to_bb``.

    Returns:
        ``(max_cords, max_rect)`` where ``max_cords`` is the ``(x, y, w, h)``
        box of the largest rectangle and ``max_rect`` is the rectangle itself.
        When ``rects`` is empty, or no rectangle has a positive area, the
        result is ``((-1, -1, -1, -1), None)``.
    """
    max_cords = (-1, -1, -1, -1)
    max_area = 0
    max_rect = None
    # Iterate directly instead of indexing so any iterable is accepted.
    for cur_rect in rects:
        x, y, w, h = rect_to_bb(cur_rect)
        area = w * h
        # Strict comparison: on ties the first rectangle seen is kept, and
        # zero-area rectangles are never selected.
        if area > max_area:
            max_area = area
            max_cords = (x, y, w, h)
            max_rect = cur_rect
    return max_cords, max_rect
def face_detect(image):
    """Crop the largest detected face out of a BGR image.

    Args:
        image: image array in OpenCV BGR channel order (it is converted with
            ``cv2.COLOR_BGR2GRAY`` before detection).

    Returns:
        ``(face, True)`` with the cropped face region when a non-empty crop
        was found, otherwise ``(None, False)``.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Second argument is dlib's upsampling count (upsample once).
    rects = detector(gray, 1)
    (x, y, w, h), max_rect = calcMaxArea(rects)
    # Test the rectangle itself rather than `x == -1`: a real detection may
    # legitimately have its left edge at -1 when the face touches the border.
    if max_rect is None:
        return None, False

    # dlib may report boxes that extend past the top/left image border.
    # Negative slice bounds would wrap around in numpy, so clamp them to 0.
    top, left = max(0, y), max(0, x)
    bottom, right = max(0, y + h), max(0, x + w)
    faceAligned = image[top:bottom, left:right]
    if 0 in faceAligned.shape:
        return None, False
    return faceAligned, True
# Sampling constants used by process_video_file().
step_size_in_ms = 40  # NOTE(review): not referenced in the code visible here
window_size = 350  # window length in milliseconds (divided by 1000 where it is used)
mfcc_chunk_size = window_size // 10  # NOTE(review): not referenced in the code visible here
mfcc_step_size = 4
fps = 25  # frame rate used to turn window times (seconds) into frame indices
video_step_size_in_ms = mfcc_step_size * 10 # for 25 fps video
sr = 16000  # sample rate handed to ffmpeg's -ar option and to audio.load_wav
# ffmpeg command line; filled with (input video path, sample rate, output wav path).
template = 'ffmpeg -loglevel panic -y -i {} -ar {} {}'
def _read_all_frames(vfile):
    """Decode every frame of ``vfile`` with OpenCV and return them as a list."""
    video_stream = cv2.VideoCapture(vfile)
    frames = []
    try:
        while True:
            still_reading, frame = video_stream.read()
            if not still_reading:
                break
            frames.append(frame)
    finally:
        # Release the capture even if decoding raises part-way through.
        video_stream.release()
    return frames


def _select_mid_frames(frames):
    """Return the middle frame of each sliding time window that fits in ``frames``."""
    mid_frames = []
    ss = 0.  # window start, in seconds
    es = ss + window_size / 1000.  # window end, in seconds
    while int(es * fps) <= len(frames):
        mid_second = (ss + es) / 2.
        mid_frames.append(frames[int(mid_second * fps)])
        ss += video_step_size_in_ms / 1000.
        es = ss + window_size / 1000.
    return mid_frames


def process_video_file(vfile, args, split):
    """Extract audio, mel spectrogram and face crops for one video.

    Output goes to ``<args.final_data_root>/<split>/<parent dir>/<video name>/``:
    ``audio.wav`` (written by ffmpeg at ``sr`` Hz), ``mels.npz`` (array stored
    under the key ``spec``) and one ``<i>.jpg`` per sampled window whose
    middle frame contains a detectable face, resized to
    ``args.img_size`` x ``args.img_size``.

    Args:
        vfile: path of the input video file.
        args: namespace providing ``final_data_root`` and ``img_size``.
        split: sub-folder name placed directly under ``final_data_root``.

    Returns:
        None. Returns early, writing no spectrogram or images, when ffmpeg
        did not produce the wav file.
    """
    import shlex  # local import: only needed to build the ffmpeg argv

    frames = _read_all_frames(vfile)
    mid_frames = _select_mid_frames(frames)

    # Use os.path helpers instead of splitting on '/' so paths built with the
    # platform separator (see path.join in dump_split) are parsed correctly.
    parent_name = path.basename(path.dirname(vfile))
    video_name = path.basename(vfile).split('.')[0]
    dst_subdir = path.join(parent_name, video_name)
    fulldir = path.join(args.final_data_root, split, dst_subdir)
    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    # Quote the paths before filling the template and run the resulting argv
    # without a shell, so spaces or shell metacharacters in file names can
    # neither break the command nor be interpreted by a shell.
    command = template.format(shlex.quote(vfile), sr, shlex.quote(wavpath))
    subprocess.call(shlex.split(command))

    specpath = path.join(fulldir, 'mels.npz')
    if not path.isfile(wavpath):
        return
    wav = audio.load_wav(wavpath, sr)
    spec = audio.melspectrogram(wav)
    np.savez_compressed(specpath, spec=spec)

    for i, f in enumerate(mid_frames):
        face, valid_frame = face_detect(f)
        if not valid_frame:
            # No usable face in this window; its index is simply skipped.
            continue
        resized_face = cv2.resize(face, (args.img_size, args.img_size))
        cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), resized_face)
def mp_handler(job):
    """Worker entry point: process one ``(vfile, args, split)`` job.

    Errors raised while processing the video are printed with their traceback
    and swallowed, so one bad file does not abort the whole run. Returns None.
    """
    vfile, args, split = job
    try:
        process_video_file(vfile, args, split)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        traceback.print_exc()
def dump_split(args):
    """Preprocess every video listed in the filelist for ``args.split``.

    Reads ``<args.filelists>/<args.split>.txt`` (one video id per line, without
    extension), maps each id to
    ``<args.videos_data_root>/<pretrain|main>/<id>.mp4`` and runs
    ``mp_handler`` on each file using ``args.num_workers`` threads, showing a
    tqdm progress bar.

    Args:
        args: namespace providing ``split``, ``num_workers``, ``filelists``
            and ``videos_data_root`` (plus whatever ``process_video_file``
            reads from it).
    """
    print('Started processing for {} with {} CPU cores'.format(args.split, args.num_workers))

    # Only the 'pretrain' split has its own folder; every other split is
    # looked up under 'main'.
    subset = 'pretrain' if args.split == 'pretrain' else 'main'

    list_path = path.join(args.filelists, '{}.txt'.format(args.split))
    # `with` closes the filelist handle deterministically.
    with open(list_path) as list_file:
        names = [line.strip() for line in list_file]

    # Skip blank lines so they do not turn into bogus '.mp4' jobs.
    filelist = [path.join(args.videos_data_root, subset, '{}.mp4'.format(name))
                for name in names if name]
    jobs = [(vfile, args, subset) for vfile in filelist]

    # The context manager shuts the pool down once all jobs are finished.
    with ThreadPoolExecutor(args.num_workers) as p:
        futures = [p.submit(mp_handler, j) for j in jobs]
        for r in tqdm(as_completed(futures), total=len(futures)):
            r.result()
# Command-line interface. The parser is built at module level, but arguments
# are only parsed when the file is executed as a script, so importing this
# module no longer reads sys.argv or exits on missing required options.
parser = argparse.ArgumentParser()
parser.add_argument('--split', help='LRS2 dataset split to preprocess', default='train')
parser.add_argument('--num_workers', help='Number of workers to run in parallel', default=10, type=int)
parser.add_argument('--filelists', help='List of train, val, test, pretrain files', default='./filelists/')
parser.add_argument("--videos_data_root", help="Root folder of LRS", required=True)
parser.add_argument("--final_data_root", help="Folder where preprocessed files will reside",
                    required=True)

### hyperparams ####
parser.add_argument("--img_size", help="Square face image to resize to", default=96, type=int)

if __name__ == '__main__':
    args = parser.parse_args()
    dump_split(args)