forked from awni/speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
loader.py
170 lines (140 loc) · 5.26 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import numpy as np
import random
import scipy.signal
import torch
import torch.autograd as autograd
import torch.utils.data as tud
from speech.utils import wave
class Preprocessor(object):
    """
    Normalizes audio features and maps label tokens to integer ids.

    The feature mean / std and the token vocabulary are both derived
    from the dataset passed to the constructor.
    """

    END = "</s>"
    START = "<s>"

    def __init__(self, data_json, max_samples=100, start_and_end=True):
        """
        Builds a preprocessor from a dataset.

        Arguments:
            data_json (string): A file containing a json representation
                of each example per line. Each example must provide the
                'audio' and 'text' keys.
            max_samples (int): The maximum number of examples to be used
                in computing summary statistics.
            start_and_end (bool): Include start and end tokens in labels.
        """
        data = read_data_json(data_json)

        # Compute data mean, std from a random sample of the audio files.
        audio_files = [d['audio'] for d in data]
        random.shuffle(audio_files)
        self.mean, self.std = compute_mean_std(audio_files[:max_samples])
        self._input_dim = self.mean.shape[0]

        # Make char map
        chars = self._build_chars(data, start_and_end)
        self.start_and_end = start_and_end
        self.int_to_char = dict(enumerate(chars))
        self.char_to_int = {v: k for k, v in self.int_to_char.items()}

    @classmethod
    def _build_chars(cls, data, start_and_end):
        """
        Returns the ordered list of label tokens found in `data`.

        Tokens are sorted so that the token <-> id mapping is identical
        from run to run (iteration order of a set of strings is not).
        """
        chars = sorted(set(t for d in data for t in d['text']))
        if start_and_end:
            # START must be last so it can easily be
            # excluded in the output classes of a model.
            chars.extend([cls.END, cls.START])
        return chars

    def encode(self, text):
        """
        Converts a sequence of tokens to a list of integer ids.

        When `start_and_end` is set the result is wrapped in the START
        and END ids. Raises KeyError for a token not in the vocabulary.
        """
        text = list(text)
        if self.start_and_end:
            text = [self.START] + text + [self.END]
        return [self.char_to_int[t] for t in text]

    def decode(self, seq):
        """
        Converts a sequence of integer ids back to a list of tokens.

        When `start_and_end` is set, a leading START token is dropped
        and, if the sequence finishes with END, everything from the
        first END onwards is dropped. An empty sequence decodes to an
        empty list.
        """
        text = [self.int_to_char[s] for s in seq]
        if not self.start_and_end or not text:
            # Nothing to strip (also guards text[0] / text[-1] below).
            return text

        start = 1 if text[0] == self.START else 0
        end = len(text)
        if text[-1] == self.END:
            end = text.index(self.END)
        return text[start:end]

    def preprocess(self, wave_file, text):
        """
        Returns the normalized features of `wave_file` together with
        the encoded `text`.
        """
        inputs = log_specgram_from_file(wave_file)
        inputs = (inputs - self.mean) / self.std
        targets = self.encode(text)
        return inputs, targets

    @property
    def input_dim(self):
        """Size of the first axis of the feature mean."""
        return self._input_dim

    @property
    def vocab_size(self):
        """Number of entries in the token vocabulary."""
        return len(self.int_to_char)
def compute_mean_std(audio_files):
    """
    Computes summary statistics over the features of a set of files.

    Arguments:
        audio_files (list): Audio files handed one by one to
            log_specgram_from_file.

    Returns:
        A (mean, std) pair, each reduced over the first axis of the
        vertically stacked per-file features.
    """
    features = np.vstack([log_specgram_from_file(path)
                          for path in audio_files])
    return np.mean(features, axis=0), np.std(features, axis=0)
class AudioDataset(tud.Dataset):
    """
    Dataset of preprocessed (inputs, targets) examples.

    Examples are grouped into buckets by label length and sorted inside
    each bucket, so that consecutive examples have similar sizes.
    """

    def __init__(self, data_json, preproc, batch_size):
        """
        Arguments:
            data_json (string): A file containing a json representation
                of each example per line ('audio', 'text' and
                'duration' keys are read).
            preproc: Object whose `preprocess(audio, text)` method is
                applied to every example on access.
            batch_size (int): Unused here; kept so existing callers
                keep working.
        """
        data = read_data_json(data_json)
        self.preproc = preproc
        self.data = self._bucket_and_sort(data, bucket_diff=4)

    @staticmethod
    def _bucket_and_sort(data, bucket_diff=4):
        """
        Returns `data` reordered bucket by bucket.

        An example with label length n goes to bucket n // bucket_diff
        (the last bucket also takes everything longer). Inside a bucket
        examples are sorted by duration rounded to one decimal, then by
        label length.
        """
        if not data:
            # max() below would fail on an empty dataset.
            return []

        max_len = max(len(x['text']) for x in data)
        # Always keep at least one bucket: when every label is shorter
        # than bucket_diff the integer division yields 0.
        num_buckets = max(1, max_len // bucket_diff)
        buckets = [[] for _ in range(num_buckets)]
        for d in data:
            bid = min(len(d['text']) // bucket_diff, num_buckets - 1)
            buckets[bid].append(d)

        # Sort by input length followed by output length
        def sort_key(x):
            return (round(x['duration'], 1), len(x['text']))

        for b in buckets:
            b.sort(key=sort_key)
        return [d for b in buckets for d in b]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Returns the preprocessed example stored at position `idx`."""
        datum = self.data[idx]
        return self.preproc.preprocess(datum["audio"], datum["text"])
class BatchRandomSampler(tud.sampler.Sampler):
    """
    Batches the data consecutively and randomly samples
    by batch without replacement.

    Only full batches are produced: trailing examples that do not fill
    a whole batch are never yielded.
    """

    def __init__(self, data_source, batch_size):
        """
        Arguments:
            data_source: Sized collection being sampled from.
            batch_size (int): Number of consecutive indices per batch.

        Raises:
            ValueError: If batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer, "
                             "got {}".format(batch_size))
        it_end = len(data_source) - batch_size + 1
        self.batches = [range(i, i + batch_size)
                        for i in range(0, it_end, batch_size)]
        self.data_source = data_source

    def __iter__(self):
        # Shuffle the order of the batches; indices inside a batch
        # stay consecutive.
        random.shuffle(self.batches)
        return (i for b in self.batches for i in b)

    def __len__(self):
        # Number of indices actually yielded by __iter__, which excludes
        # the dropped partial batch at the end of the data source.
        return sum(len(b) for b in self.batches)
def collate_batch(batch):
    """
    Transposes a list of per-example tuples into a tuple of per-field
    tuples, e.g. [(x1, y1), (x2, y2)] -> ((x1, x2), (y1, y2)).

    Defined at module level (rather than as a lambda) so that it can be
    pickled when DataLoader workers are started with the spawn method,
    and returns a tuple so a batch can be indexed and iterated more
    than once.
    """
    return tuple(zip(*batch))


def make_loader(dataset_json, preproc,
                batch_size, num_workers=4):
    """
    Builds a DataLoader over the examples listed in `dataset_json`.

    Arguments:
        dataset_json (string): A file containing a json representation
            of each example per line.
        preproc: Preprocessor applied to every example.
        batch_size (int): Number of examples per batch.
        num_workers (int): Number of DataLoader worker processes.

    Returns:
        A torch.utils.data.DataLoader using BatchRandomSampler, with
        drop_last=True and batches collated by `collate_batch`.
    """
    dataset = AudioDataset(dataset_json, preproc,
                           batch_size)
    sampler = BatchRandomSampler(dataset, batch_size)
    loader = tud.DataLoader(dataset,
                            batch_size=batch_size,
                            sampler=sampler,
                            num_workers=num_workers,
                            collate_fn=collate_batch,
                            drop_last=True)
    return loader
def log_specgram_from_file(audio_file):
    """
    Reads `audio_file` with wave.array_from_wave and returns the result
    of log_specgram on the samples and sample rate it yields.
    """
    samples, rate = wave.array_from_wave(audio_file)
    features = log_specgram(samples, rate)
    return features
def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """
    Computes the log spectrogram of an audio signal.

    Arguments:
        audio (array): Samples handed to scipy.signal.spectrogram.
        sample_rate (int): Sampling frequency of `audio` in Hz.
        window_size (int): Length of each analysis window in
            milliseconds.
        step_size (int): Hop between the starts of consecutive windows
            in milliseconds. Must satisfy 0 < step_size <= window_size.
        eps (float): Constant added before taking the log so that
            empty bins do not produce -inf.

    Returns:
        A float32 array, transposed from scipy's output so that the
        first axis indexes frames and the second indexes frequencies.

    Raises:
        ValueError: If step_size is not in (0, window_size].
    """
    if not 0 < step_size <= window_size:
        raise ValueError("step_size must be in (0, window_size], got "
                         "step_size={} window_size={}".format(
                             step_size, window_size))

    # scipy expects integer sample counts.
    nperseg = int(window_size * sample_rate / 1e3)
    # scipy takes the overlap between windows, not the hop, so convert:
    # overlap = window - step. For the defaults (20 ms / 10 ms) this is
    # the same value as the step itself.
    noverlap = int((window_size - step_size) * sample_rate / 1e3)

    _, _, spec = scipy.signal.spectrogram(audio,
                                          fs=sample_rate,
                                          window='hann',
                                          nperseg=nperseg,
                                          noverlap=noverlap,
                                          detrend=False)
    return np.log(spec.T.astype(np.float32) + eps)
def read_data_json(data_json):
    """
    Reads a file holding one json document per line.

    Arguments:
        data_json (string): Path of the file to read. It is decoded
            as UTF-8.

    Returns:
        A list with the parsed value of every non-blank line, in file
        order.
    """
    # io.open accepts `encoding` on both Python 2 and 3; the file still
    # carries __future__ imports for Python 2 compatibility.
    import io

    with io.open(data_json, encoding="utf-8") as fid:
        # Skip blank lines (e.g. a trailing empty line) instead of
        # failing inside json.loads.
        return [json.loads(line) for line in fid if line.strip()]