"""
Converts raw TIMIT data into a pickle dump which can be used during training
"""
import numpy as np
import pickle
import os
import utils
import json
from utils import listdir


class timit_metadata:

    def __init__(self, type_, config_file):
        self.config = config_file
        self.mode = type_
        self.db_path = config_file['dir']['dataset']
        # fold the phones in each list into the phone which is the key,
        # e.g. 'ao' is 'collapsed' into 'aa'
        self.replacement = utils.replacement_dict()
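        # For illustration only: replacement_dict() is assumed to map each retained
        # "father" phone to the list of phones folded into it, along the lines of
        # {'aa': ['ao'], 'ah': ['ax', 'ax-h'], ...} (the real mapping lives in utils)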
        feature_dim = self.config['n_fbank'] + self.config['n_mfcc']
        self.pkl_name = self.db_path + self.mode + '_rnn_ctc_' + str(feature_dim) + '.pkl'
        self.win_len, self.win_step = config_file['window_size'], config_file['window_step']
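        # Taken together, the config keys read here and in gen_pickle suggest a
        # config.yaml shaped roughly as below (values are illustrative placeholders,
        # not the repo's actual settings):
        #   dir: {dataset: /path/to/TIMIT/}
        #   window_size: 0.025
        #   window_step: 0.01
        #   n_fbank: 40
        #   n_mfcc: 13
        #   dump_phone_weights: True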

    # Generate and store the pickle dump, or load it if it already exists
    def gen_pickle(self):
        # Return the cached dump if one already exists
        if os.path.exists(self.pkl_name):
            print("Found pickle dump for", self.mode)
            with open(self.pkl_name, 'rb') as f:
                return pickle.load(f)

        print("Generating pickle dump for", self.mode)
        list_features, list_phones = [], []
        base_pth = self.db_path + self.mode
        all_phones = set()
        # The phone distribution is used to calculate class weights
        num_distribution = {}

        # Iterate over the entire dataset: dialect -> speaker -> recording
        for dialect in sorted(listdir(base_pth)):
            print("Dialect:", dialect)
            for speaker_id in sorted(listdir(os.path.join(base_pth, dialect))):
                data = sorted(os.listdir(os.path.join(base_pth, dialect, speaker_id)))
                wav_files = [x for x in data if x.split('.')[-1] == 'wav']  # all the .wav files
                for wav_file in wav_files:
                    # Skip the SA sentences, which are read by every speaker
                    if wav_file in ['SA1.wav', 'SA2.wav']:
                        continue
                    wav_path = os.path.join(base_pth, dialect, speaker_id, wav_file)
                    final_vec = utils.read_wav(wav_path, winlen=self.config['window_size'],
                                               winstep=self.config['window_step'],
                                               fbank_filt=self.config['n_fbank'],
                                               mfcc_filt=self.config['n_mfcc'])
                    phone_path = wav_path[:-3] + 'PHN'  # file which contains the phoneme timing data
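                    # A .PHN file is plain text with one phone per line in the form
                    # "<start_sample> <end_sample> <phone>", e.g. (illustrative):
                    #   0 3050 h#
                    #   3050 4559 sh
                    #   4559 5723 ix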
                    # phones in the current wav file
                    cur_phones = []
                    with open(phone_path, 'r') as f:
                        a = f.readlines()
                    for phone in a:
                        s_e_i = phone[:-1].split(' ')  # start, end, phoneme name, e.g. "0 5432 aa"
                        start, end, ph = int(s_e_i[0]), int(s_e_i[1]), s_e_i[2]
                        # collapse into the father phone
                        for father, list_of_sons in self.replacement.items():
                            if ph in list_of_sons:
                                ph = father
                                break
                        # update the distribution
                        all_phones.add(ph)
                        if ph not in num_distribution:
                            num_distribution[ph] = 0
                        num_distribution[ph] += 1
                        cur_phones.append(ph)
                    # Append the current recording to the main lists
                    list_features.append(final_vec)
                    list_phones.append(cur_phones)

        if self.mode == 'TRAIN':
            # Normalise feature vectors
            # np_arr = np.concatenate(list_features, axis=0)
            # print(np_arr.shape)
            # np_mean = np.mean(np_arr, axis=0)
            # np_std = np.std(np_arr, axis=0)
            # np_mean = np.zeros(np_mean.shape)
            # np_std = np.ones(np_std.shape)
            # print("Mean:", np_mean, "\nStd. Dev:", np_std)
            phones_to_id = {}
            if self.config['dump_phone_weights']:
                # Weights are inversely proportional to how often each phone is
                # encountered, normalised so that the weights sum to 1
                num_distribution = {k: 1 / v for k, v in num_distribution.items()}
                total_ph = sum(num_distribution.values())
                num_distribution = {k: v / total_ph for k, v in num_distribution.items()}
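                # Hypothetical worked example: with counts {'aa': 200, 'zh': 50} the
                # inverse weights are 1/200 = 0.005 and 1/50 = 0.02; dividing by their
                # sum (0.025) yields 0.2 and 0.8, so the rarer phone weighs more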
                # Mapping from phone to (id, weight), used downstream to convert
                # NN output ids back to the phones they predict
                for ph in sorted(all_phones):
                    phones_to_id[ph] = (len(phones_to_id), num_distribution[ph])
            else:
                for ph in sorted(all_phones):
                    phones_to_id[ph] = len(phones_to_id)

            phones_to_id['PAD'] = len(phones_to_id)
            phones_to_id['BLANK'] = len(phones_to_id)
            # phones_to_id['WB'] = len(phones_to_id)

            # Dump this mapping
            fname = self.config['dir']['dataset'] + 'phone_mapping.json'
            with open(fname, 'w') as f:
                json.dump(phones_to_id, f)
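            # For illustration (not the repo's actual output), phone_mapping.json could
            # then contain {"aa": [0, 0.0123], ..., "PAD": 39, "BLANK": 40}; JSON stores
            # the (id, weight) tuples as two-element lists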

        to_return = list(zip(list_features, list_phones))
        # Dump the database
        with open(self.pkl_name, 'wb') as f:
            pickle.dump(to_return, f)
        print("Dumped pickle")

        return to_return


if __name__ == '__main__':
    from read_yaml import read_yaml

    config = read_yaml('config.yaml')
    a = timit_metadata('TRAIN', config)
    a.gen_pickle()
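
    # Minimal sketch of how the dump might be consumed downstream (an assumption,
    # not code from this repo); each element is a (feature_matrix, phone_list) pair:
    #   with open(a.pkl_name, 'rb') as f:
    #       data = pickle.load(f)
    #   features, phones = data[0]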