Emo2Mix_dataio.py
import itertools
import random
import warnings

import numpy as np
import pyloudnorm
import speechbrain as sb
import torch
import torchaudio
from scipy.signal import resample_poly
from speechbrain.dataio.batch import PaddedBatch

from RAVDESS2Mix_BSS_prep import getEmotion


def dataio_prep(hparams):
    # All eight RAVDESS emotion codes (1 = neutral, ..., 8 = surprised).
    emotion_list = [*range(1, 9)]
    emotion_combs = [*itertools.product(emotion_list, emotion_list)]
    normal_emotions_list = [
        getEmotion(x[0]) + "_" + getEmotion(x[1]) for x in emotion_combs
    ]

    # Strong intensity skips code 1 (neutral), which RAVDESS records only
    # at normal intensity.
    emotion_list = [*range(2, 9)]
    emotion_combs = [*itertools.product(emotion_list, emotion_list)]
    strong_emotions_list = [
        getEmotion(x[0]) + "_" + getEmotion(x[1]) for x in emotion_combs
    ]
    # 1. Define datasets: one DynamicItemDataset per emotion-pair CSV.
    normal_data = []
    for emotion in normal_emotions_list:
        normal_data.append(
            sb.dataio.dataset.DynamicItemDataset.from_csv(
                csv_path=f'{hparams["save_folder"]}normal_normal/{emotion}.csv',
            )
        )

    strong_data = []
    for emotion in strong_emotions_list:
        strong_data.append(
            sb.dataio.dataset.DynamicItemDataset.from_csv(
                # NOTE: the strong_strong folder name is assumed to mirror
                # the normal_normal layout; adjust if the CSVs live elsewhere.
                csv_path=f'{hparams["save_folder"]}strong_strong/{emotion}.csv',
            )
        )
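    # At this point there is one dataset per ordered emotion pair:
    # 8 x 8 = 64 normal-intensity combinations and 7 x 7 = 49 strong ones.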
    @sb.utils.data_pipeline.takes("s1_wav", "s2_wav")
    @sb.utils.data_pipeline.provides("mix_sig", "s1_sig", "s2_sig")
    def audio_pipeline_mix(s1_wav, s2_wav):
        """Compute graph for dynamic mixing of the RAVDESS2Mix dataset.

        Following the original RAVDESS2Mix mixing script, each source is
        downsampled with scipy.signal.resample_poly before loudness
        normalization.
        """
        sources = []
        spk_files = [s1_wav, s2_wav]
        # Crop every source to the length of the shortest file so the
        # signals can be summed sample by sample.
        minlen = min(torchaudio.info(x).num_frames for x in spk_files)

        meter = pyloudnorm.Meter(hparams["sample_rate"])
        MAX_AMP = 0.9
        MIN_LOUDNESS = -33
        MAX_LOUDNESS = -25
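        # The loudness targets are in LUFS; MAX_AMP caps peak amplitude to
        # leave headroom against clipping after the sources are summed.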
        def normalize(signal, is_noise=False):
            """Normalize an audio signal to a random target loudness."""
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                c_loudness = meter.integrated_loudness(signal)
                if is_noise:
                    target_loudness = random.uniform(
                        MIN_LOUDNESS - 5, MAX_LOUDNESS - 5
                    )
                else:
                    target_loudness = random.uniform(
                        MIN_LOUDNESS, MAX_LOUDNESS
                    )
                signal = pyloudnorm.normalize.loudness(
                    signal, c_loudness, target_loudness
                )
            # Rescale if normalization pushed the waveform into clipping.
            if np.max(np.abs(signal)) >= 1:
                signal = signal * MAX_AMP / np.max(np.abs(signal))
            return torch.from_numpy(signal)
        for spk_file in spk_files:
            # Take a random window of minlen samples when the file is longer.
            length = torchaudio.info(spk_file).num_frames
            start = 0
            stop = length
            if length > minlen:
                start = np.random.randint(0, length - minlen)
                stop = start + minlen
            tmp, fs_read = torchaudio.load(
                spk_file, frame_offset=start, num_frames=stop - start,
            )
            tmp = tmp[0].numpy()
            # Polyphase resampling from the file's rate to the target rate.
            tmp = resample_poly(tmp, hparams["sample_rate"], fs_read)
            tmp = normalize(tmp)
            sources.append(tmp)
        sources = torch.stack(sources)
        mixture = torch.sum(sources, 0)

        # Rescale the mixture and the sources together if the sum clips.
        max_amp_insig = mixture.abs().max().item()
        if max_amp_insig > MAX_AMP:
            weight = MAX_AMP / max_amp_insig
        else:
            weight = 1
        sources = weight * sources
        mix_sig = weight * mixture
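        # The yield order must match the "provides" decorator above:
        # the mixture first, then one source signal per speaker.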
        yield mix_sig
        for i in range(hparams["num_spks"]):
            yield sources[i]
    sb.dataio.dataset.add_dynamic_item(normal_data, audio_pipeline_mix)
    sb.dataio.dataset.set_output_keys(
        normal_data, ["id", "mix_sig", "s1_sig", "s2_sig"]
    )

    # if "strong_emotions_list" in hparams:
    #     sb.dataio.dataset.add_dynamic_item(strong_data, audio_pipeline_mix)
    #     sb.dataio.dataset.set_output_keys(
    #         strong_data, ["id", "mix_sig", "s1_sig", "s2_sig"]
    #     )
    return normal_data

if __name__ == "__main__":
    hparams = {
        "normal_emotions_list": [1, 2, 3],
        "num_spks": 2,
        "sample_rate": 8000,
        "limit_training_signal_len": False,
        "training_signal_len": None,
        "save_folder": "RAVDESS2Mix_csv/sep_third_spks/",
    }
    test = dataio_prep(hparams)
    # Keys of the first item of the first emotion-pair dataset.
    print(test[0][0].keys())
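
    # Sketch (not part of the original recipe): how one of the returned
    # datasets could be batched. PaddedBatch pads the variable-length
    # mix/source signals; the batch size of 4 is an arbitrary example value.
    from torch.utils.data import DataLoader

    loader = DataLoader(test[0], batch_size=4, collate_fn=PaddedBatch)
    batch = next(iter(loader))
    # Each tensor key is a PaddedData namedtuple with .data and .lengths.
    print(batch.mix_sig.data.shape)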