From b66a3509c4b550161f0196e545dfe7e8260244b0 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:17:46 -0500 Subject: [PATCH 01/46] moved PR to a branch --- assets/i18n/languages/en_US.json | 4 +- core.py | 18 +- logs/mute/sliced_audios/mute44100.wav | Bin 0 -> 529258 bytes rvc/configs/config.py | 2 + rvc/infer/infer.py | 2 + rvc/lib/algorithm/hifigan.py | 286 ++++++++++++++++++++++++++ rvc/lib/algorithm/synthesizers.py | 92 ++++++--- rvc/lib/tools/pretrained_selector.py | 77 ++----- rvc/lib/zluda.py | 27 +-- rvc/train/losses.py | 16 ++ rvc/train/process/extract_model.py | 2 + rvc/train/train.py | 27 ++- tabs/train/train.py | 12 +- 13 files changed, 430 insertions(+), 135 deletions(-) create mode 100644 logs/mute/sliced_audios/mute44100.wav create mode 100644 rvc/lib/algorithm/hifigan.py diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 083677160..50b145a80 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -321,5 +321,7 @@ "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.": "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid.", "Model Author Name": "Model Author Name", "The name that will appear in the model information.": "The name that will appear in the model information.", - "Set name": "Set name" + "Set name": "Set name", + "Vocoder": "Vocoder", + "Vocoder type": "Choose the vocoder type. 'Default' is a standard vocoder compatible with all other RVC clients, the experimental 'MRF HiFi-GAN' vocoder provides higher fidelity but is only compatible with Applio." } \ No newline at end of file diff --git a/core.py b/core.py index dc3889c19..02175d2b9 100644 --- a/core.py +++ b/core.py @@ -512,15 +512,14 @@ def run_train_script( custom_pretrained: bool = False, g_pretrained_path: str = None, d_pretrained_path: str = None, + vocoder: str = "default", ): if pretrained == True: from rvc.lib.tools.pretrained_selector import pretrained_selector if custom_pretrained == False: - pg, pd = pretrained_selector(bool(pitch_guidance))[str(rvc_version)][ - int(sample_rate) - ] + pg, pd = pretrained_selector(str(rvc_version), str(vocoder), bool(pitch_guidance), int(sample_rate)) else: if g_pretrained_path is None or d_pretrained_path is None: raise ValueError( @@ -553,6 +552,7 @@ def run_train_script( overtraining_detector, overtraining_threshold, cleanup, + vocoder ], ), ] @@ -1840,7 +1840,7 @@ def parse_arguments(): "--sample_rate", type=int, help="Target sampling rate for the audio data.", - choices=[32000, 40000, 48000], + choices=[32000, 40000, 44100, 48000], required=True, ) preprocess_parser.add_argument( @@ -1931,7 +1931,7 @@ def parse_arguments(): "--sample_rate", type=int, help="Target sampling rate for the audio data.", - choices=[32000, 40000, 48000], + choices=[32000, 40000, 44100, 48000], required=True, ) extract_parser.add_argument( @@ -1966,6 +1966,13 @@ def parse_arguments(): choices=["v1", "v2"], default="v2", ) + train_parser.add_argument( + "--vocoder", + type=str, + help="Vocoder name", + choices=["default", "MRF HiFi-GAN"], + default="default", + ) train_parser.add_argument( "--save_every_epoch", type=int, @@ -2465,6 +2472,7 @@ def main(): cache_data_in_gpu=args.cache_data_in_gpu, g_pretrained_path=args.g_pretrained_path, d_pretrained_path=args.d_pretrained_path, + vocoder=args.vocoder, ) elif args.mode == "index": run_index_script( diff --git 
a/logs/mute/sliced_audios/mute44100.wav b/logs/mute/sliced_audios/mute44100.wav new file mode 100644 index 0000000000000000000000000000000000000000..de029a9c1d4bccbc785203442fa0dad0d438fa32 GIT binary patch literal 529258 [529,258 bytes of base85-encoded binary payload elided: a silent 44100 Hz reference WAV] literal 0 HcmV?d00001
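The elided payload is plain digital silence, added so 44100 Hz training has a mute reference slice alongside the existing mute assets for the other sample rates. A minimal sketch of how an equivalent file could be regenerated, assuming mono 16-bit PCM and a duration inferred from the byte count (neither detail is stated in the patch itself, and soundfile is an assumed dependency):

import numpy as np
import soundfile as sf

# Hypothetical recreation of the elided asset (not from the patch): the byte
# count fits a canonical 44-byte WAV header plus 264607 mono int16 samples,
# i.e. roughly 6 s of silence at 44100 Hz.
silence = np.zeros(264607, dtype=np.int16)
sf.write("logs/mute/sliced_audios/mute44100.wav", silence, 44100, subtype="PCM_16")

diff --git a/rvc/configs/config.py b/rvc/configs/config.py index ed5447e1d..0b83459aa 100644 --- a/rvc/configs/config.py +++ b/rvc/configs/config.py @@ -5,9 +5,11 @@ version_config_paths = [ os.path.join("v1",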
"32000.json"), os.path.join("v1", "40000.json"), + os.path.join("v1", "44100.json"), os.path.join("v1", "48000.json"), os.path.join("v2", "48000.json"), os.path.join("v2", "40000.json"), + os.path.join("v2", "44100.json"), os.path.join("v2", "32000.json"), ] diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index 63f369cf4..93312965a 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -471,11 +471,13 @@ def setup_network(self): self.version = self.cpt.get("version", "v1") self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 + self.vocoder = self.cpt.get("vocoder", "default") self.net_g = Synthesizer( *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, is_half=self.config.is_half, + vocoder=self.vocoder ) del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) diff --git a/rvc/lib/algorithm/hifigan.py b/rvc/lib/algorithm/hifigan.py new file mode 100644 index 000000000..24d006f7a --- /dev/null +++ b/rvc/lib/algorithm/hifigan.py @@ -0,0 +1,286 @@ +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from typing import Optional + +LRELU_SLOPE = 0.1 + +class MRFLayer(nn.Module): + def __init__(self, channels, kernel_size, dilation): + super().__init__() + self.conv1 = weight_norm( + nn.Conv1d( + channels, + channels, + kernel_size, + padding=(kernel_size * dilation - dilation) // 2, + dilation=dilation, + ) + ) + self.conv2 = weight_norm( + nn.Conv1d( + channels, channels, kernel_size, padding=kernel_size // 2, dilation=1 + ) + ) + + def forward(self, x): + y = F.leaky_relu(x, LRELU_SLOPE) + y = self.conv1(y) + y = F.leaky_relu(y, LRELU_SLOPE) + y = self.conv2(y) + return x + y + + def remove_weight_norm(self): + remove_weight_norm(self.conv1) + remove_weight_norm(self.conv2) + + +class MRFBlock(nn.Module): + def __init__(self, channels, kernel_size, dilations): + super().__init__() + self.layers = nn.ModuleList() + for dilation in dilations: + self.layers.append(MRFLayer(channels, kernel_size, dilation)) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + def remove_weight_norm(self): + for layer in self.layers: + layer.remove_weight_norm() + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """f0_values: (batchsize, length, dim) + 
where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshold=0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + +class HiFiGAN(nn.Module): + def __init__( + self, + in_channel, + upsample_initial_channel, + upsample_rates, + upsample_kernel_sizes, + resblock_kernel_sizes, + resblock_dilations, + gin_channels, + sample_rate, + harmonic_num, + ): + super().__init__() + print('hifigan') + self.num_kernels = len(resblock_kernel_sizes) + + self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) + + self.conv_pre = weight_norm( + nn.Conv1d( + in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3 + ) + ) + self.upsamples = nn.ModuleList() + self.noise_convs = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.upsamples.append( + weight_norm( + nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=k, + stride=u, + padding=u // 2 + u % 2, + output_padding=u % 2, + ) + ) + ) + if i < len(upsample_rates) - 1: + stride_f0 = np.prod(upsample_rates[i + 1 :]) # noqa + self.noise_convs.append( + nn.Conv1d( + 1, + upsample_initial_channel // (2 ** (i + 1)), + 
kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append( + nn.Conv1d( + 1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=1 + ) + ) + + self.mrfs = nn.ModuleList() + for i in range(len(self.upsamples)): + channel = upsample_initial_channel // (2 ** (i + 1)) + self.mrfs.append( + nn.ModuleList( + [ + MRFBlock(channel, kernel_size=k, dilations=d) + for k, d in zip(resblock_kernel_sizes, resblock_dilations) + ] + ) + ) + self.conv_post = weight_norm( + nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3) + ) + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, f0, g: Optional[torch.Tensor] = None): + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for up, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): + x = F.leaky_relu(x, LRELU_SLOPE) + x = up(x) + x_source = noise_conv(har_source) + # hacky fix the for upscale/downscale mismatch between signal and f0 helper + if x_source.size(-1) < x.size(-1): + x_source = F.pad(x_source, (0, x.size(-1)-x_source.size(-1))) + x = x + x_source + xs = 0 + for layer in mrf: + xs += layer(x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv_pre) + for up in self.upsamples: + remove_weight_norm(up) + for mrf in self.mrfs: + mrf.remove_weight_norm() + remove_weight_norm(self.conv_post) \ No newline at end of file diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index e69f08eb8..a6ea7a9e3 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -1,5 +1,6 @@ import torch from typing import Optional +from rvc.lib.algorithm.hifigan import HiFiGAN from rvc.lib.algorithm.nsf import GeneratorNSF from rvc.lib.algorithm.generators import Generator from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments @@ -56,12 +57,14 @@ def __init__( sr: int, use_f0: bool, text_enc_hidden_dim: int = 768, + vocoder: str = "default", + randomized: bool = True, **kwargs, ): super().__init__() self.segment_size = segment_size - self.gin_channels = gin_channels self.use_f0 = use_f0 + self.randomized = randomized self.enc_p = TextEncoder( inter_channels, @@ -76,28 +79,46 @@ def __init__( ) if use_f0: - self.dec = GeneratorNSF( - inter_channels, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) + if vocoder == "MRF HiFi-GAN": + self.dec = HiFiGAN( + in_channel=inter_channels, + upsample_initial_channel=upsample_initial_channel, + upsample_rates=upsample_rates, + upsample_kernel_sizes=upsample_kernel_sizes, + resblock_kernel_sizes=resblock_kernel_sizes, + resblock_dilations=resblock_dilation_sizes, + gin_channels=gin_channels, + sample_rate=sr, + harmonic_num=8, + ) + else: + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) else: - self.dec = Generator( - inter_channels, - resblock_kernel_sizes, - resblock_dilation_sizes, - 
upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - + if vocoder == "MRF HiFi-GAN": + print("MRF HiFi-GAN does not support training without pitch guidance.") + self.dec = None + else: + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) self.enc_q = PosteriorEncoder( spec_channels, inter_channels, @@ -160,17 +181,24 @@ def forward( if y is not None: z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) z_p = self.flow(z, y_mask, g=g) - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) - - if self.use_f0 and pitchf is not None: - pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) - o = self.dec(z_slice, pitchf, g=g) + # regular old training method using random slices + if self.randomized: + z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) + if self.use_f0: + pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) + o = self.dec(z_slice, pitchf, g=g) + else: + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + # future use for finetuning using the entire dataset each pass else: - o = self.dec(z_slice, g=g) - - return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) + if self.use_f0: + o = self.dec(z, pitchf, g=g) + else: + o = self.dec(z, g=g) + return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + else: + return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) @torch.jit.export def infer( diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index e982fac50..e899065d9 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -1,63 +1,14 @@ -def pretrained_selector(pitch_guidance): - if pitch_guidance == True: - return { - "v1": { - 32000: ( - "rvc/models/pretraineds/pretrained_v1/f0G32k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v1/f0G40k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v1/f0G48k.pth", - "rvc/models/pretraineds/pretrained_v1/f0D48k.pth", - ), - }, - "v2": { - 32000: ( - "rvc/models/pretraineds/pretrained_v2/f0G32k.pth", - "rvc/models/pretraineds/pretrained_v2/f0D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v2/f0G40k.pth", - "rvc/models/pretraineds/pretrained_v2/f0D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v2/f0G48k.pth", - "rvc/models/pretraineds/pretrained_v2/f0D48k.pth", - ), - }, - } - elif pitch_guidance == False: - return { - "v1": { - 32000: ( - "rvc/models/pretraineds/pretrained_v1/G32k.pth", - "rvc/models/pretraineds/pretrained_v1/D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v1/G40k.pth", - "rvc/models/pretraineds/pretrained_v1/D40k.pth", - ), - 48000: ( - "rvc/models/pretraineds/pretrained_v1/G48k.pth", - "rvc/models/pretraineds/pretrained_v1/D48k.pth", - ), - }, - "v2": { - 32000: ( - "rvc/models/pretraineds/pretrained_v2/G32k.pth", - "rvc/models/pretraineds/pretrained_v2/D32k.pth", - ), - 40000: ( - "rvc/models/pretraineds/pretrained_v2/G40k.pth", - "rvc/models/pretraineds/pretrained_v2/D40k.pth", - ), - 48000: ( - 
"rvc/models/pretraineds/pretrained_v2/G48k.pth", - "rvc/models/pretraineds/pretrained_v2/D48k.pth", - ), - }, - } +def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): + + path = f"rvc/models/pretraineds/pretrained_{version}/" + f0 = "f0" if pitch_guidance == True else "" + + if vocoder == "default": + vocoder_path = "" + elif vocoder == "MRF HiFi-GAN": + vocoder_path = "HiFiGAN_" + + path_g = f"{path}{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth" + path_d = f"{path}{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth" + + return path_g, path_d \ No newline at end of file diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py index 482009cc4..4af2e86c4 100644 --- a/rvc/lib/zluda.py +++ b/rvc/lib/zluda.py @@ -3,31 +3,8 @@ if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): _torch_stft = torch.stft - def z_stft( - audio: torch.Tensor, - n_fft: int, - hop_length: int = None, - win_length: int = None, - window: torch.Tensor = None, - center: bool = True, - pad_mode: str = "reflect", - normalized: bool = False, - onesided: bool = None, - return_complex: bool = None, - ): - sd = audio.device - return _torch_stft( - audio.to("cpu"), - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window.to("cpu"), - center=center, - pad_mode=pad_mode, - normalized=normalized, - onesided=onesided, - return_complex=return_complex, - ).to(sd) + def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): + return _torch_stft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) def z_jit(f, *_, **__): f.graph = torch._C.Graph() diff --git a/rvc/train/losses.py b/rvc/train/losses.py index 2e0d4dc9c..b0d7cff51 100644 --- a/rvc/train/losses.py +++ b/rvc/train/losses.py @@ -55,6 +55,22 @@ def generator_loss(disc_outputs): return loss, gen_losses +def discriminator_loss_scaled(disc_real, disc_fake, scale=1.0): + loss = 0 + for i, (d_real, d_fake) in enumerate(zip(disc_real, disc_fake)): + real_loss = torch.mean((1 - d_real) ** 2) + fake_loss = torch.mean(d_fake**2) + _loss = real_loss + fake_loss + loss += _loss if i < len(disc_real) / 2 else scale * _loss + return loss, None, None + +def generator_loss_scaled(disc_outputs, scale=1.0): + loss = 0 + for i, d_fake in enumerate(disc_outputs): + d_fake = d_fake.float() + _loss = torch.mean((1 - d_fake) ** 2) + loss += _loss if i < len(disc_outputs) / 2 else scale * _loss + return loss, None, None def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): """ diff --git a/rvc/train/process/extract_model.py b/rvc/train/process/extract_model.py index 864765cb0..33eba8c97 100644 --- a/rvc/train/process/extract_model.py +++ b/rvc/train/process/extract_model.py @@ -33,6 +33,7 @@ def extract_model( version, hps, overtrain_info, + vocoder, ): try: print(f"Saved model '{model_dir}' (epoch {epoch} and step {step})") @@ -105,6 +106,7 @@ def extract_model( opt["author"] = model_author opt["embedder_model"] = embedder_model opt["speakers_id"] = speakers_id + opt["vocoder"] = vocoder torch.save(opt, os.path.join(model_dir_path, pth_file)) diff --git a/rvc/train/train.py b/rvc/train/train.py index c1f64fbed..fc283afaf 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -39,8 +39,10 @@ from losses import ( discriminator_loss, + discriminator_loss_scaled, feature_loss, generator_loss, + generator_loss_scaled, kl_loss, ) from mel_processing import ( @@ -70,6 +72,7 @@ overtraining_detector = strtobool(sys.argv[14]) overtraining_threshold = int(sys.argv[15]) cleanup = strtobool(sys.argv[16]) +vocoder = 
sys.argv[17] current_dir = os.getcwd() experiment_dir = os.path.join(current_dir, "logs", model_name) @@ -81,6 +84,10 @@ config = HParams(**config) config.data.training_files = os.path.join(experiment_dir, "filelist.txt") +# for nVidia's CUDA device selection can be done from command line / UI +# for AMD the device selection can only be done from .bat file using HIP_VISIBLE_DEVICES +os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",") + torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False @@ -365,6 +372,7 @@ def run( use_f0=pitch_guidance == True, # converting 1/0 to True/False is_half=config.train.fp16_run and device.type == "cuda", sr=sample_rate, + vocoder=vocoder ).to(device) net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm).to(device) @@ -606,9 +614,11 @@ def train_and_evaluate( ) y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) with autocast(enabled=False): - loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( - y_d_hat_r, y_d_hat_g - ) + #if vocoder == "default": + # loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) + #else: + # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) + loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) # Discriminator backward and update optim_d.zero_grad() scaler.scale(loss_disc).backward() @@ -621,11 +631,13 @@ def train_and_evaluate( _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) with autocast(enabled=False): loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 - loss_kl = ( - kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl - ) + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) - loss_gen, losses_gen = generator_loss(y_d_hat_g) + #if vocoder == "default": + # loss_gen, _ = generator_loss(y_d_hat_g) + #else: + # loss_gen, _ = generator_loss_scaled(y_d_hat_g) + loss_gen, _ = generator_loss(y_d_hat_g) loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl if loss_gen_all < lowest_value["value"]: @@ -871,6 +883,7 @@ def train_and_evaluate( version=version, hps=hps, overtrain_info=overtrain_info, + vocoder=vocoder, ) # Clean-up old best epochs for m in model_del: diff --git a/tabs/train/train.py b/tabs/train/train.py index 23c07dbae..1e8628179 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -314,10 +314,17 @@ def train_tab(): sampling_rate = gr.Radio( label=i18n("Sampling Rate"), info=i18n("The sampling rate of the audio files."), - choices=["32000", "40000", "48000"], + choices=["32000", "40000", "44100", "48000"], value="40000", interactive=True, ) + vocoder = gr.Radio( + label=i18n("Vocoder"), + info=i18n("Choose the vocoder type. 
'Default' is a standard vocoder compatible with all other RVC clients, the experimental 'MRF HiFi-GAN' vocoder provides higher fidelity but is only compatible with Applio."), + choices=["default", "MRF HiFi-GAN"], + value="default", + interactive=True, + ) rvc_version = gr.Radio( label=i18n("Model Architecture"), info=i18n("Version of the model architecture."), @@ -765,6 +772,7 @@ def train_tab(): custom_pretrained, g_pretrained_path, d_pretrained_path, + vocoder, ], outputs=[train_output_info], ) @@ -825,7 +833,7 @@ def train_tab(): with gr.Column(): refresh_export = gr.Button(i18n("Refresh")) if not os.name == "nt": - upload_exported = gr.Button(i18n("Upload")) + upload_exported = gr.Button(i18n("Upload"), variant="primary") upload_exported.click( fn=upload_to_google_drive, inputs=[pth_dropdown_export, index_dropdown_export], From a2da78f6449b2219aff09070b458eeddfeea4939 Mon Sep 17 00:00:00 2001 From: Blaise Date: Wed, 4 Dec 2024 18:36:36 +0100 Subject: [PATCH 02/46] minor changes on typo --- assets/i18n/languages/en_US.json | 2 +- core.py | 6 +++--- rvc/infer/infer.py | 2 +- rvc/lib/algorithm/synthesizers.py | 4 +--- rvc/lib/tools/pretrained_selector.py | 2 +- rvc/train/train.py | 4 ++-- tabs/train/train.py | 6 +++--- 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 50b145a80..d4fff453a 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -323,5 +323,5 @@ "The name that will appear in the model information.": "The name that will appear in the model information.", "Set name": "Set name", "Vocoder": "Vocoder", - "Vocoder type": "Choose the vocoder type. 'Default' is a standard vocoder compatible with all other RVC clients, the experimental 'MRF HiFi-GAN' vocoder provides higher fidelity but is only compatible with Applio." + "Vocoder type": "Select the vocoder type for audio synthesis. The default option is HiFi-GAN, a standard vocoder compatible with all RVC clients. Alternatively, the experimental MRF HiFi-GAN offers enhanced fidelity but is exclusively compatible with Applio." 
} \ No newline at end of file diff --git a/core.py b/core.py index 02175d2b9..997780849 100644 --- a/core.py +++ b/core.py @@ -512,7 +512,7 @@ def run_train_script( custom_pretrained: bool = False, g_pretrained_path: str = None, d_pretrained_path: str = None, - vocoder: str = "default", + vocoder: str = "HiFi-GAN", ): if pretrained == True: @@ -1970,8 +1970,8 @@ def parse_arguments(): "--vocoder", type=str, help="Vocoder name", - choices=["default", "MRF HiFi-GAN"], - default="default", + choices=["HiFi-GAN", "MRF HiFi-GAN"], + default="HiFi-GAN", ) train_parser.add_argument( "--save_every_epoch", diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index 93312965a..aad01ccaf 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -471,7 +471,7 @@ def setup_network(self): self.version = self.cpt.get("version", "v1") self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 - self.vocoder = self.cpt.get("vocoder", "default") + self.vocoder = self.cpt.get("vocoder", "HiFi-GAN") self.net_g = Synthesizer( *self.cpt["config"], use_f0=self.use_f0, diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index a6ea7a9e3..e4ce9c468 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -57,7 +57,7 @@ def __init__( sr: int, use_f0: bool, text_enc_hidden_dim: int = 768, - vocoder: str = "default", + vocoder: str = "HiFi-GAN", randomized: bool = True, **kwargs, ): @@ -94,7 +94,6 @@ def __init__( else: self.dec = GeneratorNSF( inter_channels, - resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, @@ -111,7 +110,6 @@ def __init__( else: self.dec = Generator( inter_channels, - resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index e899065d9..ae0df05da 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -3,7 +3,7 @@ def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): path = f"rvc/models/pretraineds/pretrained_{version}/" f0 = "f0" if pitch_guidance == True else "" - if vocoder == "default": + if vocoder == "HiFi-GAN": vocoder_path = "" elif vocoder == "MRF HiFi-GAN": vocoder_path = "HiFiGAN_" diff --git a/rvc/train/train.py b/rvc/train/train.py index fc283afaf..0164de203 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -614,7 +614,7 @@ def train_and_evaluate( ) y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) with autocast(enabled=False): - #if vocoder == "default": + #if vocoder == "HiFi-GAN": # loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) #else: # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) @@ -633,7 +633,7 @@ def train_and_evaluate( loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) - #if vocoder == "default": + #if vocoder == "HiFi-GAN": # loss_gen, _ = generator_loss(y_d_hat_g) #else: # loss_gen, _ = generator_loss_scaled(y_d_hat_g) diff --git a/tabs/train/train.py b/tabs/train/train.py index 1e8628179..49e6a850b 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -320,9 +320,9 @@ def train_tab(): ) vocoder = gr.Radio( label=i18n("Vocoder"), - info=i18n("Choose the vocoder type. 
'Default' is a standard vocoder compatible with all other RVC clients, the experimental 'MRF HiFi-GAN' vocoder provides higher fidelity but is only compatible with Applio."), - choices=["default", "MRF HiFi-GAN"], - value="default", + info=i18n("Select the vocoder type for audio synthesis. The default option is HiFi-GAN, a standard vocoder compatible with all RVC clients. Alternatively, the experimental MRF HiFi-GAN offers enhanced fidelity but is exclusively compatible with Applio."), + choices=["HiFi-GAN", "MRF HiFi-GAN"], + value="HiFi-GAN", interactive=True, ) rvc_version = gr.Radio( From 4b3496a79b4cbebc1e252f04e335e5f6d3ebcd3a Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:16:11 -0500 Subject: [PATCH 03/46] added 44100 configs, corrected mismatch between f0 helper and upscaler for 44100 (does not affect other sampling rates) --- rvc/configs/v1/44100.json | 76 ++++++++++++++++++++++++++++++++++++ rvc/configs/v2/44100.json | 76 ++++++++++++++++++++++++++++++++++++ rvc/lib/algorithm/hifigan.py | 49 +++++++++++++---------- rvc/lib/algorithm/nsf.py | 25 +++++++++--- 4 files changed, 201 insertions(+), 25 deletions(-) create mode 100644 rvc/configs/v1/44100.json create mode 100644 rvc/configs/v2/44100.json diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json new file mode 100644 index 000000000..e79b06c08 --- /dev/null +++ b/rvc/configs/v1/44100.json @@ -0,0 +1,76 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 15876, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 44100, + "filter_length": 2048, + "hop_length": 441, + "win_length": 2048, + "n_mel_channels": 160, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 7, + 7, + 3, + 3 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 14, + 14, + 6, + 6 + ], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc/configs/v2/44100.json b/rvc/configs/v2/44100.json new file mode 100644 index 000000000..47e9cfe10 --- /dev/null +++ b/rvc/configs/v2/44100.json @@ -0,0 +1,76 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 0.0001, + "betas": [ + 0.8, + 0.99 + ], + "eps": 1e-09, + "fp16_run": false, + "lr_decay": 0.999875, + "segment_size": 15876, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 44100, + "filter_length": 2048, + "hop_length": 441, + "win_length": 2048, + "n_mel_channels": 160, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 7, + 7, + 3, + 3 + ], + 
"upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 14, + 14, + 6, + 6 + ], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} \ No newline at end of file diff --git a/rvc/lib/algorithm/hifigan.py b/rvc/lib/algorithm/hifigan.py index 24d006f7a..908c22b4a 100644 --- a/rvc/lib/algorithm/hifigan.py +++ b/rvc/lib/algorithm/hifigan.py @@ -202,6 +202,12 @@ def __init__( ) self.upsamples = nn.ModuleList() self.noise_convs = nn.ModuleList() + + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): self.upsamples.append( weight_norm( @@ -215,24 +221,30 @@ def __init__( ) ) ) - if i < len(upsample_rates) - 1: - stride_f0 = np.prod(upsample_rates[i + 1 :]) # noqa - self.noise_convs.append( - nn.Conv1d( - 1, - upsample_initial_channel // (2 ** (i + 1)), - kernel_size=stride_f0 * 2, - stride=stride_f0, - padding=stride_f0 // 2, - ) - ) - else: - self.noise_convs.append( - nn.Conv1d( - 1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=1 - ) + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = (1 if stride == 1 else stride * 2 - stride % 2) + padding = (0 if stride == 1 else (kernel - stride) // 2) + + self.noise_convs.append( + nn.Conv1d( + 1, + upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel, + stride=stride, + padding=padding, ) - + ) self.mrfs = nn.ModuleList() for i in range(len(self.upsamples)): channel = upsample_initial_channel // (2 ** (i + 1)) @@ -264,9 +276,6 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): x = F.leaky_relu(x, LRELU_SLOPE) x = up(x) x_source = noise_conv(har_source) - # hacky fix the for upscale/downscale mismatch between signal and f0 helper - if x_source.size(-1) < x.size(-1): - x_source = F.pad(x_source, (0, x.size(-1)-x_source.size(-1))) x = x + x_source xs = 0 for layer in mrf: diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/nsf.py index 514b5371d..b3ff2c81f 100644 --- a/rvc/lib/algorithm/nsf.py +++ b/rvc/lib/algorithm/nsf.py @@ -112,18 +112,33 @@ def __init__( channels[i], k, u, - padding=(k - u) // 2, + padding=u // 2 + u % 2, + output_padding=u % 2, ) ) ) - + """ handling odd upsampling rates + # s k p + # 40 80 20 + # 32 64 16 + # 4 8 2 + # 2 3 1 + # 63 125 31 + # 9 17 4 + # 3 5 1 + # 1 1 0 + """ + stride = stride_f0s[i] + kernel = (1 if stride == 1 else stride * 2 - stride % 2) + padding = (0 if stride == 1 else (kernel - stride) // 2) + self.noise_convs.append( torch.nn.Conv1d( 1, channels[i], - kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1), - stride=stride_f0s[i], - padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0), + kernel_size=kernel, + stride=stride, + padding=padding, ) ) From e6b76699081e011faf3793b671ac1404c601eef9 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Wed, 4 Dec 2024 22:11:10 -0500 Subject: [PATCH 04/46] added missing named parameter to avoid error with zluda hijack --- rvc/lib/predictors/FCPE.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rvc/lib/predictors/FCPE.py b/rvc/lib/predictors/FCPE.py index 12f6c346a..9edbf0672 100644 --- a/rvc/lib/predictors/FCPE.py +++ b/rvc/lib/predictors/FCPE.py @@ -141,7 +141,7 @@ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): spec = 
torch.stft( y, - n_fft_new, + n_fft=n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=hann_window[keyshift_key], From 6b70b46524b1517e2199011fa6267cbcb3dbebd5 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Thu, 5 Dec 2024 00:06:15 -0500 Subject: [PATCH 05/46] added missing resblock value --- rvc/lib/algorithm/synthesizers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index e4ce9c468..7153ad966 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -47,6 +47,7 @@ def __init__( n_layers: int, kernel_size: int, p_dropout: float, + resblock: str, resblock_kernel_sizes: list, resblock_dilation_sizes: list, upsample_rates: list, From 9d192848c73ebf578d9f9b8f576d7706d0488837 Mon Sep 17 00:00:00 2001 From: Blaise Date: Fri, 6 Dec 2024 11:29:13 +0100 Subject: [PATCH 06/46] less text --- assets/i18n/languages/en_US.json | 2 +- tabs/train/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index d4fff453a..f80c90f7a 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -323,5 +323,5 @@ "The name that will appear in the model information.": "The name that will appear in the model information.", "Set name": "Set name", "Vocoder": "Vocoder", - "Vocoder type": "Select the vocoder type for audio synthesis. The default option is HiFi-GAN, a standard vocoder compatible with all RVC clients. Alternatively, the experimental MRF HiFi-GAN offers enhanced fidelity but is exclusively compatible with Applio." + "Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only).": "Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only)." } \ No newline at end of file diff --git a/tabs/train/train.py b/tabs/train/train.py index 49e6a850b..ba7ee2112 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -320,7 +320,7 @@ def train_tab(): ) vocoder = gr.Radio( label=i18n("Vocoder"), - info=i18n("Select the vocoder type for audio synthesis. The default option is HiFi-GAN, a standard vocoder compatible with all RVC clients. 
Alternatively, the experimental MRF HiFi-GAN offers enhanced fidelity but is exclusively compatible with Applio."), + info=i18n("Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only)."), choices=["HiFi-GAN", "MRF HiFi-GAN"], value="HiFi-GAN", interactive=True, ) rvc_version = gr.Radio( label=i18n("Model Architecture"), info=i18n("Version of the model architecture."), From 3614f2c37a1102ddeef64fb7a89a9981172e88e6 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 6 Dec 2024 07:33:42 -0500 Subject: [PATCH 07/46] changed gpu parameter type to str, since its default value '-' meant extract could not be used from the command line --- core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core.py b/core.py index 997780849..5a770476c 100644 --- a/core.py +++ b/core.py @@ -1923,7 +1923,7 @@ def parse_arguments(): ) extract_parser.add_argument( "--gpu", - type=int, + type=str, help="GPU device to use for feature extraction (optional).", default="-", ) From c5e4cb24f8b946e07445628005bbe0e6c5a75124 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 6 Dec 2024 07:35:29 -0500 Subject: [PATCH 08/46] fix for the 40k config using incorrect upsampling rates, which caused a mismatch with the downscaler --- rvc/lib/algorithm/hifigan.py | 9 ++++++++- rvc/lib/algorithm/nsf.py | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/rvc/lib/algorithm/hifigan.py b/rvc/lib/algorithm/hifigan.py index 908c22b4a..ea1c777e7 100644 --- a/rvc/lib/algorithm/hifigan.py +++ b/rvc/lib/algorithm/hifigan.py @@ -209,6 +209,13 @@ def __init__( ] for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + self.upsamples.append( weight_norm( nn.ConvTranspose1d( @@ -216,7 +223,7 @@ def __init__( upsample_initial_channel // (2 ** (i + 1)), kernel_size=k, stride=u, - padding=u // 2 + u % 2, + padding=padding, output_padding=u % 2, ) ) diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/nsf.py index b3ff2c81f..9c9a1919a 100644 --- a/rvc/lib/algorithm/nsf.py +++ b/rvc/lib/algorithm/nsf.py @@ -105,6 +105,13 @@ def __init__( ] for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + # handling odd upsampling rates + if u % 2 == 0: + # old method + padding = (k - u) // 2 + else: + padding = u // 2 + u % 2 + self.ups.append( weight_norm( torch.nn.ConvTranspose1d( @@ -112,7 +119,7 @@ def __init__( channels[i], k, u, - padding=u // 2 + u % 2, + padding=padding, output_padding=u % 2, ) )
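A self-contained shape check of the padding rules above (illustrative, not part of the patch; the rate/kernel pairs are those used by the 32k/40k/48k and 44.1k configs in this series):

# Illustrative sanity check: with these padding rules every ConvTranspose1d
# stage satisfies L_out = L_in * u, via
# L_out = (L_in - 1) * u - 2 * padding + k + output_padding.
import torch
import torch.nn as nn

for u, k in [(10, 20), (4, 8), (2, 4), (7, 14), (3, 6)]:
    padding = (k - u) // 2 if u % 2 == 0 else u // 2 + u % 2
    up = nn.ConvTranspose1d(1, 1, k, stride=u, padding=padding, output_padding=u % 2)
    assert up(torch.zeros(1, 1, 100)).shape[-1] == 100 * u

# The matching noise-branch Conv1d from patch 03 downsamples the f0-rate
# excitation by the product of the remaining rates; for [7, 7, 3, 3] the
# per-stage strides are 63, 9, 3 and 1, and the derived kernel/padding keep
# its output length aligned with the upsampled signal path.
for stride in [63, 9, 3, 1]:
    kernel = 1 if stride == 1 else stride * 2 - stride % 2
    padding = 0 if stride == 1 else (kernel - stride) // 2
    down = nn.Conv1d(1, 1, kernel, stride=stride, padding=padding)
    assert down(torch.zeros(1, 1, 6300)).shape[-1] == 6300 // stride

Note that for odd u the identity L_out = L_in * u only holds because each odd rate here is paired with k = 2u; a config breaking that pairing would reintroduce the length mismatch that the earlier F.pad workaround papered over.

From d8e10862b09ff3c7761f46714686c529b24c8f18 Mon Sep 17 00:00:00 2001 From: Blaise Date: Fri, 6 Dec 2024 13:52:51 +0100 Subject: [PATCH 09/46] fix pretrained selector + print vocoder selected --- rvc/lib/algorithm/hifigan.py | 1 - rvc/lib/algorithm/synthesizers.py | 2 +- rvc/lib/tools/pretrained_selector.py | 8 ++++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/rvc/lib/algorithm/hifigan.py b/rvc/lib/algorithm/hifigan.py index ea1c777e7..851b0b096 100644 --- a/rvc/lib/algorithm/hifigan.py +++ b/rvc/lib/algorithm/hifigan.py @@ -189,7 +189,6 @@ def __init__( harmonic_num, ): super().__init__() - print('hifigan') self.num_kernels = len(resblock_kernel_sizes) self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index 7153ad966..a097e8c2c 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ 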
b/rvc/lib/algorithm/synthesizers.py @@ -78,7 +78,7 @@ def __init__( text_enc_hidden_dim, f0=use_f0, ) - + print(f"Using {vocoder} vocoder") if use_f0: if vocoder == "MRF HiFi-GAN": self.dec = HiFiGAN( diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index ae0df05da..daf364a4c 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -1,5 +1,6 @@ +import os + def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): - path = f"rvc/models/pretraineds/pretrained_{version}/" f0 = "f0" if pitch_guidance == True else "" @@ -11,4 +12,7 @@ def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): path_g = f"{path}{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth" path_d = f"{path}{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth" - return path_g, path_d \ No newline at end of file + if os.path.exists(path_g) and os.path.exists(path_d): + return path_g, path_d + else: + return "", "" From b2c3428f6735cc065728573688305d8d2f688851 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sat, 7 Dec 2024 14:02:17 +0100 Subject: [PATCH 10/46] same config format --- rvc/configs/v1/44100.json | 47 ++++++--------------------------------- rvc/configs/v2/44100.json | 47 ++++++--------------------------------- 2 files changed, 14 insertions(+), 80 deletions(-) diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json index e79b06c08..6c7b850c1 100644 --- a/rvc/configs/v1/44100.json +++ b/rvc/configs/v1/44100.json @@ -3,12 +3,9 @@ "log_interval": 200, "seed": 1234, "learning_rate": 0.0001, - "betas": [ - 0.8, - 0.99 - ], + "betas": [0.8, 0.99], "eps": 1e-09, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 15876, "c_mel": 45, @@ -33,42 +30,12 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [ - 3, - 7, - 11 - ], - "resblock_dilation_sizes": [ - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ] - ], - "upsample_rates": [ - 7, - 7, - 3, - 3 - ], + "resblock": 1, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [7,5,3,3], "upsample_initial_channel": 512, - "upsample_kernel_sizes": [ - 14, - 14, - 6, - 6 - ], + "upsample_kernel_sizes": [14,14,6,6], "use_spectral_norm": false, "gin_channels": 256, "spk_embed_dim": 109 diff --git a/rvc/configs/v2/44100.json b/rvc/configs/v2/44100.json index 47e9cfe10..171207bbb 100644 --- a/rvc/configs/v2/44100.json +++ b/rvc/configs/v2/44100.json @@ -3,12 +3,9 @@ "log_interval": 200, "seed": 1234, "learning_rate": 0.0001, - "betas": [ - 0.8, - 0.99 - ], + "betas": [0.8, 0.99], "eps": 1e-09, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 15876, "c_mel": 45, @@ -33,42 +30,12 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": "1", - "resblock_kernel_sizes": [ - 3, - 7, - 11 - ], - "resblock_dilation_sizes": [ - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ] - ], - "upsample_rates": [ - 7, - 7, - 3, - 3 - ], + "resblock": 1, + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [7,5,3,3], "upsample_initial_channel": 512, - "upsample_kernel_sizes": [ - 14, - 14, - 6, - 6 - ], + "upsample_kernel_sizes": [14,14,6,6], "use_spectral_norm": false, "gin_channels": 256, "spk_embed_dim": 109 From bef023d5575f770c3b8dfb3a9c7c38fc9c318fb4 Mon Sep 17 00:00:00 2001 From: AznamirWoW 
<101997116+AznamirWoW@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:35:03 -0500 Subject: [PATCH 11/46] corrected 44100 upsample rate --- rvc/configs/v1/44100.json | 2 +- rvc/configs/v2/44100.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json index 6c7b850c1..a6aee470f 100644 --- a/rvc/configs/v1/44100.json +++ b/rvc/configs/v1/44100.json @@ -33,7 +33,7 @@ "resblock": 1, "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [7,5,3,3], + "upsample_rates": [7,7,3,3], "upsample_initial_channel": 512, "upsample_kernel_sizes": [14,14,6,6], "use_spectral_norm": false, diff --git a/rvc/configs/v2/44100.json b/rvc/configs/v2/44100.json index 171207bbb..04fdaf55d 100644 --- a/rvc/configs/v2/44100.json +++ b/rvc/configs/v2/44100.json @@ -33,7 +33,7 @@ "resblock": 1, "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "upsample_rates": [7,5,3,3], + "upsample_rates": [7,7,3,3], "upsample_initial_channel": 512, "upsample_kernel_sizes": [14,14,6,6], "use_spectral_norm": false, From d7bba344c7c86b0a4d9d8c449a52d315541ccaf0 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:26:16 -0500 Subject: [PATCH 12/46] added RefineGAN vocoder option --- core.py | 2 +- rvc/lib/algorithm/refinegan.py | 412 +++++++++++++++++++++++++++ rvc/lib/algorithm/synthesizers.py | 11 + rvc/lib/tools/pretrained_selector.py | 2 + tabs/train/train.py | 2 +- 5 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 rvc/lib/algorithm/refinegan.py diff --git a/core.py b/core.py index 5a770476c..aa6971eeb 100644 --- a/core.py +++ b/core.py @@ -1970,7 +1970,7 @@ def parse_arguments(): "--vocoder", type=str, help="Vocoder name", - choices=["HiFi-GAN", "MRF HiFi-GAN"], + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], default="HiFi-GAN", ) train_parser.add_argument( diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/refinegan.py new file mode 100644 index 000000000..e86229cc4 --- /dev/null +++ b/rvc/lib/algorithm/refinegan.py @@ -0,0 +1,412 @@ +from typing import Callable +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.utils.parametrizations import weight_norm +from torch.nn.utils.parametrize import remove_parametrizations + + +def named_applyZ( + fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False +) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True, + ) + + if depth_first and include_root: + fn(module=module, name=name) + + return module + + +def get_padding(kernel_size: int, dilation: int = 1) -> int: + return int((kernel_size * dilation - dilation) / 2) + + +class ResBlock(torch.nn.Module): + def __init__( + self, + *, + in_channels: int, + out_channels: int, + kernel_size: int = 7, + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ): + super(ResBlock, self).__init__() + + self.leaky_relu_slope = leaky_relu_slope + self.in_channels = in_channels + self.out_channels = out_channels + + self.convs1 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + in_channels=in_channels if idx == 
0 else out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ) + ) + for idx, d in enumerate(dilation) + ] + ) + self.convs1.apply(self.init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + nn.Conv1d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + dilation=d, + padding=get_padding(kernel_size, d), + ) + ) + for idx, d in enumerate(dilation) + ] + ) + self.convs2.apply(self.init_weights) + + def forward(self, x): + for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)): + xt = F.leaky_relu(x, self.leaky_relu_slope) + xt = c1(xt) + xt = F.leaky_relu(xt, self.leaky_relu_slope) + xt = c2(xt) + + if idx != 0 or self.in_channels == self.out_channels: + x = xt + x + else: + x = xt + + return x + + def remove_parametrizations(self): + for c1, c2 in zip(self.convs1, self.convs2): + remove_parametrizations(c1) + remove_parametrizations(c2) + + def init_weights(self, m): + if type(m) == nn.Conv1d: + m.weight.data.normal_(0, 0.01) + m.bias.data.fill_(0.0) + + +class AdaIN(nn.Module): + def __init__( + self, + *, + channels: int, + leaky_relu_slope: float = 0.2, + ) -> None: + super().__init__() + + self.weight = nn.Parameter(torch.ones(channels)) + self.activation = nn.LeakyReLU(leaky_relu_slope) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + gaussian = torch.randn_like(x) * self.weight[None, :, None] + + return self.activation(x + gaussian) + + +class ParallelResBlock(nn.Module): + def __init__( + self, + *, + in_channels: int, + out_channels: int, + kernel_sizes: int = (3, 7, 11), + dilation: tuple[int] = (1, 3, 5), + leaky_relu_slope: float = 0.2, + ) -> None: + super().__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + + self.input_conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=7, + stride=1, + padding=3, + ) + + self.blocks = nn.ModuleList( + [ + nn.Sequential( + AdaIN(channels=out_channels), + ResBlock( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + leaky_relu_slope=leaky_relu_slope, + ), + AdaIN(channels=out_channels), + ) + for kernel_size in kernel_sizes + ] + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.input_conv(x) + + results = [block(x) for block in self.blocks] + + return torch.mean(torch.stack(results), dim=0) + + def remove_parametrizations(self): + for block in self.blocks: + block[1].remove_parametrizations() + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + 
self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def _f02sine(self, f0_values): + """f0_values: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + # convert to F0 in rad. The interger part n can be ignored + # because 2 * np.pi * n doesn't affect phase + rad_values = (f0_values / self.sampling_rate) % 1 + + # initial phase noise (no noise for fundamental component) + rand_ini = torch.rand( + f0_values.shape[0], f0_values.shape[2], device=f0_values.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + + # instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad) + tmp_over_one = torch.cumsum(rad_values, 1) % 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + + sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi) + + return sines + + def forward(self, f0): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) + + sine_waves = self._f02sine(f0_buf) * self.sine_amp + + uv = self._f02uv(f0) + + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + + sine_waves = sine_waves * uv + noise * (1 - uv) + return sine_waves, uv, noise + +class SourceModuleHnNSF(torch.nn.Module): + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshold=0, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x): + sine_wavs, uv, _ = self.l_sin_gen(x) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + + return sine_merge, None, None + + +class RefineGANGenerator(nn.Module): + def __init__( + self, + *, + sample_rate: int = 44100, + downsample_rates: tuple[int] = (2, 2, 8, 8), + upsample_rates: tuple[int] = (8, 8, 2, 2), + leaky_relu_slope: float = 0.2, + num_mels: int = 128, + start_channels: int = 32, + gin_channels: int = 256, + ) -> None: + super().__init__() + + self.downsample_rates = downsample_rates + self.upsample_rates = upsample_rates + self.leaky_relu_slope = leaky_relu_slope + + self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num=8) + + # expands + self.source_conv = weight_norm(nn.Conv1d(in_channels=1, out_channels=start_channels, kernel_size=7, stride=1, padding=3,)) + + channels = start_channels + self.downsample_blocks = nn.ModuleList([]) + for rate in downsample_rates: + new_channels = channels * 2 + + self.downsample_blocks.append( + 
nn.Sequential( + nn.Upsample(scale_factor=1 / rate, mode="linear"), + ResBlock(in_channels=channels, out_channels=new_channels, kernel_size=7, dilation=(1, 3, 5), leaky_relu_slope=leaky_relu_slope,), + ) + ) + + channels = new_channels + + self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels,kernel_size=7,stride=1,padding=3,)) + + if gin_channels != 0: + self.cond = nn.Conv1d(256, channels, 1) + + channels *= 2 + + self.upsample_blocks = nn.ModuleList([]) + self.upsample_conv_blocks = nn.ModuleList([]) + + for rate in upsample_rates: + new_channels = channels // 2 + + self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) + + self.upsample_conv_blocks.append( + ParallelResBlock( + in_channels=channels + channels // 4, + out_channels=new_channels, + kernel_sizes=(3, 7, 11), + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ) + ) + + channels = new_channels + + self.conv_post = weight_norm(nn.Conv1d(in_channels=channels, out_channels=1, kernel_size=7, stride=1, padding=3,)) + + def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> torch.Tensor: + f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) + har_source, _, _ = self.m_source(f0) + har_source = har_source.transpose(-1, -2) + + # expanding pitch source to 16 channels + x = self.source_conv(har_source) + # making a downscaled version to match upscaler stages + downs = [] + for i, block in enumerate(self.downsample_blocks): + x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + downs.append(x) + x = block(x) + + # expanding spectrogram from 192 to 256 channels + mel = self.mel_conv(mel) + + if g is not None: + # adding expanded speaker embedding + mel = mel + self.cond(g) + + x = torch.cat([x, mel], dim=1) + i = 1 + for up, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs),): + x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + x = up(x) + x = torch.cat([x, down], dim=1) + x = res(x) + + x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_parametrizations(self) -> None: + remove_parametrizations(self.source_conv) + remove_parametrizations(self.mel_conv) + remove_parametrizations(self.conv_post) + + for block in self.downsample_blocks: + block[1].remove_parametrizations() + + for block in self.upsample_conv_blocks: + block.remove_parametrizations() \ No newline at end of file diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index a097e8c2c..36d35d334 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -1,6 +1,7 @@ import torch from typing import Optional from rvc.lib.algorithm.hifigan import HiFiGAN +from rvc.lib.algorithm.refinegan import RefineGANGenerator from rvc.lib.algorithm.nsf import GeneratorNSF from rvc.lib.algorithm.generators import Generator from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments @@ -92,6 +93,13 @@ def __init__( sample_rate=sr, harmonic_num=8, ) + elif vocoder == "RefineGAN": + self.dec = RefineGANGenerator( + sample_rate = sr, + downsample_rates=upsample_rates[::-1], + upsample_rates=upsample_rates, + start_channels=32, + num_mels=inter_channels) else: self.dec = GeneratorNSF( inter_channels, @@ -108,6 +116,9 @@ def __init__( if vocoder == "MRF HiFi-GAN": print("MRF HiFi-GAN does not support training without pitch guidance.") self.dec = None + elif vocoder == "RefineGAN": + print("RefineGAN does not support 
training without pitch guidance.") + self.dec = None else: self.dec = Generator( inter_channels, diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index daf364a4c..d0aa78545 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -8,6 +8,8 @@ def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): vocoder_path = "" elif vocoder == "MRF HiFi-GAN": vocoder_path = "HiFiGAN_" + elif vocoder == "RefineGAN": + vocoder_path = "RefineGAN_" path_g = f"{path}{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth" path_d = f"{path}{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth" diff --git a/tabs/train/train.py b/tabs/train/train.py index ba7ee2112..c37cb53a1 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -321,7 +321,7 @@ def train_tab(): vocoder = gr.Radio( label=i18n("Vocoder"), info=i18n("Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only)."), - choices=["HiFi-GAN", "MRF HiFi-GAN"], + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], value="HiFi-GAN", interactive=True, ) From 642a1436d6e4f725d66db1debe1a4bf5d9c185c2 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:34:17 -0500 Subject: [PATCH 13/46] reverted resblock value to "1" --- rvc/configs/v1/32000.json | 2 +- rvc/configs/v1/40000.json | 2 +- rvc/configs/v1/44100.json | 2 +- rvc/configs/v1/48000.json | 2 +- rvc/configs/v2/32000.json | 2 +- rvc/configs/v2/40000.json | 2 +- rvc/configs/v2/44100.json | 2 +- rvc/configs/v2/48000.json | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/rvc/configs/v1/32000.json b/rvc/configs/v1/32000.json index ba8ef5867..2f28f4f68 100644 --- a/rvc/configs/v1/32000.json +++ b/rvc/configs/v1/32000.json @@ -34,7 +34,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [10,4,2,2,2], diff --git a/rvc/configs/v1/40000.json b/rvc/configs/v1/40000.json index 6a0cdce26..3961ddb64 100644 --- a/rvc/configs/v1/40000.json +++ b/rvc/configs/v1/40000.json @@ -34,7 +34,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [10,10,2,2], diff --git a/rvc/configs/v1/44100.json b/rvc/configs/v1/44100.json index a6aee470f..39246c326 100644 --- a/rvc/configs/v1/44100.json +++ b/rvc/configs/v1/44100.json @@ -30,7 +30,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [7,7,3,3], diff --git a/rvc/configs/v1/48000.json b/rvc/configs/v1/48000.json index a0c9e756a..41ea3b62f 100644 --- a/rvc/configs/v1/48000.json +++ b/rvc/configs/v1/48000.json @@ -34,7 +34,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [10,6,2,2,2], diff --git a/rvc/configs/v2/32000.json b/rvc/configs/v2/32000.json index 52637e570..eabab7b53 100644 --- a/rvc/configs/v2/32000.json +++ b/rvc/configs/v2/32000.json @@ -30,7 +30,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", 
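The string form matters downstream: HiFi-GAN-style model builders typically select the residual block variant by comparing this field as a string, so an integer 1 would never match. A minimal sketch of that assumed convention (not this repo's exact builder):

    def pick_resblock(resblock: str) -> str:
        # string compare: the JSON value "1" matches; an integer 1 would not
        return "ResBlock1" if resblock == "1" else "ResBlock2"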
"resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [10,8,2,2], diff --git a/rvc/configs/v2/40000.json b/rvc/configs/v2/40000.json index 943922ea9..e1ba44a9c 100644 --- a/rvc/configs/v2/40000.json +++ b/rvc/configs/v2/40000.json @@ -30,7 +30,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [10,10,2,2], diff --git a/rvc/configs/v2/44100.json b/rvc/configs/v2/44100.json index 04fdaf55d..dd1e57b21 100644 --- a/rvc/configs/v2/44100.json +++ b/rvc/configs/v2/44100.json @@ -30,7 +30,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [7,7,3,3], diff --git a/rvc/configs/v2/48000.json b/rvc/configs/v2/48000.json index ff714a9ec..1a4da9f5c 100644 --- a/rvc/configs/v2/48000.json +++ b/rvc/configs/v2/48000.json @@ -30,7 +30,7 @@ "n_layers": 6, "kernel_size": 3, "p_dropout": 0, - "resblock": 1, + "resblock": "1", "resblock_kernel_sizes": [3,7,11], "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], "upsample_rates": [12,10,2,2], From 32c0d2ae37f671c55bcba1a7ff566efe563db982 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Wed, 11 Dec 2024 22:20:58 -0500 Subject: [PATCH 14/46] added logging for average gen and disc losses over 10 epochs --- rvc/train/train.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/rvc/train/train.py b/rvc/train/train.py index 0164de203..c6dc7b332 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -6,12 +6,13 @@ import torch import datetime +from collections import deque from distutils.util import strtobool from random import randint, shuffle from time import time as ttime from time import sleep from tqdm import tqdm - +import numpy as np from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter from torch.cuda.amp import GradScaler, autocast @@ -101,6 +102,9 @@ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} training_file_path = os.path.join(experiment_dir, "training_data.json") +gen_loss_queue = deque(maxlen=10) +disc_loss_queue = deque(maxlen=10) + import logging logging.getLogger("torch").setLevel(logging.ERROR) @@ -453,8 +457,6 @@ def run( if True == False and os.path.isfile( os.path.join("logs", "reference", f"ref{sample_rate}.wav") ): - import numpy as np - phone = np.load( os.path.join("logs", "reference", f"ref{sample_rate}_feats.npy") ) @@ -548,6 +550,9 @@ def train_and_evaluate( last_loss_gen_all = 0.0 consecutive_increases_gen = 0 consecutive_increases_disc = 0 + + epoch_disc_sum = 0.0 + epoch_gen_sum = 0.0 net_g, net_d = nets optim_g, optim_d = optims @@ -620,6 +625,7 @@ def train_and_evaluate( # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) # Discriminator backward and update + epoch_disc_sum += loss_disc.item() optim_d.zero_grad() scaler.scale(loss_disc).backward() scaler.unscale_(optim_d) @@ -646,7 +652,7 @@ def train_and_evaluate( "value": loss_gen_all, "epoch": epoch, } - + epoch_gen_sum += loss_gen_all.item() optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) @@ -659,6 +665,10 @@ def train_and_evaluate( # Logging and checkpointing if rank 
== 0: + + disc_loss_queue.append(epoch_disc_sum / len(train_loader)) + gen_loss_queue.append(epoch_gen_sum / len(train_loader)) + # used for tensorboard chart - all/mel mel = spec_to_mel_torch( spec, @@ -704,6 +714,8 @@ def train_and_evaluate( "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl, + "loss_avg/disc": np.mean(disc_loss_queue), + "loss_avg/gen": np.mean(gen_loss_queue) } # commented out # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)}) From e6e229cb383adbb330427aef0e24e0ca4e757f0e Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:20:40 -0500 Subject: [PATCH 15/46] changed RefineGAN to original 16 starting channels due to unacceptable training speed with 32 --- rvc/lib/algorithm/refinegan.py | 26 +------------------------- rvc/lib/algorithm/synthesizers.py | 2 +- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/refinegan.py index e86229cc4..a969752bc 100644 --- a/rvc/lib/algorithm/refinegan.py +++ b/rvc/lib/algorithm/refinegan.py @@ -7,33 +7,9 @@ from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations - -def named_applyZ( - fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False -) -> nn.Module: - if not depth_first and include_root: - fn(module=module, name=name) - - for child_name, child_module in module.named_children(): - child_name = ".".join((name, child_name)) if name else child_name - named_apply( - fn=fn, - module=child_module, - name=child_name, - depth_first=depth_first, - include_root=True, - ) - - if depth_first and include_root: - fn(module=module, name=name) - - return module - - def get_padding(kernel_size: int, dilation: int = 1) -> int: return int((kernel_size * dilation - dilation) / 2) - class ResBlock(torch.nn.Module): def __init__( self, @@ -307,7 +283,7 @@ def __init__( upsample_rates: tuple[int] = (8, 8, 2, 2), leaky_relu_slope: float = 0.2, num_mels: int = 128, - start_channels: int = 32, + start_channels: int = 16, gin_channels: int = 256, ) -> None: super().__init__() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index 36d35d334..1cadfc103 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -98,7 +98,7 @@ def __init__( sample_rate = sr, downsample_rates=upsample_rates[::-1], upsample_rates=upsample_rates, - start_channels=32, + start_channels=16, num_mels=inter_channels) else: self.dec = GeneratorNSF( From 9e82ccfcd6c1eac174925c891a6000de4718a4a3 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 15 Dec 2024 12:49:44 -0500 Subject: [PATCH 16/46] adjusted the fusion of spectrogram and speaker embedding with pitch guidance in order to shift the result towards the cloned voice --- rvc/lib/algorithm/refinegan.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/refinegan.py index a969752bc..6ab86c7c3 100644 --- a/rvc/lib/algorithm/refinegan.py +++ b/rvc/lib/algorithm/refinegan.py @@ -312,10 +312,10 @@ def __init__( channels = new_channels - self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels,kernel_size=7,stride=1,padding=3,)) + self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels * 2,kernel_size=7,stride=1,padding=3,)) if gin_channels != 0: - 
self.cond = nn.Conv1d(256, channels, 1) + self.cond = nn.Conv1d(256, channels * 2, 1) channels *= 2 @@ -356,14 +356,12 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> x = block(x) # expanding spectrogram from 192 to 256 channels - mel = self.mel_conv(mel) + x = self.mel_conv(mel) if g is not None: # adding expanded speaker embedding - mel = mel + self.cond(g) + x = x + self.cond(g) - x = torch.cat([x, mel], dim=1) - i = 1 for up, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs),): x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) x = up(x) From 67f435498a9c92245a82429f87e859e5ea60a1f1 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 15 Dec 2024 22:46:15 -0500 Subject: [PATCH 17/46] reverted the fusion change, it brings the mirroring back --- rvc/lib/algorithm/refinegan.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/refinegan.py index a969752bc..6ab86c7c3 100644 --- a/rvc/lib/algorithm/refinegan.py +++ b/rvc/lib/algorithm/refinegan.py @@ -312,10 +312,10 @@ def __init__( channels = new_channels - self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels,kernel_size=7,stride=1,padding=3,)) + self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels * 2,kernel_size=7,stride=1,padding=3,)) if gin_channels != 0: - self.cond = nn.Conv1d(256, channels, 1) + self.cond = nn.Conv1d(256, channels * 2, 1) channels *= 2 @@ -356,14 +356,12 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> x = block(x) # expanding spectrogram from 192 to 256 channels - mel = self.mel_conv(mel) + x = self.mel_conv(mel) if g is not None: # adding expanded speaker embedding - mel = mel + self.cond(g) + x = x + self.cond(g) - x = torch.cat([x, mel], dim=1) - i = 1 for up, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs),): x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) x = up(x) From fe7e98d0b3c4217c4932387b07a08275c31349e2 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:10:10 -0500 Subject: [PATCH 18/46] corrected the speaker embedding --- rvc/lib/algorithm/refinegan.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/refinegan.py index 6ab86c7c3..6dc166288 100644 --- a/rvc/lib/algorithm/refinegan.py +++ b/rvc/lib/algorithm/refinegan.py @@ -312,10 +312,10 @@ def __init__( channels = new_channels - self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels * 2,kernel_size=7,stride=1,padding=3,)) + self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels,kernel_size=7,stride=1,padding=3,)) if gin_channels != 0: - self.cond = nn.Conv1d(256, channels * 2, 1) + self.cond = nn.Conv1d(256, channels, 1) channels *= 2 @@ -356,12 +356,13 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> x = block(x) # expanding spectrogram from 192 to 256 channels - x = self.mel_conv(mel) + mel = self.mel_conv(mel) if g is not None: # adding expanded speaker embedding - x = x + self.cond(g) - + x = x + self.cond(g) + x = torch.cat([x, mel], dim=1) + i = 1 for up, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs),): x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) 
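            # U-Net-style skip connection: after each upsample, x is
            # concatenated with the matching downsampled source features
            # (deepest first via reversed(downs)) and refined by the
            # parallel res block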
x = up(x) From 5b3a3528613852b0c181f75d1d80609b4a99ed5c Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:01:50 -0500 Subject: [PATCH 19/46] small training loop optimizations removed unused discriminator and generator losses aggregation added a missing scaler update replaced non-functional gradient value clipping function with a standard torch implementation --- rvc/train/losses.py | 6 +++--- rvc/train/train.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/rvc/train/losses.py b/rvc/train/losses.py index b0d7cff51..2f506c80a 100644 --- a/rvc/train/losses.py +++ b/rvc/train/losses.py @@ -32,8 +32,8 @@ def discriminator_loss(disc_real_outputs, disc_generated_outputs): r_loss = torch.mean((1 - dr.float()) ** 2) g_loss = torch.mean(dg.float() ** 2) - r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) + #r_losses.append(r_loss.item()) + #g_losses.append(g_loss.item()) loss += r_loss + g_loss return loss, r_losses, g_losses @@ -50,7 +50,7 @@ def generator_loss(disc_outputs): loss = 0 for dg in disc_outputs: l = torch.mean((1 - dg.float()) ** 2) - gen_losses.append(l.item()) + #gen_losses.append(l.item()) loss += l return loss, gen_losses diff --git a/rvc/train/train.py b/rvc/train/train.py index c6dc7b332..0f36eeedb 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -625,12 +625,13 @@ def train_and_evaluate( # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) # Discriminator backward and update - epoch_disc_sum += loss_disc.item() + epoch_disc_sum += loss_disc optim_d.zero_grad() scaler.scale(loss_disc).backward() scaler.unscale_(optim_d) - grad_norm_d = commons.clip_grad_value(net_d.parameters(), None) + grad_norm_d = torch.nn.utils.clip_grad_norm_(net_d.parameters(), max_norm=1000.0) scaler.step(optim_d) + scaler.update() # Generator backward and update with autocast(enabled=use_amp): @@ -652,11 +653,11 @@ def train_and_evaluate( "value": loss_gen_all, "epoch": epoch, } - epoch_gen_sum += loss_gen_all.item() + epoch_gen_sum += loss_gen_all optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) - grad_norm_g = commons.clip_grad_value(net_g.parameters(), None) + grad_norm_g = torch.nn.utils.clip_grad_norm_(net_g.parameters(), max_norm=1000.0) scaler.step(optim_g) scaler.update() @@ -666,8 +667,8 @@ def train_and_evaluate( # Logging and checkpointing if rank == 0: - disc_loss_queue.append(epoch_disc_sum / len(train_loader)) - gen_loss_queue.append(epoch_gen_sum / len(train_loader)) + disc_loss_queue.append(epoch_disc_sum.item() / len(train_loader)) + gen_loss_queue.append(epoch_gen_sum.item() / len(train_loader)) # used for tensorboard chart - all/mel mel = spec_to_mel_torch( @@ -709,8 +710,8 @@ def train_and_evaluate( "loss/g/total": loss_gen_all, "loss/d/total": loss_disc, "learning_rate": lr, - "grad/norm_d": grad_norm_d, - "grad/norm_g": grad_norm_g, + "grad/norm_d": grad_norm_d.item(), + "grad/norm_g": grad_norm_g.item(), "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl, From 7ae02257b06f5b13369a9db7763139e526ad4710 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:25:01 -0500 Subject: [PATCH 20/46] added envelope loss function added average losses over 50 steps and logging --- rvc/train/losses.py | 8 ++++++ rvc/train/train.py | 60 ++++++++++++++++++++++++++++++++++++--------- 2 
files changed, 57 insertions(+), 11 deletions(-) diff --git a/rvc/train/losses.py b/rvc/train/losses.py index 2f506c80a..b78c22daa 100644 --- a/rvc/train/losses.py +++ b/rvc/train/losses.py @@ -90,3 +90,11 @@ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): loss = kl / torch.sum(z_mask) return loss + +MaxPool = torch.nn.MaxPool1d(160) + +def envelope_loss(y, y_g): + loss = 0 + loss += torch.mean(torch.abs(MaxPool( y) - MaxPool( y_g))) + loss += torch.mean(torch.abs(MaxPool(-y) - MaxPool(-y_g))) + return loss \ No newline at end of file diff --git a/rvc/train/train.py b/rvc/train/train.py index 0f36eeedb..867762308 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -6,6 +6,7 @@ import torch import datetime +import math from collections import deque from distutils.util import strtobool from random import randint, shuffle @@ -45,6 +46,7 @@ generator_loss, generator_loss_scaled, kl_loss, + envelope_loss ) from mel_processing import ( mel_spectrogram_torch, @@ -102,8 +104,16 @@ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} training_file_path = os.path.join(experiment_dir, "training_data.json") -gen_loss_queue = deque(maxlen=10) -disc_loss_queue = deque(maxlen=10) +avg_losses={ + "gen_loss_queue": deque(maxlen=10), + "disc_loss_queue": deque(maxlen=10), + "disc_loss_50": deque(maxlen=50), + "env_loss_50": deque(maxlen=50), + "fm_loss_50": deque(maxlen=50), + "kl_loss_50": deque(maxlen=50), + "mel_loss_50": deque(maxlen=50), + "gen_loss_50": deque(maxlen=50), +} import logging @@ -632,12 +642,15 @@ def train_and_evaluate( grad_norm_d = torch.nn.utils.clip_grad_norm_(net_d.parameters(), max_norm=1000.0) scaler.step(optim_d) scaler.update() + if not math.isfinite(grad_norm_d): + print('\nWarning: grad_norm_d is NaN or Inf') # Generator backward and update with autocast(enabled=use_amp): _, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) with autocast(enabled=False): loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 + loss_env = envelope_loss(wave, y_hat) loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl loss_fm = feature_loss(fmap_r, fmap_g) #if vocoder == "HiFi-GAN": @@ -645,7 +658,7 @@ def train_and_evaluate( #else: # loss_gen, _ = generator_loss_scaled(y_d_hat_g) loss_gen, _ = generator_loss(y_d_hat_g) - loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env if loss_gen_all < lowest_value["value"]: lowest_value = { @@ -660,15 +673,42 @@ def train_and_evaluate( grad_norm_g = torch.nn.utils.clip_grad_norm_(net_g.parameters(), max_norm=1000.0) scaler.step(optim_g) scaler.update() + if not math.isfinite(grad_norm_g): + print('\n Warning: grad_norm_g is NaN or Inf') global_step += 1 + + # queue for rolling losses over 50 steps + avg_losses["disc_loss_50"].append(loss_disc.detach()) + avg_losses["env_loss_50"].append(loss_env.detach()) + avg_losses["fm_loss_50"].append(loss_fm.detach()) + avg_losses["kl_loss_50"].append(loss_kl.detach()) + avg_losses["mel_loss_50"].append(loss_mel.detach()) + avg_losses["gen_loss_50"].append(loss_gen_all.detach()) + + if rank == 0 and global_step % 50 == 0: + # logging rolling averages + scalar_dict = { + "loss_avg_50/d/total": torch.mean(torch.stack(list(avg_losses["disc_loss_50"]))), + "loss_avg_50/g/env": torch.mean(torch.stack(list(avg_losses["env_loss_50"]))), + "loss_avg_50/g/fm": torch.mean(torch.stack(list(avg_losses["fm_loss_50"]))), + "loss_avg_50/g/kl": torch.mean(torch.stack(list(avg_losses["kl_loss_50"]))), + "loss_avg_50/g/mel": 
torch.mean(torch.stack(list(avg_losses["mel_loss_50"]))), + "loss_avg_50/g/total": torch.mean(torch.stack(list(avg_losses["gen_loss_50"]))), + } + summarize( + writer=writer, + global_step=global_step, + scalars=scalar_dict, + ) + pbar.update(1) # Logging and checkpointing if rank == 0: - disc_loss_queue.append(epoch_disc_sum.item() / len(train_loader)) - gen_loss_queue.append(epoch_gen_sum.item() / len(train_loader)) + avg_losses["disc_loss_queue"].append(epoch_disc_sum.item() / len(train_loader)) + avg_losses["gen_loss_queue"].append(epoch_gen_sum.item() / len(train_loader)) # used for tensorboard chart - all/mel mel = spec_to_mel_torch( @@ -702,10 +742,7 @@ def train_and_evaluate( y_hat_mel = y_hat_mel.half() lr = optim_g.param_groups[0]["lr"] - if loss_mel > 75: - loss_mel = 75 - if loss_kl > 9: - loss_kl = 9 + scalar_dict = { "loss/g/total": loss_gen_all, "loss/d/total": loss_disc, @@ -715,8 +752,9 @@ def train_and_evaluate( "loss/g/fm": loss_fm, "loss/g/mel": loss_mel, "loss/g/kl": loss_kl, - "loss_avg/disc": np.mean(disc_loss_queue), - "loss_avg/gen": np.mean(gen_loss_queue) + "loss/g/env": loss_env, + "loss_avg_epoch/disc": np.mean(avg_losses["disc_loss_queue"]), + "loss_avg_epoch/gen": np.mean(avg_losses["gen_loss_queue"]), } # commented out # scalar_dict.update({f"loss/g/{i}": v for i, v in enumerate(losses_gen)}) From 98215e6e5ff11c594b0f187f35ac036ab48307aa Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Thu, 19 Dec 2024 20:32:23 -0500 Subject: [PATCH 21/46] added inplace to activation functions to optimize memory use flushing torch cache before the model save to avoid an extra 2.5GB VRAM consumed for no reason --- rvc/lib/algorithm/discriminators.py | 4 ++-- rvc/train/train.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py index ecef449a5..8545c0129 100644 --- a/rvc/lib/algorithm/discriminators.py +++ b/rvc/lib/algorithm/discriminators.py @@ -73,7 +73,7 @@ def __init__(self, use_spectral_norm: bool = False): ] ) self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1)) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): """ @@ -138,7 +138,7 @@ def __init__( ) self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): """ diff --git a/rvc/train/train.py b/rvc/train/train.py index 867762308..8c3b642e7 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -704,6 +704,9 @@ def train_and_evaluate( pbar.update(1) + with torch.no_grad(): + torch.cuda.empty_cache() + # Logging and checkpointing if rank == 0: @@ -965,6 +968,9 @@ def train_and_evaluate( if done: os._exit(2333333) + + with torch.no_grad(): + torch.cuda.empty_cache() def check_overtraining(smoothed_loss_history, threshold, epsilon=0.004): From 3fef20f91d3c613d7ec86d48596aa37462b65f03 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 20 Dec 2024 04:14:09 -0500 Subject: [PATCH 22/46] attempt to bring CPU usage down --- rvc/lib/algorithm/commons.py | 2 +- rvc/lib/algorithm/residuals.py | 2 +- rvc/train/mel_processing.py | 70 +++++++++++++++++----------------- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/rvc/lib/algorithm/commons.py b/rvc/lib/algorithm/commons.py index 
2524abc41..14cd065ef 100644 --- a/rvc/lib/algorithm/commons.py +++ b/rvc/lib/algorithm/commons.py @@ -98,7 +98,7 @@ def rand_slice_segments(x, x_lengths=None, segment_size=4): if x_lengths is None: x_lengths = t ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long) ret = slice_segments(x, ids_str, segment_size, dim=3) return ret, ids_str diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py index 7a934285f..e6d8ee2d6 100644 --- a/rvc/lib/algorithm/residuals.py +++ b/rvc/lib/algorithm/residuals.py @@ -76,7 +76,7 @@ def forward(self, x, *args, reverse=False, **kwargs): """ x = torch.flip(x, [1]) if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device) return x, logdet else: return x diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index b3bb74f87..6b3cb0567 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -146,7 +146,7 @@ def mel_spectrogram_torch( return melspec -def compute_window_length(n_mels: int, sample_rate: int): +def compute_window_length(n_mels: int, sample_rate: int) -> int: f_min = 0 f_max = sample_rate / 2 window_length_seconds = 8 * n_mels / (f_max - f_min) @@ -159,66 +159,64 @@ class MultiScaleMelSpectrogramLoss(torch.nn.Module): def __init__( self, sample_rate: int = 24000, - n_mels=[5, 10, 20, 40, 80, 160, 320, 480], + n_mels: list[int]=[5, 10, 20, 40, 80, 160, 320, 480], loss_fn=torch.nn.L1Loss(), ): super().__init__() self.sample_rate = sample_rate self.loss_fn = loss_fn self.log_base = torch.log(torch.tensor(10.0)) - self.stft_params = {} - self.mel_banks = {} + self.stft_params: list[tuple] = [] + self.hann_window: dict[int, torch.Tensor] = {} + self.mel_banks: dict[int, torch.Tensor] = {} window_lengths = [compute_window_length(mel, sample_rate) for mel in n_mels] - # print(window_lengths) for n_mels, window_length in zip(n_mels, window_lengths): - self.stft_params[n_mels] = { - "n_mels": n_mels, - "window_length": window_length, - "hop_length": self.sample_rate // 100, - } - self.mel_banks[n_mels] = torch.from_numpy( - librosa_mel_fn( - sr=self.sample_rate, - n_mels=n_mels, - n_fft=window_length, - fmin=0, - fmax=None, - ) - ) + self.stft_params.append((n_mels, window_length, self.sample_rate // 100)) def mel_spectrogram( self, - wav, - n_mels, - window_length, - hop_length, - ): + wav: torch.Tensor, + n_mels: list[int], + window_length: int, + hop_length: int, + ) -> torch.Tensor: + dtype_device = str(wav.dtype) + "_" + str(wav.device) + fmax_dtype_device = str(window_length) + "_" + dtype_device + if fmax_dtype_device not in self.hann_window: + self.hann_window[fmax_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=wav.dtype) wav = wav.squeeze(1) # -> torch(B, T) - window = torch.hann_window(window_length).to(wav.device).to(wav.dtype) stft = torch.stft( wav.float(), n_fft=window_length, hop_length=hop_length, - window=window, + window=self.hann_window[fmax_dtype_device], return_complex=True, ) # -> torch (B, window_length // 2 + 1, (T - window_length)/hop_length + 1) magnitude = torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6) - mel_basis = self.mel_banks[n_mels].to( - wav.device - ) # torch(n_mels, window_length // 2 + 1) + if fmax_dtype_device not in self.mel_banks: + self.mel_banks[fmax_dtype_device] = 
torch.from_numpy( + librosa_mel_fn( + sr=self.sample_rate, + n_mels=n_mels, + n_fft=window_length, + fmin=0, + fmax=None, + ) + ).to(device=wav.device) + mel_basis = self.mel_banks[fmax_dtype_device] # torch(n_mels, window_length // 2 + 1) mel_spectrogram = torch.matmul( mel_basis, magnitude ) # torch(B, n_mels, stft.frames) return mel_spectrogram - def forward(self, real, fake): # real: torch(B, 1, T) , fake: torch(B, 1, T) + def forward(self, real: torch.Tensor, fake: torch.Tensor): # real: torch(B, 1, T) , fake: torch(B, 1, T) loss = 0.0 - for p in self.stft_params.values(): - real_mels = self.mel_spectrogram(real, **p) - fake_mels = self.mel_spectrogram(fake, **p) - real_logmels = torch.log(real_mels.clamp(min=1e-5).pow(1)) / self.log_base - fake_logmels = torch.log(fake_mels.clamp(min=1e-5).pow(1)) / self.log_base + for p in self.stft_params: + real_mels = self.mel_spectrogram(real, *p) + fake_mels = self.mel_spectrogram(fake, *p) + real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base + fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base loss += self.loss_fn(real_logmels, fake_logmels) - return loss + return loss \ No newline at end of file From b19042903b052d4b746cbeb6d1c51cb58adc5b22 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:40:56 -0500 Subject: [PATCH 23/46] corrected mel bank caching --- rvc/train/mel_processing.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index 6b3cb0567..7e7d49c17 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -153,7 +153,6 @@ def compute_window_length(n_mels: int, sample_rate: int) -> int: window_length = int(window_length_seconds * sample_rate) return 2 ** (window_length.bit_length() - 1) - class MultiScaleMelSpectrogramLoss(torch.nn.Module): def __init__( @@ -170,33 +169,38 @@ def __init__( self.hann_window: dict[int, torch.Tensor] = {} self.mel_banks: dict[int, torch.Tensor] = {} - window_lengths = [compute_window_length(mel, sample_rate) for mel in n_mels] - - for n_mels, window_length in zip(n_mels, window_lengths): - self.stft_params.append((n_mels, window_length, self.sample_rate // 100)) + self.stft_params = [(mel, compute_window_length(mel, sample_rate), self.sample_rate // 100) for mel in n_mels] def mel_spectrogram( self, wav: torch.Tensor, - n_mels: list[int], + n_mels: int, window_length: int, hop_length: int, ) -> torch.Tensor: + # IDs for caching dtype_device = str(wav.dtype) + "_" + str(wav.device) - fmax_dtype_device = str(window_length) + "_" + dtype_device - if fmax_dtype_device not in self.hann_window: - self.hann_window[fmax_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=wav.dtype) + win_dtype_device = str(window_length) + "_" + dtype_device + mel_dtype_device = str(n_mels) + "_" + dtype_device + # caching hann window + if win_dtype_device not in self.hann_window: + self.hann_window[win_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=wav.dtype) + wav = wav.squeeze(1) # -> torch(B, T) + stft = torch.stft( - wav.float(), + wav, n_fft=window_length, hop_length=hop_length, - window=self.hann_window[fmax_dtype_device], + window=self.hann_window[win_dtype_device], return_complex=True, ) # -> torch (B, window_length // 2 + 1, (T - window_length)/hop_length + 1) + magnitude = torch.sqrt(stft.real.pow(2) + stft.imag.pow(2) + 1e-6) - if 
fmax_dtype_device not in self.mel_banks: - self.mel_banks[fmax_dtype_device] = torch.from_numpy( + + # caching mel filter + if mel_dtype_device not in self.mel_banks: + self.mel_banks[mel_dtype_device] = torch.from_numpy( librosa_mel_fn( sr=self.sample_rate, n_mels=n_mels, @@ -204,10 +208,10 @@ def mel_spectrogram( fmin=0, fmax=None, ) - ).to(device=wav.device) - mel_basis = self.mel_banks[fmax_dtype_device] # torch(n_mels, window_length // 2 + 1) + ).to(device=wav.device, dtype=wav.dtype) + mel_spectrogram = torch.matmul( - mel_basis, magnitude + self.mel_banks[mel_dtype_device], magnitude ) # torch(B, n_mels, stft.frames) return mel_spectrogram From e1504237fbd7c0935f2aec08648f0ececbdd79b0 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:41:54 -0500 Subject: [PATCH 24/46] found a way to do STFT operation on GPU while avoiding using torch.fft --- rvc/lib/zluda.py | 53 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py index 4af2e86c4..f7d511b0d 100644 --- a/rvc/lib/zluda.py +++ b/rvc/lib/zluda.py @@ -1,10 +1,57 @@ import torch +import numpy as np if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): - _torch_stft = torch.stft + class STFT: + def __init__(self): + self.device = "cuda" + self.fourier_bases = {} # Cache for Fourier bases + + def _get_fourier_basis(self, n_fft): + # Check if the basis for this n_fft is already cached + if n_fft in self.fourier_bases: + return self.fourier_bases[n_fft] + eye = np.eye(n_fft) + fourier_basis = np.fft.fft(eye) + # stack separated real and imaginary components and convert to torch tensor + cutoff = n_fft // 2 + 1 + real_imag_basis = np.vstack([np.real(fourier_basis[:cutoff]), np.imag(fourier_basis[:cutoff])]) + torch_basis = torch.FloatTensor(real_imag_basis).to(self.device) + # cache the tensor and return + self.fourier_bases[n_fft] = torch_basis + return torch_basis + def transform(self, input, n_fft, hop_length, window): + # fetch cached Fourier basis + fourier_basis = self._get_fourier_basis(n_fft) + # apply hann window to Fourier basis + fourier_basis = fourier_basis * window + # pad input to center with reflect + pad_amount = n_fft // 2 + input = torch.nn.functional.pad(input, (pad_amount, pad_amount), mode='reflect') + # separate input into n_fft-sized frames + input_frames = input.unfold(1, n_fft, hop_length).permute(0, 2, 1) + # apply fft to each frame + fourier_transform = torch.matmul(fourier_basis, input_frames) + cutoff = n_fft // 2 + 1 + return torch.complex(fourier_transform[:, :cutoff, :], fourier_transform[:, cutoff:, :]) + stft = STFT() + _torch_stft = torch.stft + def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): - return _torch_stft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) + # only optimizing a specific call from rvc.train.mel_processing.MultiScaleMelSpectrogramLoss + if (kwargs.get('win_length') == None + and kwargs.get("center") == None + and kwargs.get('return_complex') == True): + # use GPU accelerated calculation + return stft.transform( + input, + kwargs.get("n_fft"), + kwargs.get("hop_length"), + window) + else: + # simply do the operation on CPU + return _torch_stft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) def z_jit(f, *_, **__): f.graph = torch._C.Graph() @@ -17,4 +64,4 @@ def z_jit(f, *_, **__): torch.backends.cudnn.enabled = False 
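    # the flags below leave only the math SDP backend enabled; the fused
    # flash/mem-efficient attention kernels are assumed unavailable under ZLUDA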
torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_math_sdp(True) - torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) \ No newline at end of file From c99ed992e6b96c95a7614151f77f79128ad3ddf6 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Fri, 20 Dec 2024 22:31:29 -0500 Subject: [PATCH 25/46] removed numpy use, simply doing the fft call using CPU to init the tensor --- rvc/lib/zluda.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/rvc/lib/zluda.py b/rvc/lib/zluda.py index f7d511b0d..43ef3e6dd 100644 --- a/rvc/lib/zluda.py +++ b/rvc/lib/zluda.py @@ -1,5 +1,4 @@ import torch -import numpy as np if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"): class STFT: @@ -11,15 +10,13 @@ def _get_fourier_basis(self, n_fft): # Check if the basis for this n_fft is already cached if n_fft in self.fourier_bases: return self.fourier_bases[n_fft] - eye = np.eye(n_fft) - fourier_basis = np.fft.fft(eye) + fourier_basis = torch.fft.fft(torch.eye(n_fft, device="cpu")).to(self.device) # stack separated real and imaginary components and convert to torch tensor cutoff = n_fft // 2 + 1 - real_imag_basis = np.vstack([np.real(fourier_basis[:cutoff]), np.imag(fourier_basis[:cutoff])]) - torch_basis = torch.FloatTensor(real_imag_basis).to(self.device) + fourier_basis = torch.cat([fourier_basis.real[:cutoff], fourier_basis.imag[:cutoff]], dim=0) # cache the tensor and return - self.fourier_bases[n_fft] = torch_basis - return torch_basis + self.fourier_bases[n_fft] = fourier_basis + return fourier_basis def transform(self, input, n_fft, hop_length, window): # fetch cached Fourier basis fourier_basis = self._get_fourier_basis(n_fft) From be21db9527115fb24590a388644396a78d91ebe6 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sat, 21 Dec 2024 05:42:45 -0500 Subject: [PATCH 26/46] trying to make spectrograms with FP16 was a bad idea --- rvc/train/mel_processing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index 7e7d49c17..fafa6af37 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -184,12 +184,12 @@ def mel_spectrogram( mel_dtype_device = str(n_mels) + "_" + dtype_device # caching hann window if win_dtype_device not in self.hann_window: - self.hann_window[win_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=wav.dtype) + self.hann_window[win_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=torch.float32) wav = wav.squeeze(1) # -> torch(B, T) stft = torch.stft( - wav, + wav.float(), n_fft=window_length, hop_length=hop_length, window=self.hann_window[win_dtype_device], @@ -208,7 +208,7 @@ def mel_spectrogram( fmin=0, fmax=None, ) - ).to(device=wav.device, dtype=wav.dtype) + ).to(device=wav.device, dtype=torch.float32) mel_spectrogram = torch.matmul( self.mel_banks[mel_dtype_device], magnitude From 530e82f7808aa28acf435c561d52642aa23bd732 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sat, 21 Dec 2024 14:22:04 +0100 Subject: [PATCH 27/46] update description --- assets/i18n/languages/en_US.json | 2 +- tabs/train/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index f80c90f7a..8ef05c22a 100644 --- a/assets/i18n/languages/en_US.json +++ 
b/assets/i18n/languages/en_US.json @@ -323,5 +323,5 @@ "The name that will appear in the model information.": "The name that will appear in the model information.", "Set name": "Set name", "Vocoder": "Vocoder", - "Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only).": "Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only)." + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." } \ No newline at end of file diff --git a/tabs/train/train.py b/tabs/train/train.py index a58b5b020..81f590aa0 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -320,7 +320,7 @@ def train_tab(): ) vocoder = gr.Radio( label=i18n("Vocoder"), - info=i18n("Vocoder for audio synthesis, HiFi-GAN (default, works with all clients) or MRF HiFi-GAN (experimental, higher fidelity, Applio-only)."), + info=i18n("Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)."), choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], value="HiFi-GAN", interactive=True, From 6f956ad2a42a75de7a65137c2cc564c57640cc75 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sat, 21 Dec 2024 14:30:28 +0100 Subject: [PATCH 28/46] organize generators --- .../{generators.py => generators/hifigan.py} | 10 +- .../{hifigan.py => generators/hifigan_mrf.py} | 71 ++++----- .../{nsf.py => generators/hifigan_nsf.py} | 14 +- .../algorithm/{ => generators}/refinegan.py | 144 +++++++++++------- rvc/lib/algorithm/synthesizers.py | 27 ++-- 5 files changed, 152 insertions(+), 114 deletions(-) rename rvc/lib/algorithm/{generators.py => generators/hifigan.py} (97%) rename rvc/lib/algorithm/{hifigan.py => generators/hifigan_mrf.py} (87%) rename rvc/lib/algorithm/{nsf.py => generators/hifigan_nsf.py} (95%) rename rvc/lib/algorithm/{ => generators}/refinegan.py (76%) diff --git a/rvc/lib/algorithm/generators.py b/rvc/lib/algorithm/generators/hifigan.py similarity index 97% rename from rvc/lib/algorithm/generators.py rename to rvc/lib/algorithm/generators/hifigan.py index e0c0ebbcd..0d5c84310 100644 --- a/rvc/lib/algorithm/generators.py +++ b/rvc/lib/algorithm/generators/hifigan.py @@ -8,7 +8,7 @@ from rvc.lib.algorithm.commons import init_weights -class Generator(torch.nn.Module): +class HiFiGANGenerator(torch.nn.Module): """Generator for synthesizing audio. Args: @@ -32,7 +32,7 @@ def __init__( upsample_kernel_sizes: list, gin_channels: int = 0, ): - super(Generator, self).__init__() + super(HiFiGANGenerator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) self.conv_pre = torch.nn.Conv1d( @@ -135,7 +135,7 @@ def __init__( self.voiced_threshold = voiced_threshold self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics - def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor: + def _compute_voiced_unvoiced(self, f0: torch.Tensor): """ Generate a binary mask to indicate voiced/unvoiced frames. 
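The mask described above is a plain thresholded indicator; a minimal standalone equivalent, assuming f0 in Hz with 0 marking unvoiced frames:

    import torch

    def voiced_mask(f0: torch.Tensor, threshold: float = 0.0) -> torch.Tensor:
        # 1.0 for voiced frames (f0 above threshold), 0.0 for unvoiced
        return (f0 > threshold).float()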
@@ -145,9 +145,7 @@ def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor: uv_mask = (f0 > self.voiced_threshold).float() return uv_mask - def _generate_sine_wave( - self, f0: torch.Tensor, upsampling_factor: int - ) -> torch.Tensor: + def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): """ Generate sine waves for the fundamental frequency and its harmonics. diff --git a/rvc/lib/algorithm/hifigan.py b/rvc/lib/algorithm/generators/hifigan_mrf.py similarity index 87% rename from rvc/lib/algorithm/hifigan.py rename to rvc/lib/algorithm/generators/hifigan_mrf.py index 851b0b096..0a959e58f 100644 --- a/rvc/lib/algorithm/hifigan.py +++ b/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -1,19 +1,18 @@ import math import numpy as np import torch -import torch.nn as nn -import torch.nn.functional as F from torch.nn.utils import remove_weight_norm from torch.nn.utils.parametrizations import weight_norm from typing import Optional LRELU_SLOPE = 0.1 -class MRFLayer(nn.Module): + +class MRFLayer(torch.nn.Module): def __init__(self, channels, kernel_size, dilation): super().__init__() self.conv1 = weight_norm( - nn.Conv1d( + torch.nn.Conv1d( channels, channels, kernel_size, @@ -22,15 +21,15 @@ def __init__(self, channels, kernel_size, dilation): ) ) self.conv2 = weight_norm( - nn.Conv1d( + torch.nn.Conv1d( channels, channels, kernel_size, padding=kernel_size // 2, dilation=1 ) ) def forward(self, x): - y = F.leaky_relu(x, LRELU_SLOPE) + y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) y = self.conv1(y) - y = F.leaky_relu(y, LRELU_SLOPE) + y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE) y = self.conv2(y) return x + y @@ -39,10 +38,10 @@ def remove_weight_norm(self): remove_weight_norm(self.conv2) -class MRFBlock(nn.Module): +class MRFBlock(torch.nn.Module): def __init__(self, channels, kernel_size, dilations): super().__init__() - self.layers = nn.ModuleList() + self.layers = torch.nn.ModuleList() for dilation in dilations: self.layers.append(MRFLayer(channels, kernel_size, dilation)) @@ -55,7 +54,8 @@ def remove_weight_norm(self): for layer in self.layers: layer.remove_weight_norm() -class SineGen(torch.nn.Module): + +class SineGenerator(torch.nn.Module): """Definition of sine generator SineGen(samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, @@ -81,7 +81,7 @@ def __init__( noise_std=0.003, voiced_threshold=0, ): - super(SineGen, self).__init__() + super(SineGenerator, self).__init__() self.sine_amp = sine_amp self.noise_std = noise_std self.harmonic_num = harmonic_num @@ -135,9 +135,9 @@ def forward(self, f0): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) sine_waves = self._f02sine(f0_buf) * self.sine_amp - + uv = self._f02uv(f0) - + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) @@ -160,7 +160,7 @@ def __init__( self.noise_std = add_noise_std # to produce sine waveforms - self.l_sin_gen = SineGen( + self.l_sin_gen = SineGenerator( sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold ) @@ -175,7 +175,8 @@ def forward(self, x): return sine_merge, None, None -class HiFiGAN(nn.Module): + +class HiFiGANMRFGenerator(torch.nn.Module): def __init__( self, in_channel, @@ -191,17 +192,17 @@ def __init__( super().__init__() self.num_kernels = len(resblock_kernel_sizes) - self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF(sample_rate, 
harmonic_num) self.conv_pre = weight_norm( - nn.Conv1d( + torch.nn.Conv1d( in_channel, upsample_initial_channel, kernel_size=7, stride=1, padding=3 ) ) - self.upsamples = nn.ModuleList() - self.noise_convs = nn.ModuleList() - + self.upsamples = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + stride_f0s = [ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 for i in range(len(upsample_rates)) @@ -214,10 +215,10 @@ def __init__( padding = (k - u) // 2 else: padding = u // 2 + u % 2 - + self.upsamples.append( weight_norm( - nn.ConvTranspose1d( + torch.nn.ConvTranspose1d( upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), kernel_size=k, @@ -239,11 +240,11 @@ def __init__( # 1 1 0 """ stride = stride_f0s[i] - kernel = (1 if stride == 1 else stride * 2 - stride % 2) - padding = (0 if stride == 1 else (kernel - stride) // 2) - + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + self.noise_convs.append( - nn.Conv1d( + torch.nn.Conv1d( 1, upsample_initial_channel // (2 ** (i + 1)), kernel_size=kernel, @@ -251,11 +252,11 @@ def __init__( padding=padding, ) ) - self.mrfs = nn.ModuleList() + self.mrfs = torch.nn.ModuleList() for i in range(len(self.upsamples)): channel = upsample_initial_channel // (2 ** (i + 1)) self.mrfs.append( - nn.ModuleList( + torch.nn.ModuleList( [ MRFBlock(channel, kernel_size=k, dilations=d) for k, d in zip(resblock_kernel_sizes, resblock_dilations) @@ -263,7 +264,7 @@ def __init__( ) ) self.conv_post = weight_norm( - nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3) + torch.nn.Conv1d(channel, 1, kernel_size=7, stride=1, padding=3) ) if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) @@ -272,14 +273,14 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) har_source, _, _ = self.m_source(f0) har_source = har_source.transpose(-1, -2) - + x = self.conv_pre(x) if g is not None: - x = x + self.cond(g) - + x = x + self.cond(g) + for up, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): - x = F.leaky_relu(x, LRELU_SLOPE) + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) x = up(x) x_source = noise_conv(har_source) x = x + x_source @@ -287,7 +288,7 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): for layer in mrf: xs += layer(x) x = xs / self.num_kernels - x = F.leaky_relu(x) + x = torch.nn.functional.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) return x @@ -298,4 +299,4 @@ def remove_weight_norm(self): remove_weight_norm(up) for mrf in self.mrfs: mrf.remove_weight_norm() - remove_weight_norm(self.conv_post) \ No newline at end of file + remove_weight_norm(self.conv_post) diff --git a/rvc/lib/algorithm/nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py similarity index 95% rename from rvc/lib/algorithm/nsf.py rename to rvc/lib/algorithm/generators/hifigan_nsf.py index 9c9a1919a..133d8f7c4 100644 --- a/rvc/lib/algorithm/nsf.py +++ b/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -4,7 +4,7 @@ from torch.nn.utils.parametrizations import weight_norm from typing import Optional -from rvc.lib.algorithm.generators import SineGenerator +from rvc.lib.algorithm.generators.hifigan import SineGenerator from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock from rvc.lib.algorithm.commons import init_weights @@ -50,7 +50,7 @@ def forward(self, x: torch.Tensor, upsample_factor: int = 1): return 
sine_merge, None, None -class GeneratorNSF(torch.nn.Module): +class HiFiGANNSFGenerator(torch.nn.Module): """ Generator for synthesizing audio using the NSF (Neural Source Filter) approach. @@ -79,7 +79,7 @@ def __init__( sr: int, is_half: bool = False, ): - super(GeneratorNSF, self).__init__() + super(HiFiGANNSFGenerator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) @@ -111,7 +111,7 @@ def __init__( padding = (k - u) // 2 else: padding = u // 2 + u % 2 - + self.ups.append( weight_norm( torch.nn.ConvTranspose1d( @@ -136,9 +136,9 @@ def __init__( # 1 1 0 """ stride = stride_f0s[i] - kernel = (1 if stride == 1 else stride * 2 - stride % 2) - padding = (0 if stride == 1 else (kernel - stride) // 2) - + kernel = 1 if stride == 1 else stride * 2 - stride % 2 + padding = 0 if stride == 1 else (kernel - stride) // 2 + self.noise_convs.append( torch.nn.Conv1d( 1, diff --git a/rvc/lib/algorithm/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py similarity index 76% rename from rvc/lib/algorithm/refinegan.py rename to rvc/lib/algorithm/generators/refinegan.py index 6dc166288..138ec892a 100644 --- a/rvc/lib/algorithm/refinegan.py +++ b/rvc/lib/algorithm/generators/refinegan.py @@ -1,15 +1,13 @@ -from typing import Callable -import math import numpy as np import torch -from torch import nn -from torch.nn import functional as F from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations -def get_padding(kernel_size: int, dilation: int = 1) -> int: + +def get_padding(kernel_size: int, dilation: int = 1): return int((kernel_size * dilation - dilation) / 2) + class ResBlock(torch.nn.Module): def __init__( self, @@ -26,10 +24,10 @@ def __init__( self.in_channels = in_channels self.out_channels = out_channels - self.convs1 = nn.ModuleList( + self.convs1 = torch.nn.ModuleList( [ weight_norm( - nn.Conv1d( + torch.nn.Conv1d( in_channels=in_channels if idx == 0 else out_channels, out_channels=out_channels, kernel_size=kernel_size, @@ -43,10 +41,10 @@ def __init__( ) self.convs1.apply(self.init_weights) - self.convs2 = nn.ModuleList( + self.convs2 = torch.nn.ModuleList( [ weight_norm( - nn.Conv1d( + torch.nn.Conv1d( in_channels=out_channels, out_channels=out_channels, kernel_size=kernel_size, @@ -62,9 +60,9 @@ def __init__( def forward(self, x): for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)): - xt = F.leaky_relu(x, self.leaky_relu_slope) + xt = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope) xt = c1(xt) - xt = F.leaky_relu(xt, self.leaky_relu_slope) + xt = torch.nn.functional.leaky_relu(xt, self.leaky_relu_slope) xt = c2(xt) if idx != 0 or self.in_channels == self.out_channels: @@ -80,30 +78,30 @@ def remove_parametrizations(self): remove_parametrizations(c2) def init_weights(self, m): - if type(m) == nn.Conv1d: + if type(m) == torch.nn.Conv1d: m.weight.data.normal_(0, 0.01) m.bias.data.fill_(0.0) -class AdaIN(nn.Module): +class AdaIN(torch.nn.Module): def __init__( self, *, channels: int, leaky_relu_slope: float = 0.2, - ) -> None: + ): super().__init__() - self.weight = nn.Parameter(torch.ones(channels)) - self.activation = nn.LeakyReLU(leaky_relu_slope) + self.weight = torch.nn.Parameter(torch.ones(channels)) + self.activation = torch.nn.LeakyReLU(leaky_relu_slope) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor): gaussian = torch.randn_like(x) * self.weight[None, :, None] return self.activation(x + gaussian) -class 
ParallelResBlock(nn.Module): +class ParallelResBlock(torch.nn.Module): def __init__( self, *, @@ -112,13 +110,13 @@ def __init__( kernel_sizes: int = (3, 7, 11), dilation: tuple[int] = (1, 3, 5), leaky_relu_slope: float = 0.2, - ) -> None: + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels - self.input_conv = nn.Conv1d( + self.input_conv = torch.nn.Conv1d( in_channels=in_channels, out_channels=out_channels, kernel_size=7, @@ -126,9 +124,9 @@ def __init__( padding=3, ) - self.blocks = nn.ModuleList( + self.blocks = torch.nn.ModuleList( [ - nn.Sequential( + torch.nn.Sequential( AdaIN(channels=out_channels), ResBlock( in_channels=out_channels, @@ -143,7 +141,7 @@ def __init__( ] ) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, x: torch.Tensor): x = self.input_conv(x) results = [block(x) for block in self.blocks] @@ -154,7 +152,8 @@ def remove_parametrizations(self): for block in self.blocks: block[1].remove_parametrizations() -class SineGen(torch.nn.Module): + +class SineGenerator(torch.nn.Module): """Definition of sine generator SineGen(samp_rate, harmonic_num = 0, sine_amp = 0.1, noise_std = 0.003, @@ -180,7 +179,7 @@ def __init__( noise_std=0.003, voiced_threshold=0, ): - super(SineGen, self).__init__() + super(SineGenerator, self).__init__() self.sine_amp = sine_amp self.noise_std = noise_std self.harmonic_num = harmonic_num @@ -234,15 +233,16 @@ def forward(self, f0): f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2) sine_waves = self._f02sine(f0_buf) * self.sine_amp - + uv = self._f02uv(f0) - + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 noise = noise_amp * torch.randn_like(sine_waves) sine_waves = sine_waves * uv + noise * (1 - uv) return sine_waves, uv, noise + class SourceModuleHnNSF(torch.nn.Module): def __init__( self, @@ -258,7 +258,7 @@ def __init__( self.noise_std = add_noise_std # to produce sine waveforms - self.l_sin_gen = SineGen( + self.l_sin_gen = SineGenerator( sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshold ) @@ -273,8 +273,8 @@ def forward(self, x): return sine_merge, None, None - -class RefineGANGenerator(nn.Module): + +class RefineGANGenerator(torch.nn.Module): def __init__( self, *, @@ -285,47 +285,71 @@ def __init__( num_mels: int = 128, start_channels: int = 16, gin_channels: int = 256, - ) -> None: + ): super().__init__() self.downsample_rates = downsample_rates self.upsample_rates = upsample_rates self.leaky_relu_slope = leaky_relu_slope - self.f0_upsample = nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num=8) # expands - self.source_conv = weight_norm(nn.Conv1d(in_channels=1, out_channels=start_channels, kernel_size=7, stride=1, padding=3,)) + self.source_conv = weight_norm( + torch.nn.Conv1d( + in_channels=1, + out_channels=start_channels, + kernel_size=7, + stride=1, + padding=3, + ) + ) channels = start_channels - self.downsample_blocks = nn.ModuleList([]) + self.downsample_blocks = torch.nn.ModuleList([]) for rate in downsample_rates: new_channels = channels * 2 self.downsample_blocks.append( - nn.Sequential( - nn.Upsample(scale_factor=1 / rate, mode="linear"), - ResBlock(in_channels=channels, out_channels=new_channels, kernel_size=7, dilation=(1, 3, 5), leaky_relu_slope=leaky_relu_slope,), + torch.nn.Sequential( + torch.nn.Upsample(scale_factor=1 / rate, mode="linear"), + ResBlock( + in_channels=channels, + 
out_channels=new_channels, + kernel_size=7, + dilation=(1, 3, 5), + leaky_relu_slope=leaky_relu_slope, + ), ) ) channels = new_channels - self.mel_conv = weight_norm(nn.Conv1d(in_channels=num_mels,out_channels=channels,kernel_size=7,stride=1,padding=3,)) - + self.mel_conv = weight_norm( + torch.nn.Conv1d( + in_channels=num_mels, + out_channels=channels, + kernel_size=7, + stride=1, + padding=3, + ) + ) + if gin_channels != 0: - self.cond = nn.Conv1d(256, channels, 1) - + self.cond = torch.nn.Conv1d(256, channels, 1) + channels *= 2 - self.upsample_blocks = nn.ModuleList([]) - self.upsample_conv_blocks = nn.ModuleList([]) + self.upsample_blocks = torch.nn.ModuleList([]) + self.upsample_conv_blocks = torch.nn.ModuleList([]) for rate in upsample_rates: new_channels = channels // 2 - self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear")) + self.upsample_blocks.append( + torch.nn.Upsample(scale_factor=rate, mode="linear") + ) self.upsample_conv_blocks.append( ParallelResBlock( @@ -339,9 +363,17 @@ def __init__( channels = new_channels - self.conv_post = weight_norm(nn.Conv1d(in_channels=channels, out_channels=1, kernel_size=7, stride=1, padding=3,)) + self.conv_post = weight_norm( + torch.nn.Conv1d( + in_channels=channels, + out_channels=1, + kernel_size=7, + stride=1, + padding=3, + ) + ) - def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> torch.Tensor: + def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) har_source, _, _ = self.m_source(f0) har_source = har_source.transpose(-1, -2) @@ -351,31 +383,35 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor=None) -> # making a downscaled version to match upscaler stages downs = [] for i, block in enumerate(self.downsample_blocks): - x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) downs.append(x) x = block(x) # expanding spectrogram from 192 to 256 channels mel = self.mel_conv(mel) - + if g is not None: # adding expanded speaker embedding - x = x + self.cond(g) + x = x + self.cond(g) x = torch.cat([x, mel], dim=1) i = 1 - for up, res, down in zip(self.upsample_blocks, self.upsample_conv_blocks, reversed(downs),): - x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + for up, res, down in zip( + self.upsample_blocks, + self.upsample_conv_blocks, + reversed(downs), + ): + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) x = up(x) x = torch.cat([x, down], dim=1) x = res(x) - x = F.leaky_relu(x, self.leaky_relu_slope, inplace=True) + x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) x = self.conv_post(x) x = torch.tanh(x) return x - def remove_parametrizations(self) -> None: + def remove_parametrizations(self): remove_parametrizations(self.source_conv) remove_parametrizations(self.mel_conv) remove_parametrizations(self.conv_post) @@ -384,4 +420,4 @@ def remove_parametrizations(self) -> None: block[1].remove_parametrizations() for block in self.upsample_conv_blocks: - block.remove_parametrizations() \ No newline at end of file + block.remove_parametrizations() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index 1cadfc103..d90afe9db 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -1,9 +1,9 @@ import torch from typing import Optional -from rvc.lib.algorithm.hifigan import HiFiGAN 
-from rvc.lib.algorithm.refinegan import RefineGANGenerator -from rvc.lib.algorithm.nsf import GeneratorNSF -from rvc.lib.algorithm.generators import Generator +from rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator +from rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator +from rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator +from rvc.lib.algorithm.generators.refinegan import RefineGANGenerator from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments from rvc.lib.algorithm.residuals import ResidualCouplingBlock from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder @@ -66,7 +66,7 @@ def __init__( super().__init__() self.segment_size = segment_size self.use_f0 = use_f0 - self.randomized = randomized + self.randomized = randomized self.enc_p = TextEncoder( inter_channels, @@ -82,7 +82,7 @@ def __init__( print(f"Using {vocoder} vocoder") if use_f0: if vocoder == "MRF HiFi-GAN": - self.dec = HiFiGAN( + self.dec = HiFiGANMRFGenerator( in_channel=inter_channels, upsample_initial_channel=upsample_initial_channel, upsample_rates=upsample_rates, @@ -95,13 +95,14 @@ def __init__( ) elif vocoder == "RefineGAN": self.dec = RefineGANGenerator( - sample_rate = sr, + sample_rate=sr, downsample_rates=upsample_rates[::-1], upsample_rates=upsample_rates, start_channels=16, - num_mels=inter_channels) + num_mels=inter_channels, + ) else: - self.dec = GeneratorNSF( + self.dec = HiFiGANNSFGenerator( inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, @@ -120,7 +121,7 @@ def __init__( print("RefineGAN does not support training without pitch guidance.") self.dec = None else: - self.dec = Generator( + self.dec = HiFiGANGenerator( inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, @@ -193,7 +194,9 @@ def forward( z_p = self.flow(z, y_mask, g=g) # regular old training method using random slices if self.randomized: - z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments( + z, y_lengths, self.segment_size + ) if self.use_f0: pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) o = self.dec(z_slice, pitchf, g=g) @@ -205,7 +208,7 @@ def forward( if self.use_f0: o = self.dec(z, pitchf, g=g) else: - o = self.dec(z, g=g) + o = self.dec(z, g=g) return o, None, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) else: return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) From 7cf38ae1bb461e5c4a7ff3a88b426b0f88ebed7d Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sat, 21 Dec 2024 13:31:55 -0500 Subject: [PATCH 29/46] added checkpointing for memory-efficient training --- assets/i18n/languages/en_US.json | 4 ++- core.py | 12 ++++++++- rvc/lib/algorithm/generators/hifigan_mrf.py | 28 +++++++++++++++------ rvc/lib/algorithm/generators/hifigan_nsf.py | 25 +++++++++++++----- rvc/lib/algorithm/generators/refinegan.py | 24 +++++++++++++----- rvc/lib/algorithm/synthesizers.py | 5 ++++ rvc/train/train.py | 4 ++- tabs/train/train.py | 7 ++++++ 8 files changed, 86 insertions(+), 23 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 8ef05c22a..57a5e37c1 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -323,5 +323,7 @@ "The name that will appear in the model information.": "The name that will appear in the model information.", "Set name": "Set name", "Vocoder": "Vocoder", - "Vocoder for audio synthesis: HiFi-GAN 
(default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).", + "Checkpointing": "Checkpointing", + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." } \ No newline at end of file diff --git a/core.py b/core.py index aa6971eeb..c9b34e5bc 100644 --- a/core.py +++ b/core.py @@ -513,6 +513,7 @@ def run_train_script( g_pretrained_path: str = None, d_pretrained_path: str = None, vocoder: str = "HiFi-GAN", + checkpointing: bool = False, ): if pretrained == True: @@ -552,7 +553,8 @@ def run_train_script( overtraining_detector, overtraining_threshold, cleanup, - vocoder + vocoder, + checkpointing ], ), ] @@ -1973,6 +1975,13 @@ def parse_arguments(): choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], default="HiFi-GAN", ) + train_parser.add_argument( + "--checkpointing", + type=str, + help="Enables memory-efficient training.", + choices=[True, False], + default="False", + ) train_parser.add_argument( "--save_every_epoch", type=int, @@ -2473,6 +2482,7 @@ def main(): g_pretrained_path=args.g_pretrained_path, d_pretrained_path=args.d_pretrained_path, vocoder=args.vocoder, + checkpointing=args.checkpointing, ) elif args.mode == "index": run_index_script( diff --git a/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc/lib/algorithm/generators/hifigan_mrf.py index 0a959e58f..3f96bb941 100644 --- a/rvc/lib/algorithm/generators/hifigan_mrf.py +++ b/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -3,6 +3,7 @@ import torch from torch.nn.utils import remove_weight_norm from torch.nn.utils.parametrizations import weight_norm +import torch.utils.checkpoint as checkpoint from typing import Optional LRELU_SLOPE = 0.1 @@ -188,9 +189,11 @@ def __init__( gin_channels, sample_rate, harmonic_num, + checkpointing=False, ): super().__init__() self.num_kernels = len(resblock_kernel_sizes) + self.checkpointing = checkpointing self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num) @@ -279,15 +282,24 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): if g is not None: x = x + self.cond(g) - for up, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): + for ups, mrf, noise_conv in zip(self.upsamples, self.mrfs, self.noise_convs): x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) - x = up(x) - x_source = 
noise_conv(har_source) - x = x + x_source - xs = 0 - for layer in mrf: - xs += layer(x) - x = xs / self.num_kernels + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, use_reentrant=False) + else: + x = ups(x) + + x += noise_conv(har_source) + + def mrf_sum(x, layers): + return sum(layer(x) for layer in layers) / self.num_kernels + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(mrf_sum, x, mrf, use_reentrant=False) + else: + x = mrf_sum(x, mrf) + x = torch.nn.functional.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) diff --git a/rvc/lib/algorithm/generators/hifigan_nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py index 133d8f7c4..500159755 100644 --- a/rvc/lib/algorithm/generators/hifigan_nsf.py +++ b/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -2,6 +2,7 @@ import torch from torch.nn.utils import remove_weight_norm from torch.nn.utils.parametrizations import weight_norm +import torch.utils.checkpoint as checkpoint from typing import Optional from rvc.lib.algorithm.generators.hifigan import SineGenerator @@ -78,11 +79,13 @@ def __init__( gin_channels: int, sr: int, is_half: bool = False, + checkpointing = False, ): super(HiFiGANNSFGenerator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) + self.checkpointing = checkpointing self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( sample_rate=sr, harmonic_num=0, is_half=is_half @@ -177,14 +180,24 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) - x = ups(x) + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, use_reentrant=False) + else: + x = ups(x) + x += noise_convs(har_source) - xs = sum( - self.resblocks[j](x) - for j in range(i * self.num_kernels, (i + 1) * self.num_kernels) - ) - x = xs / self.num_kernels + def resblock_forward(x, blocks): + return sum(block(x) for block in blocks) / len(blocks) + + blocks = self.resblocks[i * self.num_kernels:(i + 1) * self.num_kernels] + + # Checkpoint or regular computation for ResBlocks + if self.training and self.checkpointing: + x = checkpoint.checkpoint(resblock_forward, x, blocks, use_reentrant=False) + else: + x = resblock_forward(x, blocks) x = torch.nn.functional.leaky_relu(x) x = torch.tanh(self.conv_post(x)) diff --git a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py index 138ec892a..fe5cddf12 100644 --- a/rvc/lib/algorithm/generators/refinegan.py +++ b/rvc/lib/algorithm/generators/refinegan.py @@ -2,6 +2,7 @@ import torch from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations +import torch.utils.checkpoint as checkpoint def get_padding(kernel_size: int, dilation: int = 1): @@ -285,12 +286,14 @@ def __init__( num_mels: int = 128, start_channels: int = 16, gin_channels: int = 256, + checkpointing=False, ): super().__init__() self.downsample_rates = downsample_rates self.upsample_rates = upsample_rates self.leaky_relu_slope = leaky_relu_slope + self.checkpointing = checkpointing self.f0_upsample = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) self.m_source = SourceModuleHnNSF(sample_rate, harmonic_num=8) @@ -385,7 +388,10 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): for i, block in 
enumerate(self.downsample_blocks): x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) downs.append(x) - x = block(x) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(block, x, use_reentrant=False) + else: + x = block(x) # expanding spectrogram from 192 to 256 channels mel = self.mel_conv(mel) @@ -394,16 +400,22 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None): # adding expanded speaker embedding x = x + self.cond(g) x = torch.cat([x, mel], dim=1) - i = 1 - for up, res, down in zip( + + for ups, res, down in zip( self.upsample_blocks, self.upsample_conv_blocks, reversed(downs), ): x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) - x = up(x) - x = torch.cat([x, down], dim=1) - x = res(x) + + if self.training and self.checkpointing: + x = checkpoint.checkpoint(ups, x, use_reentrant=False) + x = torch.cat([x, down], dim=1) + x = checkpoint.checkpoint(res, x, use_reentrant=False) + else: + x = ups(x) + x = torch.cat([x, down], dim=1) + x = res(x) x = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope, inplace=True) x = self.conv_post(x) diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index d90afe9db..42962e7bc 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -61,6 +61,7 @@ def __init__( text_enc_hidden_dim: int = 768, vocoder: str = "HiFi-GAN", randomized: bool = True, + checkpointing: bool = False, **kwargs, ): super().__init__() @@ -92,6 +93,7 @@ def __init__( gin_channels=gin_channels, sample_rate=sr, harmonic_num=8, + checkpointing=checkpointing, ) elif vocoder == "RefineGAN": self.dec = RefineGANGenerator( @@ -100,6 +102,7 @@ def __init__( upsample_rates=upsample_rates, start_channels=16, num_mels=inter_channels, + checkpointing=checkpointing, ) else: self.dec = HiFiGANNSFGenerator( @@ -112,6 +115,7 @@ def __init__( gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"], + checkpointing=checkpointing, ) else: if vocoder == "MRF HiFi-GAN": @@ -129,6 +133,7 @@ def __init__( upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels, + checkpointing=checkpointing ) self.enc_q = PosteriorEncoder( spec_channels, diff --git a/rvc/train/train.py b/rvc/train/train.py index 8c3b642e7..3fa670617 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -76,6 +76,7 @@ overtraining_threshold = int(sys.argv[15]) cleanup = strtobool(sys.argv[16]) vocoder = sys.argv[17] +checkpointing = strtobool(sys.argv[18]) current_dir = os.getcwd() experiment_dir = os.path.join(current_dir, "logs", model_name) @@ -386,7 +387,8 @@ def run( use_f0=pitch_guidance == True, # converting 1/0 to True/False is_half=config.train.fp16_run and device.type == "cuda", sr=sample_rate, - vocoder=vocoder + vocoder=vocoder, + checkpointing=checkpointing, ).to(device) net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm).to(device) diff --git a/tabs/train/train.py b/tabs/train/train.py index 81f590aa0..33e41742a 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -636,6 +636,12 @@ def train_tab(): value=False, interactive=True, ) + checkpointing = gr.Checkbox( + label=i18n("Checkpointing"), + info=i18n("Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. 
It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate."), + value=False, + interactive=True, + ) pitch_guidance = gr.Checkbox( label=i18n("Pitch Guidance"), info=i18n( @@ -788,6 +794,7 @@ def enforce_terms(terms_accepted, *args): g_pretrained_path, d_pretrained_path, vocoder, + checkpointing ], outputs=[train_output_info], ) From 62bf923cd6603d87e6fc7cae6c8683d8113b5c8e Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sat, 21 Dec 2024 14:21:44 -0500 Subject: [PATCH 30/46] added checkpointing for discriminators as well --- rvc/lib/algorithm/discriminators.py | 36 ++++++++++++++++++++++------- rvc/train/train.py | 2 +- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py index 8545c0129..0759494e5 100644 --- a/rvc/lib/algorithm/discriminators.py +++ b/rvc/lib/algorithm/discriminators.py @@ -1,5 +1,6 @@ import torch from torch.nn.utils.parametrizations import spectral_norm, weight_norm +import torch.utils.checkpoint as checkpoint from rvc.lib.algorithm.commons import get_padding from rvc.lib.algorithm.residuals import LRELU_SLOPE @@ -20,14 +21,15 @@ class MultiPeriodDiscriminator(torch.nn.Module): Defaults to False. """ - def __init__(self, version: str, use_spectral_norm: bool = False): + def __init__(self, version: str, use_spectral_norm: bool = False, checkpointing: bool = False): super(MultiPeriodDiscriminator, self).__init__() periods = ( [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37] ) + self.checkpointing = checkpointing self.discriminators = torch.nn.ModuleList( - [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods] + [DiscriminatorS(use_spectral_norm=use_spectral_norm, checkpointing=checkpointing)] + + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm, checkpointing=checkpointing) for p in periods] ) def forward(self, y, y_hat): @@ -40,8 +42,15 @@ def forward(self, y, y_hat): """ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] for d in self.discriminators: - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) + if self.training and self.checkpointing: + def forward_discriminator(d, y, y_hat): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + return y_d_r, fmap_r, y_d_g, fmap_g + y_d_r, fmap_r, y_d_g, fmap_g = checkpoint.checkpoint(forward_discriminator, d, y, y_hat, use_reentrant=False) + else: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) y_d_rs.append(y_d_r) y_d_gs.append(y_d_g) fmap_rs.append(fmap_r) @@ -59,8 +68,9 @@ class DiscriminatorS(torch.nn.Module): convolutional layers that are applied to the input signal. 
""" - def __init__(self, use_spectral_norm: bool = False): + def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False): super(DiscriminatorS, self).__init__() + self.checkpointing = checkpointing norm_f = spectral_norm if use_spectral_norm else weight_norm self.convs = torch.nn.ModuleList( [ @@ -84,7 +94,11 @@ def forward(self, x): """ fmap = [] for conv in self.convs: - x = self.lrelu(conv(x)) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(conv, x, use_reentrant = False) + x = checkpoint.checkpoint(self.lrelu, x, use_reentrant = False) + else: + x = self.lrelu(conv(x)) fmap.append(x) x = self.conv_post(x) fmap.append(x) @@ -114,8 +128,10 @@ def __init__( kernel_size: int = 5, stride: int = 3, use_spectral_norm: bool = False, + checkpointing: bool = False, ): super(DiscriminatorP, self).__init__() + self.checkpointing = checkpointing self.period = period norm_f = spectral_norm if use_spectral_norm else weight_norm @@ -155,7 +171,11 @@ def forward(self, x): x = x.view(b, c, -1, self.period) for conv in self.convs: - x = self.lrelu(conv(x)) + if self.training and self.checkpointing: + x = checkpoint.checkpoint(conv, x, use_reentrant = False) + x = checkpoint.checkpoint(self.lrelu, x, use_reentrant = False) + else: + x = self.lrelu(conv(x)) fmap.append(x) x = self.conv_post(x) diff --git a/rvc/train/train.py b/rvc/train/train.py index 3fa670617..cbdf56b4f 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -391,7 +391,7 @@ def run( checkpointing=checkpointing, ).to(device) - net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm).to(device) + net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm, checkpointing=checkpointing).to(device) optim_g = torch.optim.AdamW( net_g.parameters(), From c6d608d575dc44bfa29c3e1473949fc5a868bdd9 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 13:49:48 +0100 Subject: [PATCH 31/46] add a checkbox to enable experimental options --- assets/i18n/languages/en_US.json | 5 +- tabs/train/train.py | 103 ++++++++++++++++++++----------- 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 57a5e37c1..b013b791a 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -325,5 +325,8 @@ "Vocoder": "Vocoder", "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).", "Checkpointing": "Checkpointing", - "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. 
It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.", + "Enable Experimental Options": "Enable Experimental Options", + "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.", + "Model Settings": "Model Settings" } \ No newline at end of file diff --git a/tabs/train/train.py b/tabs/train/train.py index 33e41742a..c135a73ef 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -302,37 +302,52 @@ def upload_file(file_path): # Train Tab def train_tab(): - with gr.Row(): - model_name = gr.Dropdown( - label=i18n("Model Name"), - info=i18n("Name of the new model."), - choices=get_models_list(), - value="my-project", - interactive=True, - allow_custom_value=True, - ) - sampling_rate = gr.Radio( - label=i18n("Sampling Rate"), - info=i18n("The sampling rate of the audio files."), - choices=["32000", "40000", "44100", "48000"], - value="40000", - interactive=True, - ) - vocoder = gr.Radio( - label=i18n("Vocoder"), - info=i18n("Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)."), - choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], - value="HiFi-GAN", - interactive=True, - ) - rvc_version = gr.Radio( - label=i18n("Model Architecture"), - info=i18n("Version of the model architecture."), - choices=["v1", "v2"], - value="v2", - interactive=True, - visible=False, - ) + with gr.Accordion(i18n("Model Settings")): + with gr.Row(): + with gr.Column(): + model_name = gr.Dropdown( + label=i18n("Model Name"), + info=i18n("Name of the new model."), + choices=get_models_list(), + value="my-project", + interactive=True, + allow_custom_value=True, + ) + experimental_options = gr.Checkbox( + label=i18n("Enable Experimental Options"), + info=i18n( + "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models." + ), + value=False, + ) + + with gr.Column(): + sampling_rate = gr.Radio( + label=i18n("Sampling Rate"), + info=i18n("The sampling rate of the audio files."), + choices=["32000", "40000", "48000"], + value="40000", + interactive=True, + ) + vocoder = gr.Radio( + label=i18n("Vocoder"), + info=i18n( + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." + ), + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], + value="HiFi-GAN", + interactive=True, + visible=False, + ) + rvc_version = gr.Radio( + label=i18n("Model Architecture"), + info=i18n("Version of the model architecture."), + choices=["v1", "v2"], + value="v2", + interactive=True, + visible=False, + ) + with gr.Accordion(i18n("Preprocess")): dataset_path = gr.Dropdown( label=i18n("Dataset Path"), @@ -638,7 +653,9 @@ def train_tab(): ) checkpointing = gr.Checkbox( label=i18n("Checkpointing"), - info=i18n("Enables memory-efficient training. 
This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate."), + info=i18n( + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." + ), value=False, interactive=True, ) @@ -794,7 +811,7 @@ def enforce_terms(terms_accepted, *args): g_pretrained_path, d_pretrained_path, vocoder, - checkpointing + checkpointing, ], outputs=[train_output_info], ) @@ -855,7 +872,7 @@ def enforce_terms(terms_accepted, *args): with gr.Column(): refresh_export = gr.Button(i18n("Refresh")) if not os.name == "nt": - upload_exported = gr.Button(i18n("Upload"), variant="primary") + upload_exported = gr.Button(i18n("Upload")) upload_exported.click( fn=upload_to_google_drive, inputs=[pth_dropdown_export, index_dropdown_export], @@ -954,6 +971,17 @@ def toggle_visible_embedder_custom(embedder_model): return {"visible": True, "__type__": "update"} return {"visible": False, "__type__": "update"} + def toggle_experimental(enabled): + if enabled: + return { + "choices": ["32000", "40000", "44100", "48000"], + "__type__": "update", + }, {"visible": True, "__type__": "update"} + return {"choices": ["32000", "40000", "48000"], "__type__": "update"}, { + "visible": False, + "__type__": "update", + } + def update_slider_visibility(noise_reduction): return gr.update(visible=noise_reduction) @@ -967,6 +995,11 @@ def update_slider_visibility(noise_reduction): inputs=[rvc_version, pitch_guidance], outputs=[], ) + experimental_options.change( + fn=toggle_experimental, + inputs=[experimental_options], + outputs=[sampling_rate, vocoder], + ) pitch_guidance.change( fn=download_prerequisites, inputs=[rvc_version, pitch_guidance], @@ -987,13 +1020,11 @@ def update_slider_visibility(noise_reduction): inputs=[upload_audio_dataset, dataset_name], outputs=[upload_audio_dataset, dataset_path], ) - f0_method.change( fn=toggle_visible_hop_length, inputs=[f0_method], outputs=[hop_length], ) - embedder_model.change( fn=toggle_visible_embedder_custom, inputs=[embedder_model], From 79ebd905527e0adfe8d67d9374fc4d12c8050f32 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 08:04:36 -0500 Subject: [PATCH 32/46] added new slicing method, added inlclude mutes option, removed pitch guidance from training --- assets/i18n/languages/en_US.json | 8 +++ core.py | 53 ++++++++++----- rvc/train/extract/extract.py | 3 +- rvc/train/extract/preparing_files.py | 30 ++++----- rvc/train/preprocess/preprocess.py | 89 ++++++++++++++++---------- rvc/train/train.py | 35 ++++------ tabs/train/train.py | 96 +++++++++++++--------------- 7 files changed, 175 insertions(+), 139 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index b013b791a..774a4d752 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -329,4 +329,12 @@ "Enable Experimental Options": "Enable Experimental Options", "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. 
These may cause errors and lack pretrained models.", "Model Settings": "Model Settings" + "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.": "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.", + "Chunk length (sec)": "Chunk length (sec)", + "Length of the audio slice for 'Simple' method.": "Length of the audio slice for 'Simple' method.", + "Overlap length (sec)": "Overlap length (sec)", + "Length of the overlap between slices for 'Simple' method.": "Length of the overlap between slices for 'Simple' method.", + "Silent training files": "Silent training files", + "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence.": "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence." + } \ No newline at end of file diff --git a/core.py b/core.py index c9b34e5bc..9f80c08ce 100644 --- a/core.py +++ b/core.py @@ -421,10 +421,12 @@ def run_preprocess_script( dataset_path: str, sample_rate: int, cpu_cores: int, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, clean_strength: float, + chunk_len: float, + overlap_len: float, ): config = get_config() per = 3.0 if config.is_half else 3.7 @@ -444,6 +446,8 @@ def run_preprocess_script( process_effects, noise_reduction, clean_strength, + chunk_len, + overlap_len, ], ), ] @@ -462,6 +466,7 @@ def run_extract_script( sample_rate: int, embedder_model: str, embedder_model_custom: str = None, + include_mutes: int = 2, ): model_path = os.path.join(logs_path, model_name) @@ -482,6 +487,7 @@ def run_extract_script( sample_rate, embedder_model, embedder_model_custom, + include_mutes ], ), ] @@ -502,7 +508,6 @@ def run_train_script( sample_rate: int, batch_size: int, gpu: int, - pitch_guidance: bool, overtraining_detector: bool, overtraining_threshold: int, pretrained: bool, @@ -520,7 +525,7 @@ def run_train_script( from rvc.lib.tools.pretrained_selector import pretrained_selector if custom_pretrained == False: - pg, pd = pretrained_selector(str(rvc_version), str(vocoder), bool(pitch_guidance), int(sample_rate)) + pg, pd = pretrained_selector(str(rvc_version), str(vocoder), True, int(sample_rate)) else: if g_pretrained_path is None or d_pretrained_path is None: raise ValueError( @@ -546,7 +551,6 @@ def run_train_script( gpu, batch_size, sample_rate, - pitch_guidance, save_only_latest, save_every_weights, cache_data_in_gpu, @@ -1853,11 +1857,11 @@ def parse_arguments(): ) preprocess_parser.add_argument( "--cut_preprocess", - type=lambda x: bool(strtobool(x)), - choices=[True, False], + type=str, + choices=['Skip', 'Simple', 'Automatic'], help="Cut the dataset into smaller segments for faster preprocessing.", - default=True, - required=False, + default='Automatic', + required=True, ) preprocess_parser.add_argument( "--process_effects", @@ -1883,6 +1887,22 @@ def parse_arguments(): default=0.7, required=False, ) + preprocess_parser.add_argument( + "--chunk_len", + type=float, + help="Chunk length.", + choices=[(i / 10) 
for i in range(3, 6)], + default=3.0, + required=False, + ) + preprocess_parser.add_argument( + "--overlap_len", + type=float, + help="Overlap length.", + choices=[0.0, 0.1, 0.2, 0.3, 0.4], + default=0.3, + required=False, + ) # Parser for 'extract' mode extract_parser = subparsers.add_parser( @@ -1955,6 +1975,14 @@ def parse_arguments(): help=embedder_model_custom_description, default=None, ) + extract_parser.add_argument( + "--include_mutes", + type=int, + help="Number of silent files to include.", + choices=range(0, 11), + default=2, + required=True + ) # Parser for 'train' mode train_parser = subparsers.add_parser("train", help="Train an RVC model.") @@ -2030,13 +2058,6 @@ def parse_arguments(): help="GPU device to use for training (e.g., '0').", default="0", ) - train_parser.add_argument( - "--pitch_guidance", - type=lambda x: bool(strtobool(x)), - choices=[True, False], - help="Enable or disable pitch guidance during training.", - default=True, - ) train_parser.add_argument( "--pretrained", type=lambda x: bool(strtobool(x)), @@ -2459,6 +2480,7 @@ def main(): sample_rate=args.sample_rate, embedder_model=args.embedder_model, embedder_model_custom=args.embedder_model_custom, + include_mutes=args.include_mutes, ) elif args.mode == "train": run_train_script( @@ -2471,7 +2493,6 @@ def main(): sample_rate=args.sample_rate, batch_size=args.batch_size, gpu=args.gpu, - pitch_guidance=args.pitch_guidance, overtraining_detector=args.overtraining_detector, overtraining_threshold=args.overtraining_threshold, pretrained=args.pretrained, diff --git a/rvc/train/extract/extract.py b/rvc/train/extract/extract.py index 84f80cd0b..288547a93 100644 --- a/rvc/train/extract/extract.py +++ b/rvc/train/extract/extract.py @@ -250,6 +250,7 @@ def run_embedding_extraction( sample_rate = sys.argv[7] embedder_model = sys.argv[8] embedder_model_custom = sys.argv[9] if len(sys.argv) > 9 else None + include_mutes = int(sys.argv[10]) if len(sys.argv) > 10 else 2 # prep wav_path = os.path.join(exp_dir, "sliced_audios_16k") @@ -299,4 +300,4 @@ def run_embedding_extraction( # Run Preparing Files generate_config(version, sample_rate, exp_dir) - generate_filelist(exp_dir, version, sample_rate) + generate_filelist(exp_dir, version, sample_rate, include_mutes) diff --git a/rvc/train/extract/preparing_files.py b/rvc/train/extract/preparing_files.py index e0c6e2f6b..b90692907 100644 --- a/rvc/train/extract/preparing_files.py +++ b/rvc/train/extract/preparing_files.py @@ -15,7 +15,7 @@ def generate_config(rvc_version: str, sample_rate: int, model_path: str): shutil.copyfile(config_path, config_save_path) -def generate_filelist(model_path: str, rvc_version: str, sample_rate: int): +def generate_filelist(model_path: str, rvc_version: str, sample_rate: int, include_mutes: int = 2): gt_wavs_dir = os.path.join(model_path, "sliced_audios") feature_dir = os.path.join(model_path, f"{rvc_version}_extracted") @@ -41,23 +41,21 @@ def generate_filelist(model_path: str, rvc_version: str, sample_rate: int): f"{gt_wavs_dir}/{name}.wav|{feature_dir}/{name}.npy|{f0_dir}/{name}.wav.npy|{f0nsf_dir}/{name}.wav.npy|{sid}" ) - mute_audio_path = os.path.join( - mute_base_path, "sliced_audios", f"mute{sample_rate}.wav" - ) - mute_feature_path = os.path.join( - mute_base_path, f"{rvc_version}_extracted", "mute.npy" - ) - mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy") - mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy") - - # always adding two files - for sid in sids: - options.append( - 
f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + if include_mutes > 0: + mute_audio_path = os.path.join( + mute_base_path, "sliced_audios", f"mute{sample_rate}.wav" ) - options.append( - f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + mute_feature_path = os.path.join( + mute_base_path, f"{rvc_version}_extracted", "mute.npy" ) + mute_f0_path = os.path.join(mute_base_path, "f0", "mute.wav.npy") + mute_f0nsf_path = os.path.join(mute_base_path, "f0_voiced", "mute.wav.npy") + + # adding x files per sid + for sid in sids * include_mutes: + options.append( + f"{mute_audio_path}|{mute_feature_path}|{mute_f0_path}|{mute_f0nsf_path}|{sid}" + ) file_path = os.path.join(model_path, "model_info.json") if os.path.exists(file_path): diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py index c9c865491..b117a91ad 100644 --- a/rvc/train/preprocess/preprocess.py +++ b/rvc/train/preprocess/preprocess.py @@ -25,13 +25,13 @@ logging.getLogger("numba.core.ssa").setLevel(logging.WARNING) logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING) +# Constants OVERLAP = 0.3 MAX_AMPLITUDE = 0.9 ALPHA = 0.75 HIGH_PASS_CUTOFF = 48 SAMPLE_RATE_16K = 16000 - class PreProcess: def __init__(self, sr: int, exp_dir: str, per: float): self.slicer = Slicer( @@ -84,67 +84,78 @@ def process_audio_segment( audio_16k.astype(np.float32), ) + def simple_cut(self, audio: np.ndarray, sid: int, idx0: int, chunk_len: float, overlap_len: float): + chunk_length = int(self.sr * chunk_len) + overlap_length = int(self.sr * overlap_len) + i = 0 + while i < len(audio): + chunk = audio[i:i + chunk_length] + if len(chunk) == chunk_length: + # full SR for training + wavfile.write( + os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"), + self.sr, + chunk.astype(np.float32), + ) + # 16KHz for feature extraction + chunk_16k = librosa.resample(chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K) + wavfile.write( + os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"), + SAMPLE_RATE_16K, + chunk_16k.astype(np.float32), + ) + i += chunk_length - overlap_length + def process_audio( self, path: str, idx0: int, sid: int, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, reduction_strength: float, + chunk_len: float, + overlap_len: float, ): audio_length = 0 try: audio = load_audio(path, self.sr) audio_length = librosa.get_duration(y=audio, sr=self.sr) + if process_effects: audio = signal.lfilter(self.b_high, self.a_high, audio) audio = self._normalize_audio(audio) if noise_reduction: - audio = nr.reduce_noise( - y=audio, sr=self.sr, prop_decrease=reduction_strength - ) - idx1 = 0 - if cut_preprocess: + audio = nr.reduce_noise(y=audio, sr=self.sr, prop_decrease=reduction_strength) + if cut_preprocess == "Skip": + # no cutting + self.process_audio_segment(audio, sid, idx0, 0,) + elif cut_preprocess == "Simple": + # simple + self.simple_cut(audio, sid, idx0, chunk_len, overlap_len) + elif cut_preprocess == "Automatic": + idx1 = 0 + # legacy for audio_segment in self.slicer.slice(audio): i = 0 while True: start = int(self.sr * (self.per - OVERLAP) * i) i += 1 if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr: - tmp_audio = audio_segment[ - start : start + int(self.per * self.sr) - ] - self.process_audio_segment( - tmp_audio, - sid, - idx0, - idx1, - ) + tmp_audio = audio_segment[start : start + int(self.per * self.sr)] + 
self.process_audio_segment(tmp_audio, sid, idx0, idx1, ) idx1 += 1 else: tmp_audio = audio_segment[start:] - self.process_audio_segment( - tmp_audio, - sid, - idx0, - idx1, - ) + self.process_audio_segment(tmp_audio, sid, idx0, idx1,) idx1 += 1 break - else: - self.process_audio_segment( - audio, - sid, - idx0, - idx1, - ) + except Exception as error: print(f"Error processing audio: {error}") return audio_length - def format_duration(seconds): hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) @@ -171,7 +182,7 @@ def save_dataset_duration(file_path, dataset_duration): def process_audio_wrapper(args): - pp, file, cut_preprocess, process_effects, noise_reduction, reduction_strength = ( + pp, file, cut_preprocess, process_effects, noise_reduction, reduction_strength, chunk_len, overlap_len = ( args ) file_path, idx0, sid = file @@ -183,19 +194,22 @@ def process_audio_wrapper(args): process_effects, noise_reduction, reduction_strength, + chunk_len, + overlap_len, ) - def preprocess_training_set( input_root: str, sr: int, num_processes: int, exp_dir: str, per: float, - cut_preprocess: bool, + cut_preprocess: str, process_effects: bool, noise_reduction: bool, reduction_strength: float, + chunk_len: float, + overlap_len: float, ): start_time = time.time() pp = PreProcess(sr, exp_dir, per) @@ -232,6 +246,8 @@ def preprocess_training_set( process_effects, noise_reduction, reduction_strength, + chunk_len, + overlap_len, ), ) for file in files @@ -249,7 +265,6 @@ def preprocess_training_set( f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio." ) - if __name__ == "__main__": experiment_directory = str(sys.argv[1]) input_root = str(sys.argv[2]) @@ -260,10 +275,12 @@ def preprocess_training_set( num_processes = multiprocessing.cpu_count() else: num_processes = int(num_processes) - cut_preprocess = strtobool(sys.argv[6]) + cut_preprocess = str(sys.argv[6]) process_effects = strtobool(sys.argv[7]) noise_reduction = strtobool(sys.argv[8]) reduction_strength = float(sys.argv[9]) + chunk_len = float(sys.argv[10]) + overlap_len = float(sys.argv[11]) preprocess_training_set( input_root, @@ -275,4 +292,6 @@ def preprocess_training_set( process_effects, noise_reduction, reduction_strength, + chunk_len, + overlap_len, ) diff --git a/rvc/train/train.py b/rvc/train/train.py index 9cf19e4b4..8cc7b172c 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -68,15 +68,14 @@ gpus = sys.argv[7] batch_size = int(sys.argv[8]) sample_rate = int(sys.argv[9]) -pitch_guidance = strtobool(sys.argv[10]) -save_only_latest = strtobool(sys.argv[11]) -save_every_weights = strtobool(sys.argv[12]) -cache_data_in_gpu = strtobool(sys.argv[13]) -overtraining_detector = strtobool(sys.argv[14]) -overtraining_threshold = int(sys.argv[15]) -cleanup = strtobool(sys.argv[16]) -vocoder = sys.argv[17] -checkpointing = strtobool(sys.argv[18]) +save_only_latest = strtobool(sys.argv[10]) +save_every_weights = strtobool(sys.argv[11]) +cache_data_in_gpu = strtobool(sys.argv[12]) +overtraining_detector = strtobool(sys.argv[13]) +overtraining_threshold = int(sys.argv[14]) +cleanup = strtobool(sys.argv[15]) +vocoder = sys.argv[16] +checkpointing = strtobool(sys.argv[17]) current_dir = os.getcwd() experiment_dir = os.path.join(current_dir, "logs", model_name) @@ -216,7 +215,6 @@ def start(): experiment_dir, pretrainG, pretrainD, - pitch_guidance, total_epoch, save_every_weights, config, @@ -303,7 +301,6 @@ def run( experiment_dir, pretrainG, pretrainD, - pitch_guidance, 
custom_total_epoch, custom_save_every_weights, config, @@ -318,7 +315,6 @@ def run( experiment_dir (str): The directory where experiment logs and checkpoints will be saved. pretrainG (str): Path to the pre-trained generator model. pretrainD (str): Path to the pre-trained discriminator model. - pitch_guidance (bool): Flag indicating whether to use pitch guidance during training. custom_total_epoch (int): The total number of epochs for training. custom_save_every_weights (int): The interval (in epochs) at which to save model weights. config (object): Configuration object containing training parameters. @@ -383,7 +379,7 @@ def run( config.data.filter_length // 2 + 1, config.train.segment_size // config.data.hop_length, **config.model, - use_f0=pitch_guidance == True, # converting 1/0 to True/False + use_f0=True, is_half=config.train.fp16_run and device.type == "cuda", sr=sample_rate, vocoder=vocoder, @@ -485,8 +481,8 @@ def run( reference = ( phone, phone_lengths, - pitch if pitch_guidance else None, - pitchf if pitch_guidance else None, + pitch, + pitchf, sid, ) else: @@ -495,8 +491,8 @@ def run( reference = ( phone.to(device), phone_lengths.to(device), - pitch.to(device) if pitch_guidance else None, - pitchf.to(device) if pitch_guidance else None, + pitch.to(device), + pitchf.to(device), sid.to(device), ) break @@ -608,8 +604,6 @@ def train_and_evaluate( wave_lengths, sid, ) = info - pitch = pitch if pitch_guidance else None - pitchf = pitchf if pitch_guidance else None # Forward pass use_amp = config.train.fp16_run and device.type == "cuda" @@ -928,8 +922,7 @@ def train_and_evaluate( extract_model( ckpt=ckpt, sr=sample_rate, - pitch_guidance=pitch_guidance - == True, # converting 1/0 to True/False, + pitch_guidance=True, name=model_name, model_dir=m, epoch=epoch, diff --git a/tabs/train/train.py b/tabs/train/train.py index c135a73ef..fe25a63fd 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -391,14 +391,32 @@ def train_tab(): interactive=True, ) with gr.Row(): - cut_preprocess = gr.Checkbox( + cut_preprocess = gr.Radio( label=i18n("Audio cutting"), info=i18n( - "It's recommended to deactivate this option if your dataset has already been processed." + "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it." ), - value=True, + choices=["Skip", "Simple", "Automatic"], + value="Automatic", + interactive=True, + ) + chunk_len = gr.Slider( + 0.5, + 5.0, + 3.0, + step=0.1, + label=i18n("Chunk length (sec)"), + info=i18n("Length of the audio slice for 'Simple' method."), + interactive=True, + ) + overlap_len = gr.Slider( + 0.0, + 0.4, + 0.3, + step=0.1, + label=i18n("Overlap length (sec)"), + info=i18n("Length of the overlap between slices for 'Simple' method."), interactive=True, - visible=True, ) process_effects = gr.Checkbox( label=i18n("Process effects"), @@ -451,6 +469,8 @@ def train_tab(): process_effects, noise_reduction, clean_strength, + chunk_len, + overlap_len, ], outputs=[preprocess_output_info], ) @@ -480,7 +500,18 @@ def train_tab(): value="contentvec", interactive=True, ) - + include_mutes = gr.Slider( + 0, + 10, + 2, + step=1, + label=i18n("Silent training files"), + info=i18n( + "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence." 
+ ), + value=True, + interactive=True, + ) hop_length = gr.Slider( 1, 512, @@ -570,6 +601,7 @@ def train_tab(): sampling_rate, embedder_model, embedder_model_custom, + include_mutes ], outputs=[extract_output_info], ) @@ -653,20 +685,10 @@ def train_tab(): ) checkpointing = gr.Checkbox( label=i18n("Checkpointing"), - info=i18n( - "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." - ), + info=i18n("Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate."), value=False, interactive=True, ) - pitch_guidance = gr.Checkbox( - label=i18n("Pitch Guidance"), - info=i18n( - "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential." - ), - value=True, - interactive=True, - ) with gr.Column(): custom_pretrained = gr.Checkbox( label=i18n("Custom Pretrained"), @@ -800,7 +822,6 @@ def enforce_terms(terms_accepted, *args): sampling_rate, batch_size, gpu, - pitch_guidance, overtraining_detector, overtraining_threshold, pretrained, @@ -811,7 +832,7 @@ def enforce_terms(terms_accepted, *args): g_pretrained_path, d_pretrained_path, vocoder, - checkpointing, + checkpointing ], outputs=[train_output_info], ) @@ -911,13 +932,12 @@ def disable_stop_train_button(): "__type__": "update", } - def download_prerequisites(version, pitch_guidance): + def download_prerequisites(version): if version == "v1": - if pitch_guidance: - gr.Info( + gr.Info( "Checking for v1 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." ) - run_prerequisites_script( + run_prerequisites_script( pretraineds_v1_f0=True, pretraineds_v1_nof0=False, pretraineds_v2_f0=False, @@ -925,24 +945,11 @@ def download_prerequisites(version, pitch_guidance): models=False, exe=False, ) - else: - gr.Info( - "Checking for v1 prerequisites without pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." - ) - run_prerequisites_script( - pretraineds_v1_f0=False, - pretraineds_v1_nof0=True, - pretraineds_v2_f0=False, - pretraineds_v2_nof0=False, - models=False, - exe=False, - ) elif version == "v2": - if pitch_guidance: - gr.Info( + gr.Info( "Checking for v2 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." ) - run_prerequisites_script( + run_prerequisites_script( pretraineds_v1_f0=False, pretraineds_v1_nof0=False, pretraineds_v2_f0=True, @@ -950,18 +957,7 @@ def download_prerequisites(version, pitch_guidance): models=False, exe=False, ) - else: - gr.Info( - "Checking for v2 prerequisites without pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." - ) - run_prerequisites_script( - pretraineds_v1_f0=False, - pretraineds_v1_nof0=False, - pretraineds_v2_f0=False, - pretraineds_v2_nof0=True, - models=False, - exe=False, - ) + gr.Info( "Prerequisites check complete. Missing files were downloaded, and you may now start preprocessing." 
) @@ -992,7 +988,7 @@ def update_slider_visibility(noise_reduction): ) rvc_version.change( fn=download_prerequisites, - inputs=[rvc_version, pitch_guidance], + inputs=[rvc_version], outputs=[], ) experimental_options.change( From 2d8d747a9ad92486fbb1a7ab003739a1c5d63082 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 08:16:27 -0500 Subject: [PATCH 33/46] added missing comma --- assets/i18n/languages/en_US.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 774a4d752..85e7efad8 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -328,7 +328,7 @@ "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.", "Enable Experimental Options": "Enable Experimental Options", "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.": "Enable extra features like 44100 sample rate and vocoder selection. These may cause errors and lack pretrained models.", - "Model Settings": "Model Settings" + "Model Settings": "Model Settings", "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.": "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it.", "Chunk length (sec)": "Chunk length (sec)", "Length of the audio slice for 'Simple' method.": "Length of the audio slice for 'Simple' method.", @@ -336,5 +336,4 @@ "Length of the overlap between slices for 'Simple' method.": "Length of the overlap between slices for 'Simple' method.", "Silent training files": "Silent training files", "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence.": "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence." 
- } \ No newline at end of file From 7fefd651e11a784b26a557e29d2f6ebd2a88622c Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 08:20:10 -0500 Subject: [PATCH 34/46] removed leftovers from the merge --- tabs/train/train.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tabs/train/train.py b/tabs/train/train.py index fe25a63fd..0da383a80 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -996,11 +996,6 @@ def update_slider_visibility(noise_reduction): inputs=[experimental_options], outputs=[sampling_rate, vocoder], ) - pitch_guidance.change( - fn=download_prerequisites, - inputs=[rvc_version, pitch_guidance], - outputs=[], - ) refresh.click( fn=refresh_models_and_datasets, inputs=[], From 1aea96dbf8b975d590410a263a90d5b1b0ce9446 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 09:06:18 -0500 Subject: [PATCH 35/46] CLI fixes for new options --- core.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/core.py b/core.py index 9f80c08ce..c50d408ab 100644 --- a/core.py +++ b/core.py @@ -1891,7 +1891,7 @@ def parse_arguments(): "--chunk_len", type=float, help="Chunk length.", - choices=[(i / 10) for i in range(3, 6)], + choices=[i * 0.5 for i in range(1, 11)], default=3.0, required=False, ) @@ -2005,10 +2005,11 @@ def parse_arguments(): ) train_parser.add_argument( "--checkpointing", - type=str, - help="Enables memory-efficient training.", + type=lambda x: bool(strtobool(x)), choices=[True, False], - default="False", + help="Enables memory-efficient training.", + default=False, + required=False, ) train_parser.add_argument( "--save_every_epoch", @@ -2468,6 +2469,8 @@ def main(): process_effects=args.process_effects, noise_reduction=args.noise_reduction, clean_strength=args.noise_reduction_strength, + chunk_len=args.chunk_len, + overlap_len=args.overlap_len, ) elif args.mode == "extract": run_extract_script( From aaa68c31f4df20f787c8a2e05d32c7fd532df2c8 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 17:06:59 +0100 Subject: [PATCH 36/46] improve preprocess ui --- tabs/train/train.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tabs/train/train.py b/tabs/train/train.py index 0da383a80..a67d51789 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -390,8 +390,7 @@ def train_tab(): ), interactive=True, ) - with gr.Row(): - cut_preprocess = gr.Radio( + cut_preprocess = gr.Radio( label=i18n("Audio cutting"), info=i18n( "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it." 
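[Editor's note: a minimal sketch, not part of the patch, of the 'Simple' mode
selected by this radio. It mirrors simple_cut in rvc/train/preprocess/preprocess.py
from PATCH 32, assuming sr = 40000 and the slider defaults chunk_len = 3.0,
overlap_len = 0.3.]

    sr, chunk_len, overlap_len = 40000, 3.0, 0.3  # assumed defaults
    chunk_length = int(sr * chunk_len)            # 120000 samples per slice
    hop = chunk_length - int(sr * overlap_len)    # 108000 samples between starts
    # slice starts: 0, 108000, 216000, ...; neighbouring slices share 0.3 s of
    # audio, and only full-length chunks are written, so a shorter tail is discarded.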
@@ -400,6 +399,7 @@ def train_tab(): value="Automatic", interactive=True, ) + with gr.Row(): chunk_len = gr.Slider( 0.5, 5.0, @@ -418,6 +418,8 @@ def train_tab(): info=i18n("Length of the overlap between slices for 'Simple' method."), interactive=True, ) + + with gr.Row(): process_effects = gr.Checkbox( label=i18n("Process effects"), info=i18n( @@ -427,7 +429,6 @@ def train_tab(): interactive=True, visible=True, ) - with gr.Row(): noise_reduction = gr.Checkbox( label=i18n("Noise Reduction"), info=i18n( @@ -437,7 +438,7 @@ def train_tab(): interactive=True, visible=True, ) - clean_strength = gr.Slider( + clean_strength = gr.Slider( minimum=0, maximum=1, label=i18n("Noise Reduction Strength"), From e05bc0fc52cdc041dc6294de89d181910df0f181 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 22:10:48 +0100 Subject: [PATCH 37/46] improve pretrained selector --- rvc/lib/tools/pretrained_selector.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rvc/lib/tools/pretrained_selector.py b/rvc/lib/tools/pretrained_selector.py index d0aa78545..56c18d263 100644 --- a/rvc/lib/tools/pretrained_selector.py +++ b/rvc/lib/tools/pretrained_selector.py @@ -1,8 +1,8 @@ import os def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): - path = f"rvc/models/pretraineds/pretrained_{version}/" - f0 = "f0" if pitch_guidance == True else "" + base_path = os.path.join("rvc", "models", "pretraineds", f"pretrained_{version}") + f0 = "f0" if pitch_guidance else "" if vocoder == "HiFi-GAN": vocoder_path = "" @@ -10,11 +10,13 @@ def pretrained_selector(version, vocoder, pitch_guidance, sample_rate): vocoder_path = "HiFiGAN_" elif vocoder == "RefineGAN": vocoder_path = "RefineGAN_" + else: + vocoder_path = "" - path_g = f"{path}{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth" - path_d = f"{path}{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth" + path_g = os.path.join(base_path, f"{vocoder_path}{f0}G{str(sample_rate)[:2]}k.pth") + path_d = os.path.join(base_path, f"{vocoder_path}{f0}D{str(sample_rate)[:2]}k.pth") if os.path.exists(path_g) and os.path.exists(path_d): return path_g, path_d else: - return "", "" + return "", "" \ No newline at end of file From 7413800408642e6e992d3e70b8d65e80d8c03b4f Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 22:24:29 +0100 Subject: [PATCH 38/46] a single cpu/gpu selector for all training --- assets/i18n/languages/en_US.json | 3 +- tabs/train/train.py | 273 +++++++++++++------------------ 2 files changed, 119 insertions(+), 157 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 85e7efad8..8cb9af6f7 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -78,8 +78,7 @@ "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.": "By employing pitch guidance, it becomes feasible to mirror the intonation of the original voice, including its pitch. This feature is particularly valuable for singing and other scenarios where preserving the original melody or pitch pattern is essential.", "Utilize pretrained models when training your own. This approach reduces training duration and enhances overall quality.": "Utilize pretrained models when training your own. 
This approach reduces training duration and enhances overall quality.", "Extract Features": "Extract Features", - "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.", - "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.": "We prioritize running the model preprocessing on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank.", + "Configure GPU and CPU settings.": "Configure GPU and CPU settings.", "Cache Dataset in GPU": "Cache Dataset in GPU", "Cache the dataset in GPU memory to speed up the training process.": "Cache the dataset in GPU memory to speed up the training process.", "Index Algorithm": "Index Algorithm", diff --git a/tabs/train/train.py b/tabs/train/train.py index a67d51789..42a387945 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -302,6 +302,7 @@ def upload_file(file_path): # Train Tab def train_tab(): + # Model settings section with gr.Accordion(i18n("Model Settings")): with gr.Row(): with gr.Column(): @@ -323,22 +324,22 @@ def train_tab(): with gr.Column(): sampling_rate = gr.Radio( - label=i18n("Sampling Rate"), - info=i18n("The sampling rate of the audio files."), - choices=["32000", "40000", "48000"], - value="40000", - interactive=True, - ) + label=i18n("Sampling Rate"), + info=i18n("The sampling rate of the audio files."), + choices=["32000", "40000", "48000"], + value="40000", + interactive=True, + ) vocoder = gr.Radio( - label=i18n("Vocoder"), - info=i18n( - "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." - ), - choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], - value="HiFi-GAN", - interactive=True, - visible=False, - ) + label=i18n("Vocoder"), + info=i18n( + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." + ), + choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], + value="HiFi-GAN", + interactive=True, + visible=False, + ) rvc_version = gr.Radio( label=i18n("Model Architecture"), info=i18n("Version of the model architecture."), @@ -347,7 +348,41 @@ def train_tab(): interactive=True, visible=False, ) + with gr.Accordion( + i18n("Advanced Settings"), + open=False, + ): + with gr.Row(): + with gr.Column(): + cpu_cores = gr.Slider( + 1, + min(cpu_count(), 32), # max 32 parallel processes + min(cpu_count(), 32), + step=1, + label=i18n("CPU Cores"), + info=i18n( + "The number of CPU cores to use in the extraction process. The default setting are your cpu cores, which is recommended for most cases." + ), + interactive=True, + ) + with gr.Column(): + gpu = gr.Textbox( + label=i18n("GPU Number"), + info=i18n( + "Specify the number of GPUs you wish to utilize for extracting by entering them separated by hyphens (-)." 
+ ), + placeholder=i18n("0 to ∞ separated by -"), + value=str(get_number_of_gpus()), + interactive=True, + ) + gr.Textbox( + label=i18n("GPU Information"), + info=i18n("The GPU information will be displayed here."), + value=get_gpu_info(), + interactive=False, + ) + # Preprocess section with gr.Accordion(i18n("Preprocess")): dataset_path = gr.Dropdown( label=i18n("Dataset Path"), @@ -379,26 +414,15 @@ def train_tab(): refresh = gr.Button(i18n("Refresh")) with gr.Accordion(i18n("Advanced Settings"), open=False): - cpu_cores_preprocess = gr.Slider( - 1, - min(cpu_count(), 32), # max 32 parallel processes - min(cpu_count(), 32), - step=1, - label=i18n("CPU Cores"), + cut_preprocess = gr.Radio( + label=i18n("Audio cutting"), info=i18n( - "The number of CPU cores to use in the preprocess. The default setting are your cpu cores, which is recommended for most cases." + "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it." ), + choices=["Skip", "Simple", "Automatic"], + value="Automatic", interactive=True, ) - cut_preprocess = gr.Radio( - label=i18n("Audio cutting"), - info=i18n( - "Audio file slicing method: Select 'Skip' if the files are already pre-sliced, 'Simple' if excessive silence has already been removed from the files, or 'Automatic' for automatic silence detection and slicing around it." - ), - choices=["Skip", "Simple", "Automatic"], - value="Automatic", - interactive=True, - ) with gr.Row(): chunk_len = gr.Slider( 0.5, @@ -415,10 +439,12 @@ def train_tab(): 0.3, step=0.1, label=i18n("Overlap length (sec)"), - info=i18n("Length of the overlap between slices for 'Simple' method."), + info=i18n( + "Length of the overlap between slices for 'Simple' method." + ), interactive=True, ) - + with gr.Row(): process_effects = gr.Checkbox( label=i18n("Process effects"), @@ -439,16 +465,16 @@ def train_tab(): visible=True, ) clean_strength = gr.Slider( - minimum=0, - maximum=1, - label=i18n("Noise Reduction Strength"), - info=i18n( - "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed." - ), - visible=False, - value=0.5, - interactive=True, - ) + minimum=0, + maximum=1, + label=i18n("Noise Reduction Strength"), + info=i18n( + "Set the clean-up level to the audio you want, the more you increase it the more it will clean up, but it is possible that the audio will be more compressed." + ), + visible=False, + value=0.5, + interactive=True, + ) preprocess_output_info = gr.Textbox( label=i18n("Output Information"), info=i18n("The output information will be displayed here."), @@ -465,7 +491,7 @@ def train_tab(): model_name, dataset_path, sampling_rate, - cpu_cores_preprocess, + cpu_cores, cut_preprocess, process_effects, noise_reduction, @@ -476,6 +502,7 @@ def train_tab(): outputs=[preprocess_output_info], ) + # Extract section with gr.Accordion(i18n("Extract")): with gr.Row(): f0_method = gr.Radio( @@ -509,7 +536,7 @@ def train_tab(): label=i18n("Silent training files"), info=i18n( "Adding several silent files to the training set enables the model to handle pure silence in inferred audio files. Select 0 if your dataset is clean and already contains segments of pure silence." 
- ), + ), value=True, interactive=True, ) @@ -545,43 +572,6 @@ def train_tab(): ) move_files_button = gr.Button("Move files to custom embedder folder") - with gr.Accordion( - i18n( - "We prioritize running the model extraction on the GPU for faster performance. If you prefer to use the CPU, simply leave the GPU field blank." - ), - open=False, - ): - with gr.Row(): - with gr.Column(): - cpu_cores_extract = gr.Slider( - 1, - min(cpu_count(), 32), # max 32 parallel processes - min(cpu_count(), 32), - step=1, - label=i18n("CPU Cores"), - info=i18n( - "The number of CPU cores to use in the extraction process. The default setting are your cpu cores, which is recommended for most cases." - ), - interactive=True, - ) - - with gr.Column(): - gpu_extract = gr.Textbox( - label=i18n("GPU Number"), - info=i18n( - "Specify the number of GPUs you wish to utilize for extracting by entering them separated by hyphens (-)." - ), - placeholder=i18n("0 to ∞ separated by -"), - value=str(get_number_of_gpus()), - interactive=True, - ) - gr.Textbox( - label=i18n("GPU Information"), - info=i18n("The GPU information will be displayed here."), - value=get_gpu_info(), - interactive=False, - ) - extract_output_info = gr.Textbox( label=i18n("Output Information"), info=i18n("The output information will be displayed here."), @@ -597,16 +587,17 @@ def train_tab(): rvc_version, f0_method, hop_length, - cpu_cores_extract, - gpu_extract, + cpu_cores, + gpu, sampling_rate, embedder_model, embedder_model_custom, - include_mutes + include_mutes, ], outputs=[extract_output_info], ) + # Training section with gr.Accordion(i18n("Training")): with gr.Row(): batch_size = gr.Slider( @@ -686,19 +677,22 @@ def train_tab(): ) checkpointing = gr.Checkbox( label=i18n("Checkpointing"), - info=i18n("Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate."), + info=i18n( + "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate." + ), + value=False, + interactive=True, + ) + with gr.Row(): + with gr.Column(): + custom_pretrained = gr.Checkbox( + label=i18n("Custom Pretrained"), + info=i18n( + "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance." + ), value=False, interactive=True, ) - with gr.Column(): - custom_pretrained = gr.Checkbox( - label=i18n("Custom Pretrained"), - info=i18n( - "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance." - ), - value=False, - interactive=True, - ) with gr.Column(visible=False) as pretrained_custom_settings: with gr.Accordion(i18n("Pretrained Custom Settings")): upload_pretrained = gr.File( @@ -727,33 +721,6 @@ def train_tab(): interactive=True, allow_custom_value=True, ) - multiple_gpu = gr.Checkbox( - label=i18n("GPU Settings"), - info=( - i18n( - "Sets advanced GPU settings, recommended for users with better GPU architecture." 
- ) - ), - value=False, - interactive=True, - ) - with gr.Column(visible=False) as gpu_custom_settings: - with gr.Accordion(i18n("GPU Settings")): - gpu = gr.Textbox( - label=i18n("GPU Number"), - info=i18n( - "Specify the number of GPUs you wish to utilize for training by entering them separated by hyphens (-)." - ), - placeholder=i18n("0 to ∞ separated by -"), - value=str(get_number_of_gpus()), - interactive=True, - ) - gr.Textbox( - label=i18n("GPU Information"), - info=i18n("The GPU information will be displayed here."), - value=get_gpu_info(), - interactive=False, - ) overtraining_detector = gr.Checkbox( label=i18n("Overtraining Detector"), info=i18n( @@ -775,15 +742,15 @@ def train_tab(): ), interactive=True, ) - index_algorithm = gr.Radio( - label=i18n("Index Algorithm"), - info=i18n( - "KMeans is a clustering algorithm that divides the dataset into K clusters. This setting is particularly useful for large datasets." - ), - choices=["Auto", "Faiss", "KMeans"], - value="Auto", - interactive=True, - ) + index_algorithm = gr.Radio( + label=i18n("Index Algorithm"), + info=i18n( + "KMeans is a clustering algorithm that divides the dataset into K clusters. This setting is particularly useful for large datasets." + ), + choices=["Auto", "Faiss", "KMeans"], + value="Auto", + interactive=True, + ) def enforce_terms(terms_accepted, *args): if not terms_accepted: @@ -833,7 +800,7 @@ def enforce_terms(terms_accepted, *args): g_pretrained_path, d_pretrained_path, vocoder, - checkpointing + checkpointing, ], outputs=[train_output_info], ) @@ -852,6 +819,7 @@ def enforce_terms(terms_accepted, *args): outputs=[train_output_info], ) + # Export Model section with gr.Accordion(i18n("Export Model"), open=False): if not os.name == "nt": gr.Markdown( @@ -936,28 +904,28 @@ def disable_stop_train_button(): def download_prerequisites(version): if version == "v1": gr.Info( - "Checking for v1 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." - ) + "Checking for v1 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." + ) run_prerequisites_script( - pretraineds_v1_f0=True, - pretraineds_v1_nof0=False, - pretraineds_v2_f0=False, - pretraineds_v2_nof0=False, - models=False, - exe=False, - ) + pretraineds_v1_f0=True, + pretraineds_v1_nof0=False, + pretraineds_v2_f0=False, + pretraineds_v2_nof0=False, + models=False, + exe=False, + ) elif version == "v2": gr.Info( - "Checking for v2 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." - ) + "Checking for v2 prerequisites with pitch guidance... Missing files will be downloaded. If you already have them, this step will be skipped." + ) run_prerequisites_script( - pretraineds_v1_f0=False, - pretraineds_v1_nof0=False, - pretraineds_v2_f0=True, - pretraineds_v2_nof0=False, - models=False, - exe=False, - ) + pretraineds_v1_f0=False, + pretraineds_v1_nof0=False, + pretraineds_v2_f0=True, + pretraineds_v2_nof0=False, + models=False, + exe=False, + ) gr.Info( "Prerequisites check complete. Missing files were downloaded, and you may now start preprocessing." 
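The reindented `download_prerequisites` above keeps two nearly identical branches that differ only in which pretrained set they request. As an illustrative sketch only (not part of the patch, and assuming the module's existing `gr` and `run_prerequisites_script` imports), the same dispatch could be written table-style with the keywords already shown:

```python
def download_prerequisites(version):
    # Only the f0 pretrained set differs between the v1 and v2 branches.
    v1, v2 = (version == "v1"), (version == "v2")
    gr.Info(
        f"Checking for {version} prerequisites with pitch guidance... "
        "Missing files will be downloaded. If you already have them, this step will be skipped."
    )
    run_prerequisites_script(
        pretraineds_v1_f0=v1,
        pretraineds_v1_nof0=False,
        pretraineds_v2_f0=v2,
        pretraineds_v2_nof0=False,
        models=False,
        exe=False,
    )
    gr.Info(
        "Prerequisites check complete. Missing files were downloaded, and you may now start preprocessing."
    )
```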
@@ -1060,11 +1028,6 @@ def update_slider_visibility(noise_reduction): inputs=[overtraining_detector], outputs=[overtraining_settings], ) - multiple_gpu.change( - fn=toggle_visible, - inputs=[multiple_gpu], - outputs=[gpu_custom_settings], - ) train_button.click( fn=enable_stop_train_button, inputs=[], From babdc2f7fd9b24903d56e0a587b343aad21df03e Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 22:48:27 +0100 Subject: [PATCH 39/46] improve preprocess readability + soxr_vhq resample --- requirements.txt | 1 + rvc/train/preprocess/preprocess.py | 126 ++++++++++++++++++----------- 2 files changed, 82 insertions(+), 45 deletions(-) diff --git a/requirements.txt b/requirements.txt index 78cf0a35b..5167f3766 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ soundfile==0.12.1 noisereduce pedalboard stftpitchshift +soxr # Machine learning and deep learning omegaconf>=2.0.6; sys_platform == 'darwin' diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py index b117a91ad..71c5c899d 100644 --- a/rvc/train/preprocess/preprocess.py +++ b/rvc/train/preprocess/preprocess.py @@ -11,6 +11,7 @@ import librosa import multiprocessing import noisereduce as nr +import soxr now_directory = os.getcwd() sys.path.append(now_directory) @@ -18,22 +19,31 @@ from rvc.lib.utils import load_audio from rvc.train.preprocess.slicer import Slicer -# Remove colab logs import logging logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING) logging.getLogger("numba.core.ssa").setLevel(logging.WARNING) logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING) -# Constants OVERLAP = 0.3 MAX_AMPLITUDE = 0.9 ALPHA = 0.75 HIGH_PASS_CUTOFF = 48 SAMPLE_RATE_16K = 16000 +RES_TYPE = "soxr_vhq" + class PreProcess: def __init__(self, sr: int, exp_dir: str, per: float): + self.sr = sr + self.per = per + self.exp_dir = exp_dir + self.device = "cpu" + self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") + self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) + self.slicer = Slicer( sr=sr, threshold=-42, @@ -42,17 +52,9 @@ def __init__(self, sr: int, exp_dir: str, per: float): hop_size=15, max_sil_kept=500, ) - self.sr = sr self.b_high, self.a_high = signal.butter( N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr ) - self.per = per - self.exp_dir = exp_dir - self.device = "cpu" - self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") - self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") - os.makedirs(self.gt_wavs_dir, exist_ok=True) - os.makedirs(self.wavs16k_dir, exist_ok=True) def _normalize_audio(self, audio: np.ndarray): tmp_max = np.abs(audio).max() @@ -76,7 +78,10 @@ def process_audio_segment( normalized_audio.astype(np.float32), ) audio_16k = librosa.resample( - normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K + normalized_audio, + orig_sr=self.sr, + target_sr=SAMPLE_RATE_16K, + res_type=RES_TYPE, ) wavfile.write( os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"), @@ -84,27 +89,39 @@ def process_audio_segment( audio_16k.astype(np.float32), ) - def simple_cut(self, audio: np.ndarray, sid: int, idx0: int, chunk_len: float, overlap_len: float): - chunk_length = int(self.sr * chunk_len) - overlap_length = int(self.sr * overlap_len) - i = 0 - while i < len(audio): - chunk = audio[i:i + chunk_length] - if len(chunk) == chunk_length: + def simple_cut( + self, + audio: np.ndarray, + sid: int, + idx0: int, + chunk_len: 
float, + overlap_len: float, + ): + chunk_samples = int(self.sr * chunk_len) + overlap_samples = int(self.sr * overlap_len) + step = chunk_samples - overlap_samples + num_chunks = (len(audio) - chunk_samples) // step + 1 + for i in range(num_chunks): + start = i * step + end = start + chunk_samples + if end <= len(audio): + chunk = audio[start:end] + file_index = i # full SR for training wavfile.write( - os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"), + os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{file_index}.wav"), self.sr, chunk.astype(np.float32), ) # 16KHz for feature extraction - chunk_16k = librosa.resample(chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K) + chunk_16k = librosa.resample( + chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE + ) wavfile.write( - os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"), + os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{file_index}.wav"), SAMPLE_RATE_16K, chunk_16k.astype(np.float32), ) - i += chunk_length - overlap_length def process_audio( self, @@ -127,28 +144,38 @@ def process_audio( audio = signal.lfilter(self.b_high, self.a_high, audio) audio = self._normalize_audio(audio) if noise_reduction: - audio = nr.reduce_noise(y=audio, sr=self.sr, prop_decrease=reduction_strength) + audio = nr.reduce_noise( + y=audio, + sr=self.sr, + prop_decrease=reduction_strength, + n_fft=2048, + hop_length=512, + ) if cut_preprocess == "Skip": - # no cutting - self.process_audio_segment(audio, sid, idx0, 0,) + self.process_audio_segment(audio, sid, idx0, 0) elif cut_preprocess == "Simple": - # simple self.simple_cut(audio, sid, idx0, chunk_len, overlap_len) elif cut_preprocess == "Automatic": + segments = self.slicer.slice(audio) idx1 = 0 - # legacy - for audio_segment in self.slicer.slice(audio): - i = 0 - while True: - start = int(self.sr * (self.per - OVERLAP) * i) - i += 1 - if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr: - tmp_audio = audio_segment[start : start + int(self.per * self.sr)] - self.process_audio_segment(tmp_audio, sid, idx0, idx1, ) + for audio_segment in segments: + segment_length = len(audio_segment) + per_samples = int(self.sr * self.per) + overlap_samples_segment = int(self.sr * OVERLAP) + step = per_samples - overlap_samples_segment + + num_sub_segments = (segment_length - per_samples + step - 1) // step + + for i in range(num_sub_segments): + start = i * step + end = start + per_samples + if end <= segment_length: + tmp_audio = audio_segment[start:end] + self.process_audio_segment(tmp_audio, sid, idx0, idx1) idx1 += 1 - else: + elif start < segment_length: tmp_audio = audio_segment[start:] - self.process_audio_segment(tmp_audio, sid, idx0, idx1,) + self.process_audio_segment(tmp_audio, sid, idx0, idx1) idx1 += 1 break @@ -156,6 +183,7 @@ def process_audio( print(f"Error processing audio: {error}") return audio_length + def format_duration(seconds): hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) @@ -182,9 +210,16 @@ def save_dataset_duration(file_path, dataset_duration): def process_audio_wrapper(args): - pp, file, cut_preprocess, process_effects, noise_reduction, reduction_strength, chunk_len, overlap_len = ( - args - ) + ( + pp, + file, + cut_preprocess, + process_effects, + noise_reduction, + reduction_strength, + chunk_len, + overlap_len, + ) = args file_path, idx0, sid = file return pp.process_audio( file_path, @@ -198,6 +233,7 @@ def process_audio_wrapper(args): overlap_len, ) + def 
preprocess_training_set( input_root: str, sr: int, @@ -209,7 +245,7 @@ def preprocess_training_set( noise_reduction: bool, reduction_strength: float, chunk_len: float, - overlap_len: float, + overlap_len: float, ): start_time = time.time() pp = PreProcess(sr, exp_dir, per) @@ -230,7 +266,6 @@ def preprocess_training_set( f'Speaker ID folder is expected to be integer, got "{os.path.basename(root)}" instead.' ) - # print(f"Number of files: {len(files)}") audio_length = [] with tqdm(total=len(files)) as pbar: with concurrent.futures.ProcessPoolExecutor( @@ -256,15 +291,16 @@ def preprocess_training_set( audio_length.append(future.result()) pbar.update(1) - audio_length = sum(audio_length) + total_audio_length = sum(audio_length) save_dataset_duration( - os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length + os.path.join(exp_dir, "model_info.json"), dataset_duration=total_audio_length ) elapsed_time = time.time() - start_time print( - f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio." + f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(total_audio_length)} seconds of audio." ) + if __name__ == "__main__": experiment_directory = str(sys.argv[1]) input_root = str(sys.argv[2]) From 3947fc3d201853767d0077c32e0673e61a298807 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 22:55:34 +0100 Subject: [PATCH 40/46] not slower --- assets/i18n/languages/en_US.json | 2 +- tabs/train/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json index 8cb9af6f7..0b3367c3a 100644 --- a/assets/i18n/languages/en_US.json +++ b/assets/i18n/languages/en_US.json @@ -322,7 +322,7 @@ "The name that will appear in the model information.": "The name that will appear in the model information.", "Set name": "Set name", "Vocoder": "Vocoder", - "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance).", + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).": "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only).", "Checkpointing": "Checkpointing", "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.": "Enables memory-efficient training. This reduces VRAM usage at the cost of slower training speed. 
It is useful for GPUs with limited memory (e.g., <6GB VRAM) or when training with a batch size larger than what your GPU can normally accommodate.", "Enable Experimental Options": "Enable Experimental Options", diff --git a/tabs/train/train.py b/tabs/train/train.py index 42a387945..6cd6c12c5 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -333,7 +333,7 @@ def train_tab(): vocoder = gr.Radio( label=i18n("Vocoder"), info=i18n( - "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only, with slightly slower performance)." + "Vocoder for audio synthesis: HiFi-GAN (default, available for all clients), MRF HiFi-GAN (higher fidelity, Applio-only), or RefineGAN (offering superior audio quality, Applio-only)." ), choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"], value="HiFi-GAN", From 3393dadc7f2fe4c4b5a55cd586b265c9958ffd42 Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 17:29:26 -0500 Subject: [PATCH 41/46] FP16 is not suitable for inference: it overflows the TextEncoder and produces NaN values and complete silence in the output on some files and models. --- rvc/infer/infer.py | 6 ++---- rvc/infer/pipeline.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index 5425f64a3..d0c48b2f9 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -481,15 +481,13 @@ def setup_network(self): *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, - is_half=self.config.is_half, + is_half=False, vocoder=self.vocoder ) del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) self.net_g.eval().to(self.config.device) - self.net_g = ( - self.net_g.half() if self.config.is_half else self.net_g.float() - ) + self.net_g = self.net_g.float() def setup_vc_instance(self): """ diff --git a/rvc/infer/pipeline.py b/rvc/infer/pipeline.py index bf82afd4c..6e73efcbc 100644 --- a/rvc/infer/pipeline.py +++ b/rvc/infer/pipeline.py @@ -485,7 +485,7 @@ def voice_conversion( pitch, pitchf = None, None p_len = torch.tensor([p_len], device=self.device).long() audio1 = ( - (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + (net_g.infer(feats.float(), p_len, pitch, pitchf.float() if pitchf is not None else None, sid)[0][0, 0]) .data.cpu() .float() .numpy() From 69f62f539213038aa5c00e39c3d34cf1ab4f0b3e Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 23:32:25 +0100 Subject: [PATCH 42/46] simplify docstrings + small refactor on train --- rvc/lib/algorithm/discriminators.py | 19 --- rvc/lib/algorithm/encoders.py | 8 +- rvc/lib/algorithm/generators/hifigan.py | 62 ++++---- rvc/lib/algorithm/generators/hifigan_mrf.py | 167 ++++++++++++++---- rvc/lib/algorithm/generators/hifigan_nsf.py | 63 ++++---- rvc/lib/algorithm/generators/refinegan.py | 111 +++++++++---- rvc/lib/algorithm/modules.py | 11 +- rvc/lib/algorithm/normalization.py | 9 +- rvc/lib/algorithm/residuals.py | 36 +---- rvc/lib/algorithm/synthesizers.py | 15 -- rvc/train/mel_processing.py | 26 +-- rvc/train/train.py | 82 ++++++---- 12 files changed, 337 insertions(+), 272 deletions(-) diff --git a/rvc/lib/algorithm/discriminators.py b/rvc/lib/algorithm/discriminators.py index 0759494e5..c9ede6c27 100644 --- a/rvc/lib/algorithm/discriminators.py +++ b/rvc/lib/algorithm/discriminators.py @@ -33,13 +33,6 @@ def __init__(self, version: str, use_spectral_norm: bool = False,
checkpointing: ) def forward(self, y, y_hat): - """ - Forward pass of the multi-period discriminator. - - Args: - y (torch.Tensor): Real audio signal. - y_hat (torch.Tensor): Fake audio signal. - """ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] for d in self.discriminators: if self.training and self.checkpointing: @@ -86,12 +79,6 @@ def __init__(self, use_spectral_norm: bool = False, checkpointing: bool = False) self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): - """ - Forward pass of the discriminator. - - Args: - x (torch.Tensor): Input audio signal. - """ fmap = [] for conv in self.convs: if self.training and self.checkpointing: @@ -157,12 +144,6 @@ def __init__( self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE, inplace=True) def forward(self, x): - """ - Forward pass of the discriminator. - - Args: - x (torch.Tensor): Input audio signal. - """ fmap = [] b, c, t = x.shape if t % self.period != 0: diff --git a/rvc/lib/algorithm/encoders.py b/rvc/lib/algorithm/encoders.py index e742378ae..3f94a1478 100644 --- a/rvc/lib/algorithm/encoders.py +++ b/rvc/lib/algorithm/encoders.py @@ -85,7 +85,8 @@ def forward(self, x, x_mask): class TextEncoder(torch.nn.Module): - """Text Encoder with configurable embedding dimension. + """ + Text Encoder with configurable embedding dimension. Args: out_channels (int): Output channels of the encoder. @@ -152,7 +153,8 @@ def forward( class PosteriorEncoder(torch.nn.Module): - """Posterior Encoder for inferring latent representation. + """ + Posterior Encoder for inferring latent representation. Args: in_channels (int): Number of channels in the input. @@ -211,11 +213,9 @@ def forward( return z, m, logs, x_mask def remove_weight_norm(self): - """Removes weight normalization from the encoder.""" self.enc.remove_weight_norm() def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for hook in self.enc._forward_pre_hooks.values(): if ( hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" diff --git a/rvc/lib/algorithm/generators/hifigan.py b/rvc/lib/algorithm/generators/hifigan.py index 0d5c84310..4f1f6cab5 100644 --- a/rvc/lib/algorithm/generators/hifigan.py +++ b/rvc/lib/algorithm/generators/hifigan.py @@ -7,19 +7,22 @@ from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock from rvc.lib.algorithm.commons import init_weights - class HiFiGANGenerator(torch.nn.Module): - """Generator for synthesizing audio. + """ + HiFi-GAN Generator module for audio synthesis. + + This module implements the generator part of the HiFi-GAN architecture, + which uses transposed convolutions for upsampling and residual blocks for + refining the audio output. It can also incorporate global conditioning. Args: - initial_channel (int): Number of channels in the initial convolutional layer. - resblock (str): Type of residual block to use (1 or 2). - resblock_kernel_sizes (list): Kernel sizes of the residual blocks. - resblock_dilation_sizes (list): Dilation rates of the residual blocks. - upsample_rates (list): Upsampling rates. - upsample_initial_channel (int): Number of channels in the initial upsampling layer. - upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. - gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. 
+ resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int, optional): Number of input channels for the global conditioning. If 0, no global conditioning is used. Defaults to 0. """ def __init__( @@ -76,7 +79,7 @@ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): x = self.ups[i](x) xs = None for j in range(self.num_kernels): - if xs == None: + if xs is None: xs = self.resblocks[i * self.num_kernels + j](x) else: xs += self.resblocks[i * self.num_kernels + j](x) @@ -89,7 +92,6 @@ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): return x def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for l in self.ups_and_resblocks: for hook in l._forward_pre_hooks.values(): if ( @@ -100,23 +102,24 @@ def __prepare_scriptable__(self): return self def remove_weight_norm(self): - """Removes weight normalization from the upsampling and residual blocks.""" for l in self.ups: remove_weight_norm(l) for l in self.resblocks: l.remove_weight_norm() - class SineGenerator(torch.nn.Module): """ - A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise. + Sine wave generator with optional harmonic overtones and noise. + + This module generates sine waves for a fundamental frequency and its harmonics. + It can also add Gaussian noise and apply a voiced/unvoiced mask. Args: - sampling_rate (int): The sampling rate in Hz. - num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0. - sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1. - noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003. - voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0. + sampling_rate (int): The sampling rate of the audio in Hz. + num_harmonics (int, optional): The number of harmonic overtones to generate. Defaults to 0. + sine_amplitude (float, optional): The amplitude of the sine wave components. Defaults to 0.1. + noise_stddev (float, optional): The standard deviation of the additive Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): The threshold for the fundamental frequency (F0) to determine if a frame is voiced. Defaults to 0.0. """ def __init__( @@ -137,21 +140,21 @@ def __init__( def _compute_voiced_unvoiced(self, f0: torch.Tensor): """ - Generate a binary mask to indicate voiced/unvoiced frames. + Generates a binary mask indicating voiced/unvoiced frames based on the fundamental frequency. Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length). + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length). """ uv_mask = (f0 > self.voiced_threshold).float() return uv_mask def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): """ - Generate sine waves for the fundamental frequency and its harmonics. + Generates sine waves for the fundamental frequency and its harmonics. Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). 
- upsampling_factor (int): Upsampling factor. + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + upsampling_factor (int): The factor by which to upsample the sine wave. """ batch_size, length, _ = f0.shape @@ -187,13 +190,6 @@ def _generate_sine_wave(self, f0: torch.Tensor, upsampling_factor: int): return sine_waves def forward(self, f0: torch.Tensor, upsampling_factor: int): - """ - Forward pass to generate sine waveforms with noise and voiced/unvoiced masking. - - Args: - f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1). - upsampling_factor (int): Upsampling factor. - """ with torch.no_grad(): # Expand `f0` to include waveform dimensions f0 = f0.unsqueeze(-1) @@ -224,4 +220,4 @@ def forward(self, f0: torch.Tensor, upsampling_factor: int): # Combine sine waves and noise sine_waveforms = sine_waves * voiced_mask + noise - return sine_waveforms, voiced_mask, noise + return sine_waveforms, voiced_mask, noise \ No newline at end of file diff --git a/rvc/lib/algorithm/generators/hifigan_mrf.py b/rvc/lib/algorithm/generators/hifigan_mrf.py index 3f96bb941..e3834ab82 100644 --- a/rvc/lib/algorithm/generators/hifigan_mrf.py +++ b/rvc/lib/algorithm/generators/hifigan_mrf.py @@ -10,6 +10,20 @@ class MRFLayer(torch.nn.Module): + """ + A single layer of the Multi-Receptive Field (MRF) block. + + This layer consists of two 1D convolutional layers with weight normalization + and Leaky ReLU activation in between. The first convolution has a dilation, + while the second has a dilation of 1. A skip connection is added from the input + to the output. + + Args: + channels (int): The number of input and output channels. + kernel_size (int): The kernel size of the convolutional layers. + dilation (int): The dilation rate for the first convolutional layer. + """ + def __init__(self, channels, kernel_size, dilation): super().__init__() self.conv1 = weight_norm( @@ -27,7 +41,7 @@ def __init__(self, channels, kernel_size, dilation): ) ) - def forward(self, x): + def forward(self, x: torch.Tensor): y = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) y = self.conv1(y) y = torch.nn.functional.leaky_relu(y, LRELU_SLOPE) @@ -40,13 +54,25 @@ def remove_weight_norm(self): class MRFBlock(torch.nn.Module): + """ + A Multi-Receptive Field (MRF) block. + + This block consists of multiple MRFLayers with different dilation rates. + It applies each layer sequentially to the input. + + Args: + channels (int): The number of input and output channels for the MRFLayers. + kernel_size (int): The kernel size for the convolutional layers in the MRFLayers. + dilations (list[int]): A list of dilation rates for the MRFLayers. 
+ """ + def __init__(self, channels, kernel_size, dilations): super().__init__() self.layers = torch.nn.ModuleList() for dilation in dilations: self.layers.append(MRFLayer(channels, kernel_size, dilation)) - def forward(self, x): + def forward(self, x: torch.Tensor): for layer in self.layers: x = layer(x) return x @@ -57,30 +83,27 @@ def remove_weight_norm(self): class SineGenerator(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). + noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). """ def __init__( self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, + samp_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + noise_std: float = 0.003, + voiced_threshold: float = 0, ): super(SineGenerator, self).__init__() self.sine_amp = sine_amp @@ -90,17 +113,28 @@ def __init__( self.sampling_rate = samp_rate self.voiced_threshold = voiced_threshold - def _f02uv(self, f0): + def _f02uv(self, f0: torch.Tensor): + """ + Generates voiced/unvoiced (UV) signal based on the fundamental frequency (F0). + + Args: + f0 (torch.Tensor): Fundamental frequency tensor of shape (batch_size, length, 1). + """ # generate uv signal uv = torch.ones_like(f0) uv = uv * (f0 > self.voiced_threshold) return uv - def _f02sine(self, f0_values): - """f0_values: (batchsize, length, dim) - where dim indicates fundamental tone and overtones + def _f02sine(self, f0_values: torch.Tensor): + """ + Generates sine waveforms based on the fundamental frequency (F0) and its harmonics. + + Args: + f0_values (torch.Tensor): Tensor of fundamental frequency and its harmonics, + shape (batch_size, length, dim), where dim indicates + the fundamental tone and overtones. """ - # convert to F0 in rad. The interger part n can be ignored + # convert to F0 in rad. 
The integer part n can be ignored # because 2 * np.pi * n doesn't affect phase rad_values = (f0_values / self.sampling_rate) % 1 @@ -121,13 +155,7 @@ def _f02sine(self, f0_values: torch.Tensor): return sines - def forward(self, f0): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ + def forward(self, f0: torch.Tensor): with torch.no_grad(): f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component @@ -147,13 +175,27 @@ class SourceModuleHnNSF(torch.nn.Module): + """ + Generates harmonic and noise source features. + + This module uses the SineGenerator to create harmonic signals based on the + fundamental frequency (F0) and merges them into a single excitation signal. + + Args: + sampling_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. + """ + def __init__( self, - sampling_rate, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshold=0, + sampling_rate: int, + harmonic_num: int = 0, + sine_amp: float = 0.1, + add_noise_std: float = 0.003, + voiced_threshold: float = 0, ): super(SourceModuleHnNSF, self).__init__() @@ -169,7 +211,7 @@ def __init__( self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - def forward(self, x): + def forward(self, x: torch.Tensor): sine_wavs, uv, _ = self.l_sin_gen(x) sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) @@ -178,18 +220,39 @@ class HiFiGANMRFGenerator(torch.nn.Module): + """ + HiFi-GAN generator with Multi-Receptive Field (MRF) blocks. + + This generator takes an input feature sequence and fundamental frequency (F0) + as input and generates an audio waveform. It utilizes transposed convolutions + for upsampling and MRF blocks for feature refinement. It can also condition + on global conditioning features. + + Args: + in_channel (int): Number of input channels. + upsample_initial_channel (int): Number of channels after the initial convolution. + upsample_rates (list[int]): List of upsampling rates for the transposed convolutions. + upsample_kernel_sizes (list[int]): List of kernel sizes for the transposed convolutions. + resblock_kernel_sizes (list[int]): List of kernel sizes for the convolutional layers in the MRF blocks. + resblock_dilations (list[list[int]]): List of lists of dilation rates for the MRF blocks. + gin_channels (int): Number of global conditioning input channels (0 if no global conditioning). + sample_rate (int): Sampling rate of the audio. + harmonic_num (int): Number of harmonics to generate. + checkpointing (bool): Whether to use checkpointing to save memory during training (default: False).
+ """ + def __init__( self, - in_channel, - upsample_initial_channel, - upsample_rates, - upsample_kernel_sizes, - resblock_kernel_sizes, - resblock_dilations, - gin_channels, - sample_rate, - harmonic_num, - checkpointing=False, + in_channel: int, + upsample_initial_channel: int, + upsample_rates: list[int], + upsample_kernel_sizes: list[int], + resblock_kernel_sizes: list[int], + resblock_dilations: list[list[int]], + gin_channels: int, + sample_rate: int, + harmonic_num: int, + checkpointing: bool = False, ): super().__init__() self.num_kernels = len(resblock_kernel_sizes) @@ -272,7 +335,9 @@ def __init__( if gin_channels != 0: self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) - def forward(self, x, f0, g: Optional[torch.Tensor] = None): + def forward( + self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None + ): f0 = self.f0_upsample(f0[:, None, :]).transpose(-1, -2) har_source, _, _ = self.m_source(f0) har_source = har_source.transpose(-1, -2) diff --git a/rvc/lib/algorithm/generators/hifigan_nsf.py b/rvc/lib/algorithm/generators/hifigan_nsf.py index 500159755..c17b7a6a1 100644 --- a/rvc/lib/algorithm/generators/hifigan_nsf.py +++ b/rvc/lib/algorithm/generators/hifigan_nsf.py @@ -9,18 +9,19 @@ from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock from rvc.lib.algorithm.commons import init_weights - class SourceModuleHnNSF(torch.nn.Module): """ - Source Module for harmonic-plus-noise excitation. + Source Module for generating harmonic and noise components for audio synthesis. + + This module generates a harmonic source signal using sine waves and adds + optional noise. It's often used in neural vocoders as a source of excitation. Args: - sample_rate (int): Sampling rate in Hz. - harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. - sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. - add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. - voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. - is_half (bool, optional): Whether to use half precision. Defaults to True. + sample_rate (int): Sampling rate of the audio in Hz. + harmonic_num (int, optional): Number of harmonic overtones to generate above the fundamental frequency (F0). Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave components. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive white Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold for the fundamental frequency (F0) to determine if a frame is voiced. If F0 is below this threshold, it's considered unvoiced. Defaults to 0. """ def __init__( @@ -30,13 +31,11 @@ def __init__( sine_amp: float = 0.1, add_noise_std: float = 0.003, voiced_threshod: float = 0, - is_half: bool = True, ): super(SourceModuleHnNSF, self).__init__() self.sine_amp = sine_amp self.noise_std = add_noise_std - self.is_half = is_half self.l_sin_gen = SineGenerator( sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod @@ -50,22 +49,24 @@ def forward(self, x: torch.Tensor, upsample_factor: int = 1): sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None - class HiFiGANNSFGenerator(torch.nn.Module): """ - Generator for synthesizing audio using the NSF (Neural Source Filter) approach. + Generator module based on the Neural Source Filter (NSF) architecture. 
+ + This generator synthesizes audio by first generating a source excitation signal + (harmonic and noise) and then filtering it through a series of upsampling and + residual blocks. Global conditioning can be applied to influence the generation. Args: - initial_channel (int): Number of channels in the initial convolutional layer. - resblock (str): Type of residual block to use (1 or 2). - resblock_kernel_sizes (list): Kernel sizes of the residual blocks. - resblock_dilation_sizes (list): Dilation rates of the residual blocks. - upsample_rates (list): Upsampling rates. - upsample_initial_channel (int): Number of channels in the initial upsampling layer. - upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. - gin_channels (int): Number of channels for the global conditioning input. - sr (int): Sampling rate. - is_half (bool, optional): Whether to use half precision. Defaults to False. + initial_channel (int): Number of input channels to the initial convolutional layer. + resblock_kernel_sizes (list): List of kernel sizes for the residual blocks. + resblock_dilation_sizes (list): List of lists of dilation rates for the residual blocks, corresponding to each kernel size. + upsample_rates (list): List of upsampling factors for each upsampling layer. + upsample_initial_channel (int): Number of output channels from the initial convolutional layer, which is also the input to the first upsampling layer. + upsample_kernel_sizes (list): List of kernel sizes for the transposed convolutional layers used for upsampling. + gin_channels (int): Number of input channels for the global conditioning. If 0, no global conditioning is used. + sr (int): Sampling rate of the audio. + checkpointing (bool, optional): Whether to use gradient checkpointing to save memory during training. Defaults to False. 
""" def __init__( @@ -78,8 +79,7 @@ def __init__( upsample_kernel_sizes: list, gin_channels: int, sr: int, - is_half: bool = False, - checkpointing = False, + checkpointing: bool = False, ): super(HiFiGANNSFGenerator, self).__init__() @@ -88,7 +88,7 @@ def __init__( self.checkpointing = checkpointing self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) self.m_source = SourceModuleHnNSF( - sample_rate=sr, harmonic_num=0, is_half=is_half + sample_rate=sr, harmonic_num=0 ) self.conv_pre = torch.nn.Conv1d( @@ -169,7 +169,7 @@ def __init__( self.upp = math.prod(upsample_rates) self.lrelu_slope = LRELU_SLOPE - def forward(self, x, f0, g: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, f0: torch.Tensor, g: Optional[torch.Tensor] = None): har_source, _, _ = self.m_source(f0, self.upp) har_source = har_source.transpose(1, 2) @@ -180,18 +180,21 @@ def forward(self, x, f0, g: Optional[torch.Tensor] = None): for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) - + + # Apply upsampling layer if self.training and self.checkpointing: x = checkpoint.checkpoint(ups, x, use_reentrant=False) else: x = ups(x) - + + # Add noise excitation x += noise_convs(har_source) + # Apply residual blocks def resblock_forward(x, blocks): return sum(block(x) for block in blocks) / len(blocks) - blocks = self.resblocks[i * self.num_kernels:(i + 1) * self.num_kernels] + blocks = self.resblocks[i * self.num_kernels : (i + 1) * self.num_kernels] # Checkpoint or regular computation for ResBlocks if self.training and self.checkpointing: @@ -225,4 +228,4 @@ def __prepare_scriptable__(self): and hook.__class__.__name__ == "WeightNorm" ): remove_weight_norm(l) - return self + return self \ No newline at end of file diff --git a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py index fe5cddf12..0af95eeb3 100644 --- a/rvc/lib/algorithm/generators/refinegan.py +++ b/rvc/lib/algorithm/generators/refinegan.py @@ -4,12 +4,23 @@ from torch.nn.utils.parametrize import remove_parametrizations import torch.utils.checkpoint as checkpoint +from rvc.lib.algorithm.commons import get_padding -def get_padding(kernel_size: int, dilation: int = 1): - return int((kernel_size * dilation - dilation) / 2) +class ResBlock(torch.nn.Module): + """ + Residual block with multiple dilated convolutions. + This block applies a sequence of dilated convolutional layers with Leaky ReLU activation. + It's designed to capture information at different scales due to the varying dilation rates. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int, optional): Kernel size for the convolutional layers. Defaults to 7. + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. + """ -class ResBlock(torch.nn.Module): def __init__( self, *, @@ -59,7 +70,7 @@ def __init__( ) self.convs2.apply(self.init_weights) - def forward(self, x): + def forward(self, x: torch.Tensor): for idx, (c1, c2) in enumerate(zip(self.convs1, self.convs2)): xt = torch.nn.functional.leaky_relu(x, self.leaky_relu_slope) xt = c1(xt) @@ -83,8 +94,16 @@ def init_weights(self, m): m.weight.data.normal_(0, 0.01) m.bias.data.fill_(0.0) - class AdaIN(torch.nn.Module): + """ + Adaptive Instance Normalization layer. 
+ + This layer applies a scaling factor to the input based on a learnable weight. + + Args: + channels (int): Number of input channels. + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation applied after scaling. Defaults to 0.2. + """ def __init__( self, *, @@ -101,14 +120,23 @@ def forward(self, x: torch.Tensor): return self.activation(x + gaussian) - class ParallelResBlock(torch.nn.Module): + """ + Parallel residual block that applies multiple residual blocks with different kernel sizes in parallel. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_sizes (tuple[int], optional): Tuple of kernel sizes for the parallel residual blocks. Defaults to (3, 7, 11). + dilation (tuple[int], optional): Tuple of dilation rates for the convolutional layers within the residual blocks. Defaults to (1, 3, 5). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. + """ def __init__( self, *, in_channels: int, out_channels: int, - kernel_sizes: int = (3, 7, 11), + kernel_sizes: tuple[int] = (3, 7, 11), dilation: tuple[int] = (1, 3, 5), leaky_relu_slope: float = 0.2, ): @@ -153,23 +181,19 @@ def remove_parametrizations(self): for block in self.blocks: block[1].remove_parametrizations() - class SineGenerator(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) + """ + Definition of sine generator + + Generates sine waveforms with optional harmonics and additive noise. + Can be used to create harmonic noise source for neural vocoders. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int): Number of harmonic overtones (default 0). + sine_amp (float): Amplitude of sine-waveform (default 0.1). + noise_std (float): Standard deviation of Gaussian noise (default 0.003). + voiced_threshold (float): F0 threshold for voiced/unvoiced classification (default 0). """ def __init__( @@ -220,12 +244,6 @@ def _f02sine(self, f0_values): return sines def forward(self, f0): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ with torch.no_grad(): f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) # fundamental component @@ -243,8 +261,19 @@ def forward(self, f0): sine_waves = sine_waves * uv + noise * (1 - uv) return sine_waves, uv, noise - class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for generating harmonic and noise signals. + + This module uses a SineGenerator to produce harmonic signals based on the fundamental frequency (F0). + + Args: + sampling_rate (int): Sampling rate of the audio. + harmonic_num (int, optional): Number of harmonics to generate. Defaults to 0. + sine_amp (float, optional): Amplitude of the sine wave. Defaults to 0.1. + add_noise_std (float, optional): Standard deviation of the additive noise. 
Defaults to 0.003. + voiced_threshold (int, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0. + """ def __init__( self, sampling_rate, @@ -267,15 +296,31 @@ def __init__( self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) self.l_tanh = torch.nn.Tanh() - def forward(self, x): + def forward(self, x: torch.Tensor): sine_wavs, uv, _ = self.l_sin_gen(x) sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) sine_merge = self.l_tanh(self.l_linear(sine_wavs)) return sine_merge, None, None - class RefineGANGenerator(torch.nn.Module): + """ + RefineGAN generator for audio synthesis. + + This generator uses a combination of downsampling, residual blocks, and parallel residual blocks + to refine an input mel-spectrogram and fundamental frequency (F0) into an audio waveform. + It can also incorporate global conditioning. + + Args: + sample_rate (int, optional): Sampling rate of the audio. Defaults to 44100. + downsample_rates (tuple[int], optional): Downsampling rates for the downsampling blocks. Defaults to (2, 2, 8, 8). + upsample_rates (tuple[int], optional): Upsampling rates for the upsampling blocks. Defaults to (8, 8, 2, 2). + leaky_relu_slope (float, optional): Slope for the Leaky ReLU activation. Defaults to 0.2. + num_mels (int, optional): Number of mel-frequency bins in the input mel-spectrogram. Defaults to 128. + start_channels (int, optional): Number of channels in the initial convolutional layer. Defaults to 16. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 256. + checkpointing (bool, optional): Whether to use checkpointing for memory efficiency. Defaults to False. + """ def __init__( self, *, @@ -432,4 +477,4 @@ def remove_parametrizations(self): block[1].remove_parametrizations() for block in self.upsample_conv_blocks: - block.remove_parametrizations() + block.remove_parametrizations() \ No newline at end of file diff --git a/rvc/lib/algorithm/modules.py b/rvc/lib/algorithm/modules.py index 55454abb6..611c45d38 100644 --- a/rvc/lib/algorithm/modules.py +++ b/rvc/lib/algorithm/modules.py @@ -3,7 +3,8 @@ class WaveNet(torch.nn.Module): - """WaveNet residual blocks as used in WaveGlow. + """ + WaveNet residual blocks as used in WaveGlow. Args: hidden_channels (int): Number of hidden channels. @@ -75,13 +76,6 @@ def __init__( ) def forward(self, x, x_mask, g=None): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps). - x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps). - g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps). - """ output = x.clone().zero_() # Apply conditional layer if global conditioning is provided @@ -115,7 +109,6 @@ def forward(self, x, x_mask, g=None): return output * x_mask def remove_weight_norm(self): - """Remove weight normalization from the module.""" if self.gin_channels: torch.nn.utils.remove_weight_norm(self.cond_layer) for layer in self.in_layers: diff --git a/rvc/lib/algorithm/normalization.py b/rvc/lib/algorithm/normalization.py index a2a898cb6..94a29bac9 100644 --- a/rvc/lib/algorithm/normalization.py +++ b/rvc/lib/algorithm/normalization.py @@ -2,7 +2,8 @@ class LayerNorm(torch.nn.Module): - """Layer normalization module. + """ + Layer normalization module. Args: channels (int): Number of channels. @@ -16,12 +17,6 @@ def __init__(self, channels: int, eps: float = 1e-5): self.beta = torch.nn.Parameter(torch.zeros(channels)) def forward(self, x): - """Forward pass. 
- - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). - - """ # Transpose to (batch_size, time_steps, channels) for layer_norm x = x.transpose(1, -1) x = torch.nn.functional.layer_norm( diff --git a/rvc/lib/algorithm/residuals.py b/rvc/lib/algorithm/residuals.py index fbee72c81..ac151187a 100644 --- a/rvc/lib/algorithm/residuals.py +++ b/rvc/lib/algorithm/residuals.py @@ -65,12 +65,6 @@ def _create_convs(channels: int, kernel_size: int, dilations: Tuple[int]): return layers def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, sequence_length). - x_mask (torch.Tensor, optional): Optional mask to apply to the input and output tensors. - """ for conv1, conv2 in zip(self.convs1, self.convs2): x_residual = x x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) @@ -82,26 +76,18 @@ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None): return apply_mask(x, x_mask) def remove_weight_norm(self): - """ - Removes weight normalization from all convolutional layers in the block. - """ for conv in chain(self.convs1, self.convs2): remove_weight_norm(conv) class Flip(torch.nn.Module): - """Flip module for flow-based models. + """ + Flip module for flow-based models. This module flips the input along the time dimension. """ def forward(self, x, *args, reverse=False, **kwargs): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor. - reverse (bool, optional): Whether to reverse the operation. Defaults to False. - """ x = torch.flip(x, [1]) if not reverse: logdet = torch.zeros(x.size(0), dtype=x.dtype, device=x.device) @@ -111,7 +97,8 @@ def forward(self, x, *args, reverse=False, **kwargs): class ResidualCouplingBlock(torch.nn.Module): - """Residual Coupling Block for normalizing flow. + """ + Residual Coupling Block for normalizing flow. Args: channels (int): Number of channels in the input. @@ -173,12 +160,10 @@ def forward( return x def remove_weight_norm(self): - """Removes weight normalization from the coupling layers.""" for i in range(self.n_flows): self.flows[i * 2].remove_weight_norm() def __prepare_scriptable__(self): - """Prepares the module for scripting.""" for i in range(self.n_flows): for hook in self.flows[i * 2]._forward_pre_hooks.values(): if ( @@ -191,7 +176,8 @@ def __prepare_scriptable__(self): class ResidualCouplingLayer(torch.nn.Module): - """Residual coupling layer for flow-based models. + """ + Residual coupling layer for flow-based models. Args: channels (int): Number of channels. @@ -247,15 +233,6 @@ def forward( g: Optional[torch.Tensor] = None, reverse: bool = False, ): - """Forward pass. - - Args: - x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). - x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps). - g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps). - Defaults to None. - reverse (bool, optional): Whether to reverse the operation. Defaults to False. 
- """ x0, x1 = torch.split(x, [self.half_channels] * 2, 1) h = self.pre(x0) * x_mask h = self.enc(h, x_mask, g=g) @@ -277,5 +254,4 @@ def forward( return x def remove_weight_norm(self): - """Remove weight normalization from the module.""" self.enc.remove_weight_norm() diff --git a/rvc/lib/algorithm/synthesizers.py b/rvc/lib/algorithm/synthesizers.py index 42962e7bc..9c8ed8253 100644 --- a/rvc/lib/algorithm/synthesizers.py +++ b/rvc/lib/algorithm/synthesizers.py @@ -114,7 +114,6 @@ def __init__( upsample_kernel_sizes, gin_channels=gin_channels, sr=sr, - is_half=kwargs["is_half"], checkpointing=checkpointing, ) else: @@ -155,13 +154,11 @@ def __init__( self.emb_g = torch.nn.Embedding(spk_embed_dim, gin_channels) def _remove_weight_norm_from(self, module): - """Utility to remove weight normalization from a module.""" for hook in module._forward_pre_hooks.values(): if getattr(hook, "__class__", None).__name__ == "WeightNorm": torch.nn.utils.remove_weight_norm(module) def remove_weight_norm(self): - """Removes weight normalization from the model.""" for module in [self.dec, self.flow, self.enc_q]: self._remove_weight_norm_from(module) @@ -179,18 +176,6 @@ def forward( y_lengths: Optional[torch.Tensor] = None, ds: Optional[torch.Tensor] = None, ): - """ - Forward pass of the model. - - Args: - phone (torch.Tensor): Phoneme sequence. - phone_lengths (torch.Tensor): Lengths of the phoneme sequences. - pitch (torch.Tensor, optional): Pitch sequence. - pitchf (torch.Tensor, optional): Fine-grained pitch sequence. - y (torch.Tensor, optional): Target spectrogram. - y_lengths (torch.Tensor, optional): Lengths of the target spectrograms. - ds (torch.Tensor, optional): Speaker embedding. - """ g = self.emb_g(ds).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) diff --git a/rvc/train/mel_processing.py b/rvc/train/mel_processing.py index fafa6af37..e4d51cbc1 100644 --- a/rvc/train/mel_processing.py +++ b/rvc/train/mel_processing.py @@ -146,19 +146,20 @@ def mel_spectrogram_torch( return melspec -def compute_window_length(n_mels: int, sample_rate: int) -> int: +def compute_window_length(n_mels: int, sample_rate: int): f_min = 0 f_max = sample_rate / 2 window_length_seconds = 8 * n_mels / (f_max - f_min) window_length = int(window_length_seconds * sample_rate) return 2 ** (window_length.bit_length() - 1) + class MultiScaleMelSpectrogramLoss(torch.nn.Module): def __init__( self, sample_rate: int = 24000, - n_mels: list[int]=[5, 10, 20, 40, 80, 160, 320, 480], + n_mels: list[int] = [5, 10, 20, 40, 80, 160, 320, 480], loss_fn=torch.nn.L1Loss(), ): super().__init__() @@ -169,7 +170,10 @@ def __init__( self.hann_window: dict[int, torch.Tensor] = {} self.mel_banks: dict[int, torch.Tensor] = {} - self.stft_params = [(mel, compute_window_length(mel, sample_rate), self.sample_rate // 100) for mel in n_mels] + self.stft_params = [ + (mel, compute_window_length(mel, sample_rate), self.sample_rate // 100) + for mel in n_mels + ] def mel_spectrogram( self, @@ -177,15 +181,17 @@ def mel_spectrogram( n_mels: int, window_length: int, hop_length: int, - ) -> torch.Tensor: + ): # IDs for caching dtype_device = str(wav.dtype) + "_" + str(wav.device) win_dtype_device = str(window_length) + "_" + dtype_device mel_dtype_device = str(n_mels) + "_" + dtype_device # caching hann window if win_dtype_device not in self.hann_window: - self.hann_window[win_dtype_device] = torch.hann_window(window_length, device=wav.device, dtype=torch.float32) - + self.hann_window[win_dtype_device] = torch.hann_window( + 
window_length, device=wav.device, dtype=torch.float32 + ) + wav = wav.squeeze(1) # -> torch(B, T) stft = torch.stft( @@ -209,13 +215,15 @@ def mel_spectrogram( fmax=None, ) ).to(device=wav.device, dtype=torch.float32) - + mel_spectrogram = torch.matmul( self.mel_banks[mel_dtype_device], magnitude ) # torch(B, n_mels, stft.frames) return mel_spectrogram - def forward(self, real: torch.Tensor, fake: torch.Tensor): # real: torch(B, 1, T) , fake: torch(B, 1, T) + def forward( + self, real: torch.Tensor, fake: torch.Tensor + ): # real: torch(B, 1, T) , fake: torch(B, 1, T) loss = 0.0 for p in self.stft_params: real_mels = self.mel_spectrogram(real, *p) @@ -223,4 +231,4 @@ def forward(self, real: torch.Tensor, fake: torch.Tensor): # real: torch(B, 1, real_logmels = torch.log(real_mels.clamp(min=1e-5)) / self.log_base fake_logmels = torch.log(fake_mels.clamp(min=1e-5)) / self.log_base loss += self.loss_fn(real_logmels, fake_logmels) - return loss \ No newline at end of file + return loss diff --git a/rvc/train/train.py b/rvc/train/train.py index 8cc7b172c..86a66fcd4 100644 --- a/rvc/train/train.py +++ b/rvc/train/train.py @@ -41,12 +41,10 @@ from losses import ( discriminator_loss, - discriminator_loss_scaled, feature_loss, generator_loss, - generator_loss_scaled, kl_loss, - envelope_loss + envelope_loss, ) from mel_processing import ( mel_spectrogram_torch, @@ -87,7 +85,7 @@ config = HParams(**config) config.data.training_files = os.path.join(experiment_dir, "filelist.txt") -# for nVidia's CUDA device selection can be done from command line / UI +# for Nvidia's CUDA device selection can be done from command line / UI # for AMD the device selection can only be done from .bat file using HIP_VISIBLE_DEVICES os.environ["CUDA_VISIBLE_DEVICES"] = gpus.replace("-", ",") @@ -104,7 +102,7 @@ lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} training_file_path = os.path.join(experiment_dir, "training_data.json") -avg_losses={ +avg_losses = { "gen_loss_queue": deque(maxlen=10), "disc_loss_queue": deque(maxlen=10), "disc_loss_50": deque(maxlen=50), @@ -386,7 +384,9 @@ def run( checkpointing=checkpointing, ).to(device) - net_d = MultiPeriodDiscriminator(version, config.model.use_spectral_norm, checkpointing=checkpointing).to(device) + net_d = MultiPeriodDiscriminator( + version, config.model.use_spectral_norm, checkpointing=checkpointing + ).to(device) optim_g = torch.optim.AdamW( net_g.parameters(), @@ -556,8 +556,8 @@ def train_and_evaluate( lowest_value = {"step": 0, "value": float("inf"), "epoch": 0} consecutive_increases_gen = 0 consecutive_increases_disc = 0 - - epoch_disc_sum = 0.0 + + epoch_disc_sum = 0.0 epoch_gen_sum = 0.0 net_g, net_d = nets @@ -623,9 +623,9 @@ def train_and_evaluate( ) y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) with autocast(enabled=False): - #if vocoder == "HiFi-GAN": + # if vocoder == "HiFi-GAN": # loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) - #else: + # else: # loss_disc, _, _ = discriminator_loss_scaled(y_d_hat_r, y_d_hat_g) loss_disc, _, _ = discriminator_loss(y_d_hat_r, y_d_hat_g) # Discriminator backward and update @@ -633,11 +633,13 @@ def train_and_evaluate( optim_d.zero_grad() scaler.scale(loss_disc).backward() scaler.unscale_(optim_d) - grad_norm_d = torch.nn.utils.clip_grad_norm_(net_d.parameters(), max_norm=1000.0) + grad_norm_d = torch.nn.utils.clip_grad_norm_( + net_d.parameters(), max_norm=1000.0 + ) scaler.step(optim_d) scaler.update() - if not math.isfinite(grad_norm_d): - print('\nWarning: grad_norm_d is NaN or Inf') + # 
if not math.isfinite(grad_norm_d): + # print("\nWarning: grad_norm_d is NaN or Inf") # Generator backward and update with autocast(enabled=use_amp): @@ -645,12 +647,14 @@ def train_and_evaluate( with autocast(enabled=False): loss_mel = fn_mel_loss(wave, y_hat) * config.train.c_mel / 3.0 loss_env = envelope_loss(wave, y_hat) - loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl + loss_kl = ( + kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl + ) loss_fm = feature_loss(fmap_r, fmap_g) - #if vocoder == "HiFi-GAN": - # loss_gen, _ = generator_loss(y_d_hat_g) - #else: - # loss_gen, _ = generator_loss_scaled(y_d_hat_g) + # if vocoder == "HiFi-GAN": + # loss_gen, _ = generator_loss(y_d_hat_g) + # else: + # loss_gen, _ = generator_loss_scaled(y_d_hat_g) loss_gen, _ = generator_loss(y_d_hat_g) loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + loss_env @@ -664,14 +668,16 @@ def train_and_evaluate( optim_g.zero_grad() scaler.scale(loss_gen_all).backward() scaler.unscale_(optim_g) - grad_norm_g = torch.nn.utils.clip_grad_norm_(net_g.parameters(), max_norm=1000.0) + grad_norm_g = torch.nn.utils.clip_grad_norm_( + net_g.parameters(), max_norm=1000.0 + ) scaler.step(optim_g) scaler.update() - if not math.isfinite(grad_norm_g): - print('\n Warning: grad_norm_g is NaN or Inf') + # if not math.isfinite(grad_norm_g): + # print("\n Warning: grad_norm_g is NaN or Inf") global_step += 1 - + # queue for rolling losses over 50 steps avg_losses["disc_loss_50"].append(loss_disc.detach()) avg_losses["env_loss_50"].append(loss_env.detach()) @@ -679,16 +685,28 @@ def train_and_evaluate( avg_losses["kl_loss_50"].append(loss_kl.detach()) avg_losses["mel_loss_50"].append(loss_mel.detach()) avg_losses["gen_loss_50"].append(loss_gen_all.detach()) - + if rank == 0 and global_step % 50 == 0: # logging rolling averages scalar_dict = { - "loss_avg_50/d/total": torch.mean(torch.stack(list(avg_losses["disc_loss_50"]))), - "loss_avg_50/g/env": torch.mean(torch.stack(list(avg_losses["env_loss_50"]))), - "loss_avg_50/g/fm": torch.mean(torch.stack(list(avg_losses["fm_loss_50"]))), - "loss_avg_50/g/kl": torch.mean(torch.stack(list(avg_losses["kl_loss_50"]))), - "loss_avg_50/g/mel": torch.mean(torch.stack(list(avg_losses["mel_loss_50"]))), - "loss_avg_50/g/total": torch.mean(torch.stack(list(avg_losses["gen_loss_50"]))), + "loss_avg_50/d/total": torch.mean( + torch.stack(list(avg_losses["disc_loss_50"])) + ), + "loss_avg_50/g/env": torch.mean( + torch.stack(list(avg_losses["env_loss_50"])) + ), + "loss_avg_50/g/fm": torch.mean( + torch.stack(list(avg_losses["fm_loss_50"])) + ), + "loss_avg_50/g/kl": torch.mean( + torch.stack(list(avg_losses["kl_loss_50"])) + ), + "loss_avg_50/g/mel": torch.mean( + torch.stack(list(avg_losses["mel_loss_50"])) + ), + "loss_avg_50/g/total": torch.mean( + torch.stack(list(avg_losses["gen_loss_50"])) + ), } summarize( writer=writer, @@ -703,10 +721,10 @@ def train_and_evaluate( # Logging and checkpointing if rank == 0: - + avg_losses["disc_loss_queue"].append(epoch_disc_sum.item() / len(train_loader)) avg_losses["gen_loss_queue"].append(epoch_gen_sum.item() / len(train_loader)) - + # used for tensorboard chart - all/mel mel = spec_to_mel_torch( spec, @@ -961,7 +979,7 @@ def train_and_evaluate( if done: os._exit(2333333) - + with torch.no_grad(): torch.cuda.empty_cache() From e96a659966ba595bac1db115e13ea69136211d58 Mon Sep 17 00:00:00 2001 From: Blaise Date: Sun, 22 Dec 2024 23:39:51 +0100 Subject: [PATCH 43/46] revert preprocess changes --- 
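Note on the refinegan.py hunks above: SineGenerator and SourceModuleHnNSF build a harmonic excitation signal from the upsampled per-sample F0 track, which the generator then filters into a waveform. A minimal sketch of the idea in PyTorch — the function name is hypothetical, and the phase is handled by a plain cumulative sum rather than the per-harmonic wrap-around logic of the actual module, so treat it as an illustration only:

import torch

def sine_excitation(f0, sample_rate=44100, harmonic_num=0,
                    sine_amp=0.1, noise_std=0.003, voiced_threshold=0.0):
    # f0: (batch, time) per-sample fundamental frequency, 0 where unvoiced
    harmonics = torch.arange(1, harmonic_num + 2, device=f0.device)
    f0_harm = f0.unsqueeze(-1) * harmonics              # (batch, time, n_harmonics)
    # integrate instantaneous frequency over time to obtain phase in radians
    phase = 2 * torch.pi * torch.cumsum(f0_harm / sample_rate, dim=1)
    sines = sine_amp * torch.sin(phase)
    uv = (f0 > voiced_threshold).float().unsqueeze(-1)  # voiced/unvoiced mask
    noise = noise_std * torch.randn_like(sines)
    # sine wave in voiced regions, noise in unvoiced regions
    return sines * uv + noise * (1 - uv)

The actual SourceModuleHnNSF additionally merges the stacked harmonics into a single excitation channel through its l_linear/l_tanh pair before handing the result to the generator.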
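The MultiScaleMelSpectrogramLoss in the rvc/train/mel_processing.py hunks compares log-mel spectrograms of the real and generated waveforms at several resolutions: each n_mels setting gets its own FFT window via compute_window_length (int(8 * n_mels / (sample_rate / 2) * sample_rate), which reduces to 16 * n_mels, rounded down to a power of two) and a fixed 10 ms hop (sample_rate // 100). A condensed equivalent using torchaudio — assumed here purely for illustration; the patched class instead builds its mel filter banks with librosa, caches windows and banks per device/dtype, and divides the natural log by a configurable log_base:

import torch
import torchaudio

def multi_scale_mel_loss(real, fake, sample_rate=44100,
                         n_mels_list=(5, 10, 20, 40, 80, 160, 320, 480),
                         loss_fn=torch.nn.L1Loss()):
    # real, fake: (batch, 1, samples) waveforms
    loss = 0.0
    hop = sample_rate // 100                          # 10 ms hop at every scale
    for n_mels in n_mels_list:
        win = 2 ** ((16 * n_mels).bit_length() - 1)   # power-of-two window per scale
        mel = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, n_fft=win, win_length=win,
            hop_length=hop, n_mels=n_mels, power=1.0, center=True,
        ).to(real.device)
        real_log = mel(real.squeeze(1)).clamp(min=1e-5).log()
        fake_log = mel(fake.squeeze(1)).clamp(min=1e-5).log()
        loss = loss + loss_fn(real_log, fake_log)
    return loss

Low mel counts with short windows catch broadband/temporal detail while high mel counts with long windows catch fine spectral structure, which is why the scales are summed rather than averaged.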
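The discriminator and generator updates in the train.py hunks above follow the documented torch.cuda.amp recipe for gradient clipping under mixed precision: the loss is scaled before backward(), the gradients are unscaled before clip_grad_norm_ (otherwise max_norm would be compared against scaled gradients), and scaler.step() internally skips the optimizer step whenever it finds inf/NaN gradients — presumably why the explicit grad-norm warnings could be commented out. The bare pattern, with hypothetical names:

import torch

scaler = torch.cuda.amp.GradScaler()

def clipped_step(loss, model, optimizer, max_norm=1000.0):
    optimizer.zero_grad()
    scaler.scale(loss).backward()    # backward on the scaled loss
    scaler.unscale_(optimizer)       # restore true gradient magnitudes first
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)           # skipped internally if grads are inf/NaN
    scaler.update()
    return grad_norm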
rvc/train/preprocess/preprocess.py | 106 ++++++++++++++++------------- 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py index 71c5c899d..edc02f87a 100644 --- a/rvc/train/preprocess/preprocess.py +++ b/rvc/train/preprocess/preprocess.py @@ -35,15 +35,6 @@ class PreProcess: def __init__(self, sr: int, exp_dir: str, per: float): - self.sr = sr - self.per = per - self.exp_dir = exp_dir - self.device = "cpu" - self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") - self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") - os.makedirs(self.gt_wavs_dir, exist_ok=True) - os.makedirs(self.wavs16k_dir, exist_ok=True) - self.slicer = Slicer( sr=sr, threshold=-42, @@ -52,9 +43,17 @@ def __init__(self, sr: int, exp_dir: str, per: float): hop_size=15, max_sil_kept=500, ) + self.sr = sr self.b_high, self.a_high = signal.butter( N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr ) + self.per = per + self.exp_dir = exp_dir + self.device = "cpu" + self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios") + self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k") + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) def _normalize_audio(self, audio: np.ndarray): tmp_max = np.abs(audio).max() @@ -97,19 +96,18 @@ def simple_cut( chunk_len: float, overlap_len: float, ): - chunk_samples = int(self.sr * chunk_len) - overlap_samples = int(self.sr * overlap_len) - step = chunk_samples - overlap_samples - num_chunks = (len(audio) - chunk_samples) // step + 1 - for i in range(num_chunks): - start = i * step - end = start + chunk_samples - if end <= len(audio): - chunk = audio[start:end] - file_index = i + chunk_length = int(self.sr * chunk_len) + overlap_length = int(self.sr * overlap_len) + i = 0 + while i < len(audio): + chunk = audio[i : i + chunk_length] + if len(chunk) == chunk_length: # full SR for training wavfile.write( - os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{file_index}.wav"), + os.path.join( + self.gt_wavs_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), self.sr, chunk.astype(np.float32), ) @@ -118,10 +116,14 @@ def simple_cut( chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE ) wavfile.write( - os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{file_index}.wav"), + os.path.join( + self.wavs16k_dir, + f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav", + ), SAMPLE_RATE_16K, chunk_16k.astype(np.float32), ) + i += chunk_length - overlap_length def process_audio( self, @@ -145,37 +147,46 @@ def process_audio( audio = self._normalize_audio(audio) if noise_reduction: audio = nr.reduce_noise( - y=audio, - sr=self.sr, - prop_decrease=reduction_strength, - n_fft=2048, - hop_length=512, + y=audio, sr=self.sr, prop_decrease=reduction_strength ) if cut_preprocess == "Skip": - self.process_audio_segment(audio, sid, idx0, 0) + # no cutting + self.process_audio_segment( + audio, + sid, + idx0, + 0, + ) elif cut_preprocess == "Simple": + # simple self.simple_cut(audio, sid, idx0, chunk_len, overlap_len) elif cut_preprocess == "Automatic": - segments = self.slicer.slice(audio) idx1 = 0 - for audio_segment in segments: - segment_length = len(audio_segment) - per_samples = int(self.sr * self.per) - overlap_samples_segment = int(self.sr * OVERLAP) - step = per_samples - overlap_samples_segment - - num_sub_segments = (segment_length - per_samples + step - 1) // step - - for i in range(num_sub_segments): - start = i * step - end = 
start + per_samples - if end <= segment_length: - tmp_audio = audio_segment[start:end] - self.process_audio_segment(tmp_audio, sid, idx0, idx1) + # legacy + for audio_segment in self.slicer.slice(audio): + i = 0 + while True: + start = int(self.sr * (self.per - OVERLAP) * i) + i += 1 + if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr: + tmp_audio = audio_segment[ + start : start + int(self.per * self.sr) + ] + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + ) idx1 += 1 - elif start < segment_length: + else: tmp_audio = audio_segment[start:] - self.process_audio_segment(tmp_audio, sid, idx0, idx1) + self.process_audio_segment( + tmp_audio, + sid, + idx0, + idx1, + ) idx1 += 1 break @@ -266,6 +277,7 @@ def preprocess_training_set( f'Speaker ID folder is expected to be integer, got "{os.path.basename(root)}" instead.' ) + # print(f"Number of files: {len(files)}") audio_length = [] with tqdm(total=len(files)) as pbar: with concurrent.futures.ProcessPoolExecutor( @@ -291,13 +303,13 @@ def preprocess_training_set( audio_length.append(future.result()) pbar.update(1) - total_audio_length = sum(audio_length) + audio_length = sum(audio_length) save_dataset_duration( - os.path.join(exp_dir, "model_info.json"), dataset_duration=total_audio_length + os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length ) elapsed_time = time.time() - start_time print( - f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(total_audio_length)} seconds of audio." + f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio." ) From e3272644283641f76093eaa13eb1a387da9cf22e Mon Sep 17 00:00:00 2001 From: Vidalnt Date: Sun, 22 Dec 2024 17:48:49 -0500 Subject: [PATCH 44/46] feat: :art: Updating the colab with respect to cli --- assets/Applio_NoUI.ipynb | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/assets/Applio_NoUI.ipynb b/assets/Applio_NoUI.ipynb index 9a0ac6c03..edd9a9ca0 100644 --- a/assets/Applio_NoUI.ipynb +++ b/assets/Applio_NoUI.ipynb @@ -429,12 +429,14 @@ "sample_rate = \"40k\" # @param [\"32k\", \"40k\", \"48k\"] {allow-input: false}\n", "sr = int(sample_rate.rstrip(\"k\")) * 1000\n", "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n", - "cut_preprocess = True # @param{type:\"boolean\"}\n", + "cut_preprocess = \"Automatic\" # @param [\"Skip\", \"Simple\", \"Automatic\"] {allow-input: false}\n", "process_effects = False # @param{type:\"boolean\"}\n", "noise_reduction = False # @param{type:\"boolean\"}\n", "noise_reduction_strength = 0.7 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n", + "chunk_len = 3.0 # @param {type:\"slider\", min:0.5, max:5.0, step:0.5}\n", + "overlap_len = 0.3 # @param {type:\"slider\", min:0.0, max:0.4, step:0.1}\n", "\n", - "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\"" + "!python core.py preprocess --model_name \"{model_name}\" --dataset_path \"{dataset_path}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --cut_preprocess \"{cut_preprocess}\" --process_effects \"{process_effects}\" --noise_reduction \"{noise_reduction}\" --noise_reduction_strength \"{noise_reduction_strength}\" --chunk_len \"{chunk_len}\" --overlap_len \"{overlap_len}\"" ] }, 
{ @@ -453,10 +455,11 @@ "\n", "sr = int(sample_rate.rstrip(\"k\")) * 1000\n", "cpu_cores = 2 # @param {type:\"slider\", min:1, max:2, step:1}\n", + "include_mutes = 2 # @param {type:\"slider\", min:0, max:10, step:1}\n", "embedder_model = \"contentvec\" # @param [\"contentvec\", \"chinese-hubert-base\", \"japanese-hubert-base\", \"korean-hubert-base\", \"custom\"] {allow-input: false}\n", "embedder_model_custom = \"\" # @param {type:\"string\"}\n", "\n", - "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\"" + "!python core.py extract --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --f0_method \"{f0_method}\" --hop_length \"{hop_length}\" --sample_rate \"{sr}\" --cpu_cores \"{cpu_cores}\" --gpu \"0\" --embedder_model \"{embedder_model}\" --embedder_model_custom \"{embedder_model_custom}\" --include_mutes \"{include_mutes}\"" ] }, { @@ -597,7 +600,7 @@ " print(\"Autobackup Disabled\")\n", "else:\n", " autobackups = True\n", - " print(\"Autobackup Enabled\") \n", + " print(\"Autobackup Enabled\")\n", "# @markdown ### ⚙️ Train Settings\n", "total_epoch = 800 # @param {type:\"integer\"}\n", "batch_size = 15 # @param {type:\"slider\", min:1, max:25, step:0}\n", @@ -618,6 +621,8 @@ "custom_pretrained = False # @param{type:\"boolean\"}\n", "g_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/G48k.pth\" # @param {type:\"string\"}\n", "d_pretrained_path = \"/content/Applio/rvc/models/pretraineds/pretraineds_custom/D48k.pth\" # @param {type:\"string\"}\n", + "vocoder = \"HiFi-GAN\" # @param [\"HiFi-GAN\", \"MRF HiFi-GAN\", \"RefineGAN\"] {allow-input: false}\n", + "checkpointing = False # @param{type:\"boolean\"}\n", "\n", "if \"pretrained\" not in globals():\n", " pretrained = True\n", @@ -636,8 +641,7 @@ " if tensorboard == True:\n", " %load_ext tensorboard\n", " %tensorboard --logdir /content/Applio/logs/\n", - " !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\"\n", - "\n", + " !python core.py train --model_name \"{model_name}\" --rvc_version \"{rvc_version}\" --save_every_epoch \"{save_every_epoch}\" --save_only_latest \"{save_only_latest}\" --save_every_weights \"{save_every_weights}\" --total_epoch \"{total_epoch}\" --sample_rate \"{sr}\" --batch_size \"{batch_size}\" --gpu \"{gpu}\" --pretrained \"{pretrained}\" --custom_pretrained \"{custom_pretrained}\" --g_pretrained_path \"{g_pretrained_path}\" --d_pretrained_path \"{d_pretrained_path}\" --overtraining_detector \"{overtraining_detector}\" --overtraining_threshold \"{overtraining_threshold}\" --cleanup \"{cleanup}\" --cache_data_in_gpu \"{cache_data_in_gpu}\" --vocoder \"{vocoder}\" --checkpointing \"{checkpointing}\"\n", "\n", "server_thread = 
threading.Thread(target=start_train)\n", "server_thread.start()\n", From e338c883deccaa306785e0a13bcdc7758d990b1d Mon Sep 17 00:00:00 2001 From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com> Date: Sun, 22 Dec 2024 17:50:41 -0500 Subject: [PATCH 45/46] typo --- rvc/infer/infer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rvc/infer/infer.py b/rvc/infer/infer.py index d0c48b2f9..ad8cddea3 100644 --- a/rvc/infer/infer.py +++ b/rvc/infer/infer.py @@ -481,7 +481,7 @@ def setup_network(self): *self.cpt["config"], use_f0=self.use_f0, text_enc_hidden_dim=self.text_enc_hidden_dim, - is_half=False + is_half=False, vocoder=self.vocoder ) del self.net_g.enc_q From 29b5140affb755b6afbf172660514bb08d89f878 Mon Sep 17 00:00:00 2001 From: Blaise Date: Mon, 23 Dec 2024 00:26:41 +0100 Subject: [PATCH 46/46] fix train ui --- tabs/train/train.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/tabs/train/train.py b/tabs/train/train.py index 6cd6c12c5..2dcb15cc9 100644 --- a/tabs/train/train.py +++ b/tabs/train/train.py @@ -684,15 +684,23 @@ def train_tab(): interactive=True, ) with gr.Row(): - with gr.Column(): - custom_pretrained = gr.Checkbox( - label=i18n("Custom Pretrained"), - info=i18n( - "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance." - ), - value=False, - interactive=True, - ) + custom_pretrained = gr.Checkbox( + label=i18n("Custom Pretrained"), + info=i18n( + "Utilizing custom pretrained models can lead to superior results, as selecting the most suitable pretrained models tailored to the specific use case can significantly enhance performance." + ), + value=False, + interactive=True, + ) + overtraining_detector = gr.Checkbox( + label=i18n("Overtraining Detector"), + info=i18n( + "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data." + ), + value=False, + interactive=True, + ) + with gr.Row(): with gr.Column(visible=False) as pretrained_custom_settings: with gr.Accordion(i18n("Pretrained Custom Settings")): upload_pretrained = gr.File( @@ -721,14 +729,7 @@ def train_tab(): interactive=True, allow_custom_value=True, ) - overtraining_detector = gr.Checkbox( - label=i18n("Overtraining Detector"), - info=i18n( - "Detect overtraining to prevent the model from learning the training data too well and losing the ability to generalize to new data." - ), - value=False, - interactive=True, - ) + with gr.Column(visible=False) as overtraining_settings: with gr.Accordion(i18n("Overtraining Detector Settings")): overtraining_threshold = gr.Slider(