diff --git a/.github/environment-ci.yml b/.github/environment-ci.yml index d3c24541..5c0caeda 100644 --- a/.github/environment-ci.yml +++ b/.github/environment-ci.yml @@ -27,8 +27,8 @@ dependencies: - pip: - "keras<3.0.0" - "tensorflow>=2.12.0,<2.16" - - "torch==2.0.0" - - "torchaudio==2.0.1" + - "torch==1.13" + - "torchaudio" - "essentia" - "soundfile>=0.12.1" - "opencv-python~=4.6.0" diff --git a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py index 6c57b0c1..8cc92c06 100644 --- a/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py +++ b/compiam/separation/singing_voice_extraction/cold_diff_sep/__init__.py @@ -94,6 +94,7 @@ def separate( clusters=5, scheduler=4, chunk_size=3, + normalize_input=True, gpu="-1", ): """Separate singing voice from mixture. @@ -103,6 +104,8 @@ def separate( relevant if the input is an array of data instead of a filepath. :param clusters: Number of clusters to use to build the separation masks. :param scheduler: Scheduler factor to weight the clusters to be more or less restirctive with the interferences. + :param chunk_size: Size of the chunks to process the audio signal. + :param normalize_input: Normalize the input audio signal. :param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc. :return: Singing voice signal. """ @@ -153,6 +156,12 @@ def separate( f"Downsampling to mono... your audio is stereo, \ and the model is trained on mono audio." ) + + if normalize_input: + # Normalizing audio for better performance overall + mean = tf.reduce_mean(mixture, keepdims=True) + std = tf.math.reduce_std(mixture, keepdims=True) + mixture = (mixture - mean) / (1e-6 + std) output_voc = np.zeros(mixture.shape) hopsized_chunk = int((chunk_size * self.sample_rate) / 2) diff --git a/tests/melody/test_deepsrgm.py b/tests/melody/test_deepsrgm.py index e01dc73f..0d47655d 100644 --- a/tests/melody/test_deepsrgm.py +++ b/tests/melody/test_deepsrgm.py @@ -43,8 +43,11 @@ def _get_features(): feat = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - feat_1 = deepsrgm.get_features(np.zeros(44100)) - feat_2 = deepsrgm.get_features( + feat_1 = deepsrgm.get_features(np.zeros([44100])) + feat_2 = deepsrgm.get_features(np.zeros([1, 44100])) + feat_3 = deepsrgm.get_features(np.zeros([2, 44100])) + feat_3 = deepsrgm.get_features(np.zeros([44100, 2])) + feat_4 = deepsrgm.get_features( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) diff --git a/tests/melody/test_essentia_extractors.py b/tests/melody/test_essentia_extractors.py index b2db2904..622c969a 100644 --- a/tests/melody/test_essentia_extractors.py +++ b/tests/melody/test_essentia_extractors.py @@ -1,5 +1,6 @@ import os import pytest +import librosa import numpy as np @@ -15,9 +16,9 @@ def _predict_normalized_pitch(): pitch = melodia.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - pitch_2 = melodia.extract(np.zeros(44100)) - pitch_3 = melodia.extract(np.zeros(2, 44100)) # Testing input array - pitch_4 = melodia.extract(np.zeros(44100, 2)) # Testing input array + pitch_2 = melodia.extract(np.zeros([44100])) + pitch_3 = melodia.extract(np.zeros([2, 44100])) # Testing input array + pitch_4 = melodia.extract(np.zeros([44100, 2])) # Testing input array assert isinstance(pitch, np.ndarray) assert np.shape(pitch) == (699, 2) @@ -70,11 +71,10 @@ def _predict_normalized_pitch(): tonic = tonic_multipitch.extract( os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav") ) - tonic_2 = tonic_multipitch.extract(np.zeros(44100)) # Testing input array - tonic_3 = tonic_multipitch.extract(np.zeros(2, 44100)) # Testing input array - tonic_4 = tonic_multipitch.extract(np.zeros(44100, 2)) # Testing input array - - + audio = librosa.load(os.path.join(TESTDIR, "resources", "melody", "pitch_test.wav"), sr=44100)[0] + tonic_2 = tonic_multipitch.extract(audio) # Testing input array + tonic_3 = tonic_multipitch.extract(np.stack([audio, audio])) # Testing input array + tonic_4 = tonic_multipitch.extract(np.stack([audio, audio]).T) # Testing input array assert isinstance(tonic, float) assert tonic == 157.64892578125