Skip to content

Commit

Permalink
Merge pull request #2 from maks00170/review
Browse files Browse the repository at this point in the history
Review passed
  • Loading branch information
maks00170 authored Jan 16, 2024
2 parents 8a8e6d5 + 3bea291 commit d52dd15
Show file tree
Hide file tree
Showing 19 changed files with 854 additions and 669 deletions.
9 changes: 9 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.github
**/__pycache__/
separator/inference/
streaming/weights/
streaming/input/
streaming/streams/
streaming/model/
streaming/tflite_model/

2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ jobs:
run: |
pip install -r requirements-dev.txt
- name: "black"
run: black . --check --diff --color
run: black . --check --diff --color --exclude .*/config/
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.vscode
**/__pycache__/
separator/inference/
streaming/weights
streaming/input/
streaming/streams/
streaming/model/
streaming/tflite_model/
19 changes: 5 additions & 14 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
FROM nvcr.io/nvidia/tensorrt:22.08-py3

ENV NV_CUDNN_VERSION 8.6.0.163
ENV NV_CUDNN_PACKAGE_NAME "libcudnn8"
ENV PYTHONUNBUFFERED=1

ENV NV_CUDNN_PACKAGE "$NV_CUDNN_PACKAGE_NAME=$NV_CUDNN_VERSION-1+cuda11.8"
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get -y update && apt-get -y upgrade && apt-get install -y --no-install-recommends ffmpeg
RUN apt-get update && apt-get install -y --no-install-recommends \
${NV_CUDNN_PACKAGE} \
unzip \
&& apt-mark hold ${NV_CUDNN_PACKAGE_NAME} \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update -y \
&& apt-get install -y python3-pip
RUN apt-get -y update && apt-get -y upgrade
RUN apt-get install -y --no-install-recommends ffmpeg
RUN apt-get install -y python3-pip
RUN echo 'alias python=python3' >> ~/.bashrc
RUN echo 'NCCL_SOCKET_IFNAME=lo' >> ~/.bashrc

Expand Down
126 changes: 3 additions & 123 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,139 +1,19 @@
aiohttp==3.8.4
aiosignal==1.3.1
antlr4-python3-runtime==4.9.3
appdirs==1.4.4
asttokens
async-timeout==4.0.2
attrs==23.1.0
audioread==3.0.0
backcall
certifi==2023.5.7
cffi==1.15.1
charset-normalizer==3.1.0
cmake==3.26.4
comm
contourpy
cycler
Cython==0.29.35
debugpy
decorator
diffq==0.2.4
einops==0.6.1
executing
fast-bss-eval==0.1.4
ffmpeg-python==0.2.0
filelock==3.12.0
fonttools==4.25.0
frozenlist==1.3.3
fsspec==2023.6.0
future==0.18.3
gdown
idna==3.4
ipykernel
ipython
jedi
Jinja2==3.1.2
joblib==1.3.1
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
gdown==4.6.3
julius==0.2.7
jupyter_client
jupyter_core
kiwisolver
lameenc==1.4.2
lazy_loader==0.3
librosa==0.10.0.post2
lightning-utilities==0.8.0
lit==16.0.5.post0
llvmlite==0.40.1
lpips==0.1.4
MarkupSafe==2.1.3
matplotlib
matplotlib-inline
mir-eval==0.7
mkl-fft==1.3.6
mkl-random
mkl-service==2.4.0
mpmath==1.3.0
msgpack==1.0.5
multidict==6.0.4
munkres==1.1.4
musdb==0.4.0
museval==0.4.1
nest-asyncio
networkx==3.1
numba==0.57.1
numpy #==1.24.4
nobuco
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
omegaconf==2.3.0
openunmix==1.2.1
packaging
pandas==2.1.0
parso
pexpect
pickleshare
Pillow==9.5.0
platformdirs
ply==3.11
pooch==1.6.0
primePy==1.3
prompt-toolkit
psutil
ptyprocess
pure-eval
pyaml==23.5.9
pycparser==2.21
pyee==10.0.1
Pygments
pyparsing
PyQt5-sip==12.11.0
PySoundFile==0.9.0.post1
python-dateutil
python-ffmpeg==2.0.4
pytorch-lightning==2.0.3
pytz==2023.3
PyYAML==6.0
pyzmq
referencing==0.30.2
requests==2.31.0
rpds-py==0.10.0
scikit-learn==1.3.0
scipy==1.10.1
simplejson==3.19.1
sip
six
soundfile==0.12.1
sox==1.4.1
soxr==0.3.5
stack-data
stempeg==0.2.3
sympy==1.12
tensorflow>=2.13.0 #.*
threadpoolctl==3.1.0
toml
tensorflow>=2.13.0
torch==2.0.1
torch-audiomentations==0.11.0
torch-pitch-shift==1.2.4
torchaudio==2.0.2
torchmetrics==0.11.4
torchvision==0.15.2
tornado
pytorch-lightning==2.0.3
tqdm==4.65.0
traitlets
triton==2.0.0
typing_extensions>=4.6.1
tzdata==2023.3
urllib3==2.0.3
wcwidth
yarl==1.9.2
2 changes: 1 addition & 1 deletion run_docker.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

app=$PWD
app=$(pwd)

docker run --name pmunet -it --rm \
--net=host --ipc=host \
Expand Down
162 changes: 83 additions & 79 deletions separator/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,95 +3,99 @@
from typing import Union


from dataclasses import dataclass
from pathlib import Path
from typing import Union


@dataclass
class TrainConfig:
device: str = "cuda"

# datasets
musdb_path: str = "musdb18hq"
metadata_train_path: str = "metadata"
metadata_test_path: str = "metadata1"
segment: int = 5

# dataloaders
batch_size: int = 6
shuffle_train: bool = True
shuffle_valid: bool = False
drop_last: bool = True
num_workers: int = 2
# DATA OPTIONS
musdb_path : str = "musdb18hq" # Directory path where the MUSDB18-HQ dataset is stored.
metadata_train_path : str = "metadata" # Directory path for saving training metadata, like track names and lengths.
metadata_test_path : str = "metadata1" # Directory path for saving testing metadata.
segment : int = 5 # Length (in seconds) of each audio segment used during training.

# checkpoint_callback
metric_monitor_mode: str = "min"
save_top_k_model_weights: int = 1
# MODEL OPTIONS
model_source : tuple = ("drums", "bass", "other", "vocals") # Sources to target in source separation.
model_depth : int = 4 # The depth of the U-Net architecture.
model_channel : int = 28 # Number of initial channels in U-Net layers.
is_mono : bool = False # Indicates whether the input audio should be treated as mono (True) or stereo (False).
mask_mode : bool = False # Whether to utilize masking within the model.
skip_mode : str = "concat" # Mode of skip connections in U-Net ('concat' for concatenation, 'add' for summation).
nfft : int = 4096 # Number of bins used in STFT.
bottlneck_lstm : bool = True # Determines whether to use LSTM layers as bottleneck in the U-Net architecture.
layers : int = 2 # Number of LSTM layers if bottleneck.
stft_flag : bool = True # Flag that decides whether to apply the STFT (required for TFLite).

# PM_Unet model
model_source: tuple = ("drums", "bass", "other", "vocals")
model_depth: int = 4
model_channel: int = 28
is_mono: bool = False
mask_mode: bool = False
skip_mode: str = "concat"
nfft: int = 4096
bottlneck_lstm: bool = True
layers: int = 2
stft_flag: bool = True
# augments
shift: int = 8192
pitchshift_proba: float = 0.2
vocals_min_semitones: int = -5
vocals_max_semitones: int = 5
other_min_semitones: int = -2
other_max_semitones: int = 2
pitchshift_flag_other: bool = False
time_change_proba: float = 0.2
time_change_factors: tuple = (0.8, 0.85, 0.9, 0.95, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3)
remix_proba: float = 1
remix_group_size: int = batch_size
scale_proba: float = 1
scale_min: float = 0.25
scale_max: float = 1.25
fade_mask_proba: float = 0.1
double_proba: float = 0.1
reverse_proba: float = 0.2
mushap_proba: float = 0.0
mushap_depth: int = 2
# TRAIN OPTIONS
device : str = "cuda" # The computing platform for training: 'cuda' for NVIDIA GPUs or 'cpu'.
batch_size : int = 6 # Batch size for training.
shuffle_train : bool = True # Whether to shuffle the training dataset.
shuffle_valid : bool = False # Whether to shuffle the valid dataset.
drop_last : bool = True # Whether to drop the last incomplete batch in train data.
num_workers : int = 2 # Number of worker processes used for loading data.
metric_monitor_mode : str = "min" # Strategy for monitoring metrics to save model checkpoints.
save_top_k_model_weights : int = 1 # Number of best-performing model weights to save based on the monitored metric.

factor : int = 1 # Factors for different components of the loss function.
c_factor : int = 1

# loss if there are artifacts while listening, then increase this params
factor: int = 1
c_factor: int = 1
loss_nfft: tuple = (4096,)
gamma: float = 0.3
# lr
lr: float = 0.5 * 3e-3
T_0: int = 40
loss_nfft : tuple = (4096,) # Number of FFT bins for calculating loss.
gamma : float = 0.3 # Gamma parameter for adjusting the focus of the loss on certain aspects of the audio spectrum.
lr : float = 0.5 * 3e-3 # Learning rate for the optimizer.
T_0 : int = 40 # Period of the cosine annealing schedule in learning rate adjustment.
max_epochs : int = 100 # Maximum number of training epochs.
precision : str = 16 # Precision of training computations.
grad_clip : float = 0.5 # Gradient clipping value.

# lightning
max_epochs: int = 100
precision: str = 16 # "bf16-mixed"
grad_clip: float = 0.5
# AUGMENTATION OPTIONS
proba_shift : float = 0.5 # Probability of applying the shift.
shift : int = 8192 # Maximum number of samples for the shift.
proba_flip_channel : float = 1 # Probability of applying the flip left-right channels.
proba_flip_sign : float = 1 # Probability of applying the sign flip.
pitchshift_proba : float = 0.2 # Probability of applying pitch shift.
vocals_min_semitones : int = -5 # The lower limit of vocal semitones.
vocals_max_semitones : int = 5 # The upper limit of vocal semitones.
other_min_semitones : int = -2 # The lower limit of non-vocal semitones.
other_max_semitones : int = 2 # The upper limit of non-vocal semitones.
pitchshift_flag_other : bool = False # Flag to enable pitch shift augmentation on non-vocal sources.
time_change_proba : float = 0.2 # Probability of applying time stretching.
time_change_factors : tuple = (0.8, 0.85, 0.9, 0.95, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3) # Factors for time stretching/compression, defining the range and intensity of this augmentation.
remix_proba : float = 1 # Probability of remixing audio tracks.
remix_group_size : int = batch_size # Size of groups within which shuffling occurs.
scale_proba : float = 1 # Probability of applying the scaling.
scale_min : float = 0.25 # Minimum scaling factor.
scale_max : float = 1.25 # Maximum scaling factor.
fade_mask_proba : float = 0.1 # Probability of applying a fade effect.
double_proba : float = 0.1 # Probability of doubling one channel's audio to both channels.
reverse_proba : float = 0.2 # Probability of reversing a segment of the audio track.
mushap_proba : float = 0.0 # Probability of creating mashups.
mushap_depth : int = 2 # Number of tracks to mix.


@dataclass
class InferenceConfig:
GDRIVE_PREFIX = "https://drive.google.com/uc?id="

device: str = "cpu"

# weights
weights_dir: Path = Path("/app/separator/inference/weights")
gdrive_weights_LSTM: str = f"{GDRIVE_PREFIX}18jT2TYffdRD1fL7wecAiM5nJPM_OKpNB"
gdrive_weights_conv: str = f"{GDRIVE_PREFIX}1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7"
GDRIVE_PREFIX = "https://drive.google.com/uc?id=" # Google Drive URL

# inference instance
segment: int = 7
overlap: float = 0.2
offset: Union[int, None] = None
duration: Union[int, None] = None
# MODEL OPTIONS
weights_dir : Path = Path("/app/separator/inference/weights") # Directory where weights are saved.
weights_LSTM_filename : str = "weight_LSTM.pt" # File name of the model weights with LSTM.
weights_conv_filename : str = "weight_conv.pt" # File name of the model weights without LSTM.
gdrive_weights_LSTM : str = f"{GDRIVE_PREFIX}1uhAVMvW3x-KL2T2-VkjKjn9K7dTJnoyo" # Google Drive URL that directs weights LSTM
gdrive_weights_conv : str = f"{GDRIVE_PREFIX}1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7" # Google Drive URL that directs weights without_LSTM
device : str = "cpu" # The computing platform for inference

# inference
sample_rate: int = 44100
num_channels: int = 2
default_result_dir: str = "/app/separator/inference/output"
default_input_dir: str = "/app/separator/inference/input"
# adele
gdrive_mix: str = f"{GDRIVE_PREFIX}1zJpyW1fYxHKXDcDH9s5DiBCYiRpraDB3"
# INFERENCE OPTIONS
segment : int = 7 # Length (in seconds) of each audio segment used during inference.
overlap : float = 0.2 # overlapping segments at the beginning of the track and at the end
offset : Union[int, None] = None # start (in seconds) of segment to split
duration : Union[int, None] = None # duration (in seconds) of segment to split, use with `offset`
sample_rate : int = 44100 # sample rate track
num_channels : int = 2 # Number of channels in the audio track
default_result_dir : str = "/app/separator/inference/output" # Default directory for output tracks.
default_input_dir : str = "/app/separator/inference/input" # Default directory for the input track.

# TEST TRACK
gdrive_mix : str = f"{GDRIVE_PREFIX}1zJpyW1fYxHKXDcDH9s5DiBCYiRpraDB3" # Google Drive URL that directs test track
Loading

0 comments on commit d52dd15

Please sign in to comment.