From 7bc0364771738f42592fa1daee22f3f76fcbd0fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 9 Feb 2021 13:32:38 -0500 Subject: [PATCH 1/2] Sync snowfall with Lhotse's dataset/sampler refactoring PR --- egs/aishell/asr/simple_v1/ctc_decode.py | 20 ++--- egs/aishell/asr/simple_v1/ctc_train.py | 71 ++++++++++-------- .../asr/simple_v1/mmi_bigram_decode.py | 28 +++---- egs/aishell/asr/simple_v1/mmi_bigram_train.py | 73 ++++++++++-------- egs/librispeech/asr/simple_v1/ctc_decode.py | 21 +++--- egs/librispeech/asr/simple_v1/ctc_train.py | 72 ++++++++++-------- .../asr/simple_v1/mmi_bigram_decode.py | 29 +++----- .../asr/simple_v1/mmi_bigram_train.py | 74 +++++++++++-------- .../asr/simple_v1/mmi_mbr_decode.py | 29 +++----- .../asr/simple_v1/mmi_mbr_train.py | 73 ++++++++++-------- 10 files changed, 260 insertions(+), 230 deletions(-) diff --git a/egs/aishell/asr/simple_v1/ctc_decode.py b/egs/aishell/asr/simple_v1/ctc_decode.py index 46c7be7e..18c21e8a 100755 --- a/egs/aishell/asr/simple_v1/ctc_decode.py +++ b/egs/aishell/asr/simple_v1/ctc_decode.py @@ -3,26 +3,24 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging import os +import torch +from k2 import Fsa, SymbolTable +from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Optional from typing import Union -import k2 -import torch -from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset - +from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import get_phone_symbols from snowfall.common import load_checkpoint from snowfall.common import setup_logger from snowfall.decoding.graph import compile_LG from snowfall.models import AcousticModel -from snowfall.models.tdnn import Tdnn1a from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import build_ctc_topo @@ -168,12 +166,10 @@ def main(): cuts_test = CutSet.from_json(feature_dir / 'cuts_test.json.gz') print("About to create test dataset") - test = K2SpeechRecognitionIterableDataset(cuts_test, - max_frames=100000, - shuffle=False, - concat_cuts=False) + test = K2SpeechRecognitionDataset(cuts_test) + sampler = SingleCutSampler(cuts_test, max_frames=100000) print("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1) + test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) # if not torch.cuda.is_available(): # logging.error('No GPU detected!') diff --git a/egs/aishell/asr/simple_v1/ctc_train.py b/egs/aishell/asr/simple_v1/ctc_train.py index 392f7dcb..68ef1614 100755 --- a/egs/aishell/asr/simple_v1/ctc_train.py +++ b/egs/aishell/asr/simple_v1/ctc_train.py @@ -3,31 +3,28 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging import math +import numpy as np import os import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Optional, Tuple - -import k2 -import numpy as np import torch import torch.optim as optim - -from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset -from lhotse.utils import fix_random_seed - +from datetime import datetime +from pathlib import Path from torch import nn from torch.nn.utils import clip_grad_value_ from torch.utils.tensorboard import SummaryWriter +from typing import Dict, Optional, Tuple -from snowfall.common import save_checkpoint, load_checkpoint +from lhotse import CutSet +from lhotse.dataset import CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler +from lhotse.utils import fix_random_seed +from snowfall.common import get_phone_symbols +from snowfall.common import load_checkpoint, save_checkpoint from snowfall.common import save_training_info from snowfall.common import setup_logger -from snowfall.common import get_phone_symbols from snowfall.models import AcousticModel from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import CtcTrainingGraphCompiler @@ -275,25 +272,39 @@ def main(): cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz') logging.info("About to create train dataset") - train = K2SpeechRecognitionIterableDataset(cuts_train, - max_frames=90000, - shuffle=True, - aug_cuts=cuts_musan, - aug_prob=0.5, - aug_snr=(10, 20)) - logging.info("About to create dev dataset") - validate = K2SpeechRecognitionIterableDataset(cuts_dev, - max_frames=90000, - shuffle=False, - concat_cuts=False) + train = K2SpeechRecognitionDataset( + cuts_train, + cut_transforms=[ + CutConcatenate(), + CutMix( + cuts=cuts_musan, + prob=0.5, + snr=(10, 20) + ) + ] + ) + train_sampler = SingleCutSampler( + cuts_train, + max_frames=90000, + shuffle=True, + ) logging.info("About to create train dataloader") - train_dl = torch.utils.data.DataLoader(train, - batch_size=None, - num_workers=4) + train_dl = torch.utils.data.DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=4 + ) + logging.info("About to create dev dataset") + validate = K2SpeechRecognitionDataset(cuts_dev) + valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000) logging.info("About to create dev dataloader") - valid_dl = torch.utils.data.DataLoader(validate, - batch_size=None, - num_workers=1) + valid_dl = torch.utils.data.DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=1 + ) if not torch.cuda.is_available(): logging.error('No GPU detected!') diff --git a/egs/aishell/asr/simple_v1/mmi_bigram_decode.py b/egs/aishell/asr/simple_v1/mmi_bigram_decode.py index 70fe73e9..41c71819 100755 --- a/egs/aishell/asr/simple_v1/mmi_bigram_decode.py +++ b/egs/aishell/asr/simple_v1/mmi_bigram_decode.py @@ -3,30 +3,28 @@ # 2021 Pingfeng Luo # Apache 2.0 +import k2 import logging +import numpy as np import os +import torch +from k2 import Fsa, SymbolTable +from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Optional from typing import Union -import k2 -import numpy as np -import torch -from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset - +from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import load_checkpoint from snowfall.common import setup_logger from snowfall.decoding.graph import compile_LG from snowfall.models import AcousticModel -from snowfall.models.tdnn import Tdnn1a from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import build_ctc_topo -from snowfall.training.mmi_graph import get_phone_symbols from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, @@ -256,13 +254,11 @@ def main(): logging.debug("About to get test cuts") cuts_test = CutSet.from_json(feature_dir / 'cuts_test.json.gz') - logging.debug("About to create test dataset") - test = K2SpeechRecognitionIterableDataset(cuts_test, - max_frames=100000, - shuffle=False, - concat_cuts=False) - logging.debug("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1) + logging.info("About to create test dataset") + test = K2SpeechRecognitionDataset(cuts_test) + sampler = SingleCutSampler(cuts_test, max_frames=100000) + logging.info("About to create test dataloader") + test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) # if not torch.cuda.is_available(): # logging.error('No GPU detected!') diff --git a/egs/aishell/asr/simple_v1/mmi_bigram_train.py b/egs/aishell/asr/simple_v1/mmi_bigram_train.py index 3e1a4303..0b4b3dbc 100755 --- a/egs/aishell/asr/simple_v1/mmi_bigram_train.py +++ b/egs/aishell/asr/simple_v1/mmi_bigram_train.py @@ -3,35 +3,32 @@ # 2021 Pingfeng Luo # Apache 2.0 +import k2 import logging import math +import numpy as np import os import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Optional, Tuple - -import k2 -import numpy as np import torch import torch.optim as optim - -from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset -from lhotse.utils import fix_random_seed - +from datetime import datetime +from pathlib import Path from torch import nn from torch.nn.utils import clip_grad_value_ from torch.utils.tensorboard import SummaryWriter +from typing import Dict, Optional, Tuple -from snowfall.common import save_checkpoint, load_checkpoint +from lhotse import CutSet +from lhotse.dataset import CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler +from lhotse.utils import fix_random_seed +from snowfall.common import load_checkpoint, save_checkpoint from snowfall.common import save_training_info from snowfall.common import setup_logger from snowfall.models import AcousticModel from snowfall.models.tdnn_lstm import TdnnLstm1b -from snowfall.training.mmi_graph import get_phone_symbols -from snowfall.training.mmi_graph import create_bigram_phone_lm from snowfall.training.mmi_graph import MmiTrainingGraphCompiler +from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols den_scale = 1.0 @@ -303,25 +300,39 @@ def main(): cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz') logging.info("About to create train dataset") - train = K2SpeechRecognitionIterableDataset(cuts_train, - max_frames=30000, - shuffle=True, - aug_cuts=cuts_musan, - aug_prob=0.5, - aug_snr=(10, 20)) - logging.info("About to create dev dataset") - validate = K2SpeechRecognitionIterableDataset(cuts_dev, - max_frames=30000, - shuffle=False, - concat_cuts=False) + train = K2SpeechRecognitionDataset( + cuts_train, + cut_transforms=[ + CutConcatenate(), + CutMix( + cuts=cuts_musan, + prob=0.5, + snr=(10, 20) + ) + ] + ) + train_sampler = SingleCutSampler( + cuts_train, + max_frames=30000, + shuffle=True, + ) logging.info("About to create train dataloader") - train_dl = torch.utils.data.DataLoader(train, - batch_size=None, - num_workers=2) + train_dl = torch.utils.data.DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=4 + ) + logging.info("About to create dev dataset") + validate = K2SpeechRecognitionDataset(cuts_dev) + valid_sampler = SingleCutSampler(cuts_dev, max_frames=30000) logging.info("About to create dev dataloader") - valid_dl = torch.utils.data.DataLoader(validate, - batch_size=None, - num_workers=1) + valid_dl = torch.utils.data.DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=1 + ) if not torch.cuda.is_available(): logging.error('No GPU detected!') diff --git a/egs/librispeech/asr/simple_v1/ctc_decode.py b/egs/librispeech/asr/simple_v1/ctc_decode.py index 1eae5ead..bf17929e 100755 --- a/egs/librispeech/asr/simple_v1/ctc_decode.py +++ b/egs/librispeech/asr/simple_v1/ctc_decode.py @@ -3,26 +3,25 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging import os +import torch +from k2 import Fsa, SymbolTable +from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Optional from typing import Union -import k2 -import torch -from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset - +from lhotse.dataset import K2SpeechRecognitionDataset +from lhotse.dataset import SingleCutSampler from snowfall.common import get_phone_symbols from snowfall.common import load_checkpoint from snowfall.common import setup_logger from snowfall.decoding.graph import compile_LG from snowfall.models import AcousticModel -from snowfall.models.tdnn import Tdnn1a from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import build_ctc_topo @@ -168,12 +167,10 @@ def main(): cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz') print("About to create test dataset") - test = K2SpeechRecognitionIterableDataset(cuts_test, - max_frames=100000, - shuffle=False, - concat_cuts=False) + test = K2SpeechRecognitionDataset(cuts_test) + sampler = SingleCutSampler(cuts_test, max_frames=100000) print("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1) + test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) # if not torch.cuda.is_available(): # logging.error('No GPU detected!') diff --git a/egs/librispeech/asr/simple_v1/ctc_train.py b/egs/librispeech/asr/simple_v1/ctc_train.py index d0b20e57..2725b8d8 100755 --- a/egs/librispeech/asr/simple_v1/ctc_train.py +++ b/egs/librispeech/asr/simple_v1/ctc_train.py @@ -3,34 +3,30 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging import math +import numpy as np import os import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Optional, Tuple - -import k2 -import numpy as np import torch import torch.optim as optim - -from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset -from lhotse.utils import fix_random_seed - +from datetime import datetime +from pathlib import Path from torch import nn from torch.nn.utils import clip_grad_value_ from torch.utils.tensorboard import SummaryWriter +from typing import Dict, Optional, Tuple -from snowfall.common import save_checkpoint, load_checkpoint +from lhotse import CutSet +from lhotse.dataset import CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler +from lhotse.utils import fix_random_seed +from snowfall.common import get_phone_symbols +from snowfall.common import load_checkpoint, save_checkpoint from snowfall.common import save_training_info from snowfall.common import setup_logger -from snowfall.common import get_phone_symbols from snowfall.models import AcousticModel from snowfall.models.tdnn_lstm import TdnnLstm1b -from snowfall.models.tdnnf import Tdnnf1a from snowfall.training.ctc_graph import CtcTrainingGraphCompiler @@ -276,25 +272,39 @@ def main(): cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz') logging.info("About to create train dataset") - train = K2SpeechRecognitionIterableDataset(cuts_train, - max_frames=90000, - shuffle=True, - aug_cuts=cuts_musan, - aug_prob=0.5, - aug_snr=(10, 20)) - logging.info("About to create dev dataset") - validate = K2SpeechRecognitionIterableDataset(cuts_dev, - max_frames=90000, - shuffle=False, - concat_cuts=False) + train = K2SpeechRecognitionDataset( + cuts_train, + cut_transforms=[ + CutConcatenate(), + CutMix( + cuts=cuts_musan, + prob=0.5, + snr=(10, 20) + ) + ] + ) + train_sampler = SingleCutSampler( + cuts_train, + max_frames=90000, + shuffle=True, + ) logging.info("About to create train dataloader") - train_dl = torch.utils.data.DataLoader(train, - batch_size=None, - num_workers=4) + train_dl = torch.utils.data.DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=4 + ) + logging.info("About to create dev dataset") + validate = K2SpeechRecognitionDataset(cuts_dev) + valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000) logging.info("About to create dev dataloader") - valid_dl = torch.utils.data.DataLoader(validate, - batch_size=None, - num_workers=1) + valid_dl = torch.utils.data.DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=1 + ) if not torch.cuda.is_available(): logging.error('No GPU detected!') diff --git a/egs/librispeech/asr/simple_v1/mmi_bigram_decode.py b/egs/librispeech/asr/simple_v1/mmi_bigram_decode.py index 59aba23c..2f99a5a2 100755 --- a/egs/librispeech/asr/simple_v1/mmi_bigram_decode.py +++ b/egs/librispeech/asr/simple_v1/mmi_bigram_decode.py @@ -3,30 +3,28 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging +import numpy as np import os +import torch +from k2 import Fsa, SymbolTable +from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Optional from typing import Union -import k2 -import numpy as np -import torch -from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset - +from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import load_checkpoint from snowfall.common import setup_logger from snowfall.decoding.graph import compile_LG from snowfall.models import AcousticModel -from snowfall.models.tdnnf import Tdnnf1a from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import build_ctc_topo -from snowfall.training.mmi_graph import get_phone_symbols from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, @@ -255,19 +253,16 @@ def main(): logging.debug("About to get test cuts") cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz') - logging.debug("About to create test dataset") - test = K2SpeechRecognitionIterableDataset(cuts_test, - max_frames=100000, - shuffle=False, - concat_cuts=False) - logging.debug("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1) + logging.info("About to create test dataset") + test = K2SpeechRecognitionDataset(cuts_test) + sampler = SingleCutSampler(cuts_test, max_frames=100000) + logging.info("About to create test dataloader") + test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) # if not torch.cuda.is_available(): # logging.error('No GPU detected!') # sys.exit(-1) - logging.debug("convert LG to device") LG = LG.to(device) LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0) diff --git a/egs/librispeech/asr/simple_v1/mmi_bigram_train.py b/egs/librispeech/asr/simple_v1/mmi_bigram_train.py index 6bdf79ea..99f5ee47 100755 --- a/egs/librispeech/asr/simple_v1/mmi_bigram_train.py +++ b/egs/librispeech/asr/simple_v1/mmi_bigram_train.py @@ -4,37 +4,33 @@ # Fangjun Kuang) # Apache 2.0 +import k2 import logging import math +import numpy as np import os import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Optional, Tuple - -import k2 -import numpy as np import torch import torch.optim as optim - -from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset -from lhotse.utils import fix_random_seed - +from datetime import datetime +from pathlib import Path from torch import nn from torch.nn.utils import clip_grad_value_ from torch.utils.tensorboard import SummaryWriter +from typing import Dict, Optional, Tuple -from snowfall.common import save_checkpoint, load_checkpoint +from lhotse import CutSet +from lhotse.dataset import CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler +from lhotse.utils import fix_random_seed +from snowfall.common import load_checkpoint, save_checkpoint from snowfall.common import save_training_info from snowfall.common import setup_logger from snowfall.models import AcousticModel from snowfall.models.tdnn_lstm import TdnnLstm1b -from snowfall.models.tdnnf import Tdnnf1a from snowfall.training.diagnostics import measure_gradient_norms, optim_step_and_measure_param_change -from snowfall.training.mmi_graph import get_phone_symbols -from snowfall.training.mmi_graph import create_bigram_phone_lm from snowfall.training.mmi_graph import MmiTrainingGraphCompiler +from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols den_scale = 1.0 @@ -337,25 +333,39 @@ def main(): cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz') logging.info("About to create train dataset") - train = K2SpeechRecognitionIterableDataset(cuts_train, - max_frames=60000, - shuffle=True, - aug_cuts=cuts_musan, - aug_prob=0.5, - aug_snr=(10, 20)) - logging.info("About to create dev dataset") - validate = K2SpeechRecognitionIterableDataset(cuts_dev, - max_frames=60000, - shuffle=False, - concat_cuts=False) + train = K2SpeechRecognitionDataset( + cuts_train, + cut_transforms=[ + CutConcatenate(), + CutMix( + cuts=cuts_musan, + prob=0.5, + snr=(10, 20) + ) + ] + ) + train_sampler = SingleCutSampler( + cuts_train, + max_frames=90000, + shuffle=True, + ) logging.info("About to create train dataloader") - train_dl = torch.utils.data.DataLoader(train, - batch_size=None, - num_workers=4) + train_dl = torch.utils.data.DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=4 + ) + logging.info("About to create dev dataset") + validate = K2SpeechRecognitionDataset(cuts_dev) + valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000) logging.info("About to create dev dataloader") - valid_dl = torch.utils.data.DataLoader(validate, - batch_size=None, - num_workers=1) + valid_dl = torch.utils.data.DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=1 + ) if not torch.cuda.is_available(): logging.error('No GPU detected!') diff --git a/egs/librispeech/asr/simple_v1/mmi_mbr_decode.py b/egs/librispeech/asr/simple_v1/mmi_mbr_decode.py index b601074f..cd926408 100755 --- a/egs/librispeech/asr/simple_v1/mmi_mbr_decode.py +++ b/egs/librispeech/asr/simple_v1/mmi_mbr_decode.py @@ -3,30 +3,28 @@ # Copyright (c) 2020 Xiaomi Corporation (authors: Daniel Povey, Haowen Qiu) # Apache 2.0 +import k2 import logging +import numpy as np import os +import torch +from k2 import Fsa, SymbolTable +from kaldialign import edit_distance from pathlib import Path from typing import List from typing import Optional from typing import Union -import k2 -import numpy as np -import torch -from k2 import Fsa, SymbolTable -from kaldialign import edit_distance from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset - +from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from snowfall.common import load_checkpoint from snowfall.common import setup_logger from snowfall.decoding.graph import compile_LG from snowfall.models import AcousticModel -from snowfall.models.tdnn import Tdnn1a from snowfall.models.tdnn_lstm import TdnnLstm1b from snowfall.training.ctc_graph import build_ctc_topo -from snowfall.training.mmi_graph import get_phone_symbols from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols def decode(dataloader: torch.utils.data.DataLoader, model: AcousticModel, @@ -255,19 +253,16 @@ def main(): logging.debug("About to get test cuts") cuts_test = CutSet.from_json(feature_dir / 'cuts_test-clean.json.gz') - logging.debug("About to create test dataset") - test = K2SpeechRecognitionIterableDataset(cuts_test, - max_frames=100000, - shuffle=False, - concat_cuts=False) - logging.debug("About to create test dataloader") - test_dl = torch.utils.data.DataLoader(test, batch_size=None, num_workers=1) + logging.info("About to create test dataset") + test = K2SpeechRecognitionDataset(cuts_test) + sampler = SingleCutSampler(cuts_test, max_frames=100000) + logging.info("About to create test dataloader") + test_dl = torch.utils.data.DataLoader(test, batch_size=None, sampler=sampler, num_workers=1) # if not torch.cuda.is_available(): # logging.error('No GPU detected!') # sys.exit(-1) - logging.debug("convert LG to device") LG = LG.to(device) LG.aux_labels = k2.ragged.remove_values_eq(LG.aux_labels, 0) diff --git a/egs/librispeech/asr/simple_v1/mmi_mbr_train.py b/egs/librispeech/asr/simple_v1/mmi_mbr_train.py index 6f610220..44fa9575 100755 --- a/egs/librispeech/asr/simple_v1/mmi_mbr_train.py +++ b/egs/librispeech/asr/simple_v1/mmi_mbr_train.py @@ -4,35 +4,30 @@ # Fangjun Kuang) # Apache 2.0 -import logging -import math -import os -import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Optional, Tuple - import k2 import k2.sparse +import logging import numpy as np +import os import torch import torch.optim as optim - -from lhotse import CutSet -from lhotse.dataset.speech_recognition import K2SpeechRecognitionIterableDataset -from lhotse.utils import fix_random_seed - +from datetime import datetime +from pathlib import Path from torch import nn from torch.nn.utils import clip_grad_value_ from torch.utils.tensorboard import SummaryWriter +from typing import Dict, Optional, Tuple -from snowfall.common import save_checkpoint, load_checkpoint +from lhotse import CutSet +from lhotse.dataset import CutConcatenate, CutMix, K2SpeechRecognitionDataset, SingleCutSampler +from lhotse.utils import fix_random_seed +from snowfall.common import load_checkpoint, save_checkpoint from snowfall.common import save_training_info from snowfall.common import setup_logger from snowfall.models import AcousticModel from snowfall.models.tdnn_lstm import TdnnLstm1b -from snowfall.training.mmi_graph import get_phone_symbols from snowfall.training.mmi_graph import create_bigram_phone_lm +from snowfall.training.mmi_graph import get_phone_symbols from snowfall.training.mmi_mbr_graph import MmiMbrTrainingGraphCompiler den_scale = 1.0 @@ -462,25 +457,39 @@ def main(): cuts_musan = CutSet.from_json(feature_dir / 'cuts_musan.json.gz') logging.info("About to create train dataset") - train = K2SpeechRecognitionIterableDataset(cuts_train, - max_frames=30000, - shuffle=True, - aug_cuts=cuts_musan, - aug_prob=0.5, - aug_snr=(10, 20)) - logging.info("About to create dev dataset") - validate = K2SpeechRecognitionIterableDataset(cuts_dev, - max_frames=60000, - shuffle=False, - concat_cuts=False) + train = K2SpeechRecognitionDataset( + cuts_train, + cut_transforms=[ + CutConcatenate(), + CutMix( + cuts=cuts_musan, + prob=0.5, + snr=(10, 20) + ) + ] + ) + train_sampler = SingleCutSampler( + cuts_train, + max_frames=30000, + shuffle=True, + ) logging.info("About to create train dataloader") - train_dl = torch.utils.data.DataLoader(train, - batch_size=None, - num_workers=4) + train_dl = torch.utils.data.DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=4 + ) + logging.info("About to create dev dataset") + validate = K2SpeechRecognitionDataset(cuts_dev) + valid_sampler = SingleCutSampler(cuts_dev, max_frames=60000) logging.info("About to create dev dataloader") - valid_dl = torch.utils.data.DataLoader(validate, - batch_size=None, - num_workers=1) + valid_dl = torch.utils.data.DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=1 + ) logging.info("About to create model") model = TdnnLstm1b(num_features=40, From 817a34c1475e56d0d17d51bba6d300eb7cc7f46d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 9 Feb 2021 15:50:40 -0500 Subject: [PATCH 2/2] Add the "train_sampler.set_epoch()" call to have different sample order at each epoch --- egs/aishell/asr/simple_v1/ctc_train.py | 1 + egs/aishell/asr/simple_v1/mmi_bigram_train.py | 1 + egs/librispeech/asr/simple_v1/ctc_train.py | 1 + egs/librispeech/asr/simple_v1/mmi_bigram_train.py | 1 + egs/librispeech/asr/simple_v1/mmi_mbr_train.py | 1 + 5 files changed, 5 insertions(+) diff --git a/egs/aishell/asr/simple_v1/ctc_train.py b/egs/aishell/asr/simple_v1/ctc_train.py index 68ef1614..61635ff1 100755 --- a/egs/aishell/asr/simple_v1/ctc_train.py +++ b/egs/aishell/asr/simple_v1/ctc_train.py @@ -346,6 +346,7 @@ def main(): weight_decay=5e-4) for epoch in range(start_epoch, num_epochs): + train_sampler.set_epoch(epoch) curr_learning_rate = 1e-3 # curr_learning_rate = learning_rate * pow(0.4, epoch) # for param_group in optimizer.param_groups: diff --git a/egs/aishell/asr/simple_v1/mmi_bigram_train.py b/egs/aishell/asr/simple_v1/mmi_bigram_train.py index 0b4b3dbc..2c6744e1 100755 --- a/egs/aishell/asr/simple_v1/mmi_bigram_train.py +++ b/egs/aishell/asr/simple_v1/mmi_bigram_train.py @@ -375,6 +375,7 @@ def main(): curr_learning_rate = learning_rate for epoch in range(start_epoch, num_epochs): + train_sampler.set_epoch(epoch) # curr_learning_rate = learning_rate * pow(0.4, epoch) if epoch > 6: curr_learning_rate *= 0.8 diff --git a/egs/librispeech/asr/simple_v1/ctc_train.py b/egs/librispeech/asr/simple_v1/ctc_train.py index 2725b8d8..c490688f 100755 --- a/egs/librispeech/asr/simple_v1/ctc_train.py +++ b/egs/librispeech/asr/simple_v1/ctc_train.py @@ -346,6 +346,7 @@ def main(): weight_decay=5e-4) for epoch in range(start_epoch, num_epochs): + train_sampler.set_epoch(epoch) curr_learning_rate = 1e-3 # curr_learning_rate = learning_rate * pow(0.4, epoch) # for param_group in optimizer.param_groups: diff --git a/egs/librispeech/asr/simple_v1/mmi_bigram_train.py b/egs/librispeech/asr/simple_v1/mmi_bigram_train.py index 99f5ee47..8bff31e2 100755 --- a/egs/librispeech/asr/simple_v1/mmi_bigram_train.py +++ b/egs/librispeech/asr/simple_v1/mmi_bigram_train.py @@ -431,6 +431,7 @@ def main(): ) for epoch in range(start_epoch, num_epochs): + train_sampler.set_epoch(epoch) # LR scheduler can hold multiple learning rates for multiple parameter groups; # For now we report just the first LR which we assume concerns most of the parameters. curr_learning_rate = lr_scheduler.get_last_lr()[0] diff --git a/egs/librispeech/asr/simple_v1/mmi_mbr_train.py b/egs/librispeech/asr/simple_v1/mmi_mbr_train.py index 44fa9575..451eb78c 100755 --- a/egs/librispeech/asr/simple_v1/mmi_mbr_train.py +++ b/egs/librispeech/asr/simple_v1/mmi_mbr_train.py @@ -554,6 +554,7 @@ def main(): for epoch in range(start_epoch, num_epochs): # LR scheduler can hold multiple learning rates for multiple parameter groups; # For now we report just the first LR which we assume concerns most of the parameters. + train_sampler.set_epoch(epoch) curr_learning_rate = lr_scheduler.get_last_lr()[0] tb_writer.add_scalar('train/learning_rate', curr_learning_rate, global_batch_idx_train) tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)