From 4bdd5094c5f6b50435df58ee637dccf6ef9f5e09 Mon Sep 17 00:00:00 2001
From: Chaitanya Narisetty
Date: Sat, 5 Jun 2021 10:02:56 -0400
Subject: [PATCH 1/3] added conv2d1, conv2d2

---
 .../nets/pytorch_backend/conformer/encoder.py |  20 ++-
 .../pytorch_backend/transformer/argument.py   |   2 +-
 .../pytorch_backend/transformer/encoder.py    |  12 +-
 .../transformer/subsampling.py                | 118 ++++++++++++++++++
 4 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/espnet/nets/pytorch_backend/conformer/encoder.py b/espnet/nets/pytorch_backend/conformer/encoder.py
index 980d15a18b8..47ffcb96935 100644
--- a/espnet/nets/pytorch_backend/conformer/encoder.py
+++ b/espnet/nets/pytorch_backend/conformer/encoder.py
@@ -30,6 +30,8 @@
 )
 from espnet.nets.pytorch_backend.transformer.repeat import repeat
 from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling1
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling2


 class Encoder(torch.nn.Module):
@@ -112,6 +114,22 @@ def __init__(
                 torch.nn.Dropout(dropout_rate),
                 pos_enc_class(attention_dim, positional_dropout_rate),
             )
+        elif input_layer == "conv2d1":
+            self.embed = Conv2dSubsampling1(
+                idim,
+                attention_dim,
+                dropout_rate,
+                pos_enc_class(attention_dim, positional_dropout_rate),
+            )
+            self.conv_subsampling_factor = 1
+        elif input_layer == "conv2d2":
+            self.embed = Conv2dSubsampling2(
+                idim,
+                attention_dim,
+                dropout_rate,
+                pos_enc_class(attention_dim, positional_dropout_rate),
+            )
+            self.conv_subsampling_factor = 2
         elif input_layer == "conv2d":
             self.embed = Conv2dSubsampling(
                 idim,
@@ -231,7 +249,7 @@ def forward(self, xs, masks):
             torch.Tensor: Mask tensor (#batch, time).

         """
-        if isinstance(self.embed, (Conv2dSubsampling, VGG2L)):
+        if isinstance(self.embed, (Conv2dSubsampling1, Conv2dSubsampling2, Conv2dSubsampling, VGG2L)):
             xs, masks = self.embed(xs, masks)
         else:
             xs = self.embed(xs)
diff --git a/espnet/nets/pytorch_backend/transformer/argument.py b/espnet/nets/pytorch_backend/transformer/argument.py
index 216a68d90c3..3b8f0de0a76 100644
--- a/espnet/nets/pytorch_backend/transformer/argument.py
+++ b/espnet/nets/pytorch_backend/transformer/argument.py
@@ -26,7 +26,7 @@ def add_arguments_transformer_common(group):
         "--transformer-input-layer",
         type=str,
         default="conv2d",
-        choices=["conv2d", "linear", "embed"],
+        choices=["conv2d", "conv2d1", "conv2d2", "linear", "embed"],
         help="transformer input layer type",
     )
     group.add_argument(
diff --git a/espnet/nets/pytorch_backend/transformer/encoder.py b/espnet/nets/pytorch_backend/transformer/encoder.py
index 5b19ded7dde..a7dc3d3009d 100644
--- a/espnet/nets/pytorch_backend/transformer/encoder.py
+++ b/espnet/nets/pytorch_backend/transformer/encoder.py
@@ -23,6 +23,8 @@
 )
 from espnet.nets.pytorch_backend.transformer.repeat import repeat
 from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling1
+from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling2
 from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling6
 from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling8

@@ -103,6 +105,12 @@ def __init__(
             torch.nn.ReLU(),
             pos_enc_class(attention_dim, positional_dropout_rate),
         )
+        elif input_layer == "conv2d1":
+            self.embed = Conv2dSubsampling1(idim, attention_dim, dropout_rate)
+            self.conv_subsampling_factor = 1
+        elif input_layer == "conv2d2":
+            self.embed = Conv2dSubsampling2(idim, attention_dim, dropout_rate)
+            self.conv_subsampling_factor = 2
         elif input_layer == "conv2d":
             self.embed = Conv2dSubsampling(idim, attention_dim, dropout_rate)
             self.conv_subsampling_factor = 4
@@ -292,7 +300,7 @@ def forward(self, xs, masks):
         """
         if isinstance(
             self.embed,
-            (Conv2dSubsampling, Conv2dSubsampling6, Conv2dSubsampling8, VGG2L),
+            (Conv2dSubsampling1, Conv2dSubsampling2, Conv2dSubsampling, Conv2dSubsampling6, Conv2dSubsampling8, VGG2L),
         ):
             xs, masks = self.embed(xs, masks)
         else:
             xs = self.embed(xs)
@@ -316,7 +324,7 @@ def forward_one_step(self, xs, masks, cache=None):
             List[torch.Tensor]: List of new cache tensors.

         """
-        if isinstance(self.embed, Conv2dSubsampling):
+        if isinstance(self.embed, (Conv2dSubsampling1, Conv2dSubsampling2, Conv2dSubsampling)):
             xs, masks = self.embed(xs, masks)
         else:
             xs = self.embed(xs)
diff --git a/espnet/nets/pytorch_backend/transformer/subsampling.py b/espnet/nets/pytorch_backend/transformer/subsampling.py
index 1f5a736d3aa..6a61a4a25ea 100644
--- a/espnet/nets/pytorch_backend/transformer/subsampling.py
+++ b/espnet/nets/pytorch_backend/transformer/subsampling.py
@@ -39,6 +39,124 @@ def check_short_utt(ins, size):
     return False, -1


+class Conv2dSubsampling1(torch.nn.Module):
+    """Convolutional 2D subsampling (to same length).
+
+    Args:
+        idim (int): Input dimension.
+        odim (int): Output dimension.
+        dropout_rate (float): Dropout rate.
+        pos_enc (torch.nn.Module): Custom position encoding layer.
+
+    """
+
+    def __init__(self, idim, odim, dropout_rate, pos_enc=None):
+        """Construct a Conv2dSubsampling1 object."""
+        super(Conv2dSubsampling1, self).__init__()
+        self.conv = torch.nn.Sequential(
+            torch.nn.Conv2d(1, odim, 3, 1, 1),
+            torch.nn.ReLU(),
+            torch.nn.Conv2d(odim, odim, 3, 1, 1),
+            torch.nn.ReLU(),
+        )
+        self.out = torch.nn.Sequential(
+            torch.nn.Linear(odim * (((idim - 0) // 1 - 0) // 1), odim),
+            pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate),
+        )
+
+    def forward(self, x, x_mask):
+        """Subsample x.
+
+        Args:
+            x (torch.Tensor): Input tensor (#batch, time, idim).
+            x_mask (torch.Tensor): Input mask (#batch, 1, time).
+
+        Returns:
+            torch.Tensor: Subsampled tensor (#batch, time', odim),
+                where time' = time // 1.
+            torch.Tensor: Subsampled mask (#batch, 1, time'),
+                where time' = time // 1.
+
+        """
+        x = x.unsqueeze(1)  # (b, c, t, f)
+        x = self.conv(x)
+        b, c, t, f = x.size()
+        x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
+        if x_mask is None:
+            return x, None
+        return x, x_mask[:, :, ::1][:, :, ::1]
+
+    def __getitem__(self, key):
+        """Get item.
+
+        When reset_parameters() is called, if use_scaled_pos_enc is used,
+        return the positioning encoding.
+
+        """
+        if key != -1:
+            raise NotImplementedError("Support only `-1` (for `reset_parameters`).")
+        return self.out[key]
+
+
+class Conv2dSubsampling2(torch.nn.Module):
+    """Convolutional 2D subsampling (to 1/2 length).
+
+    Args:
+        idim (int): Input dimension.
+        odim (int): Output dimension.
+        dropout_rate (float): Dropout rate.
+        pos_enc (torch.nn.Module): Custom position encoding layer.
+ + """ + + def __init__(self, idim, odim, dropout_rate, pos_enc=None): + """Construct an Conv2dSubsampling object.""" + super(Conv2dSubsampling2, self).__init__() + self.conv = torch.nn.Sequential( + torch.nn.Conv2d(1, odim, 3, 2), + torch.nn.ReLU(), + torch.nn.Conv2d(odim, odim, 3, 1, 1), + torch.nn.ReLU(), + ) + self.out = torch.nn.Sequential( + torch.nn.Linear(odim * (((idim - 1) // 2 - 0) // 1), odim), + pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate), + ) + + def forward(self, x, x_mask): + """Subsample x. + + Args: + x (torch.Tensor): Input tensor (#batch, time, idim). + x_mask (torch.Tensor): Input mask (#batch, 1, time). + + Returns: + torch.Tensor: Subsampled tensor (#batch, time', odim), + where time' = time // 2. + torch.Tensor: Subsampled mask (#batch, 1, time'), + where time' = time // 2. + + """ + x = x.unsqueeze(1) # (b, c, t, f) + x = self.conv(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) + if x_mask is None: + return x, None + return x, x_mask[:, :, :-2:2][:, :, ::1] + + def __getitem__(self, key): + """Get item. + + When reset_parameters() is called, if use_scaled_pos_enc is used, + return the positioning encoding. + + """ + if key != -1: + raise NotImplementedError("Support only `-1` (for `reset_parameters`).") + return self.out[key] + + class Conv2dSubsampling(torch.nn.Module): """Convolutional 2D subsampling (to 1/4 length). From 3081b84f99abd0754b85b821d8723017d50eef98 Mon Sep 17 00:00:00 2001 From: Chaitanya Narisetty Date: Mon, 7 Jun 2021 07:32:24 -0400 Subject: [PATCH 2/3] ClothoV2 word tokens recipe --- egs/clotho/aac_word/README.md | 56 +++ egs/clotho/aac_word/cmd.sh | 89 +++++ egs/clotho/aac_word/conf/decode.yaml | 1 + egs/clotho/aac_word/conf/fbank.conf | 4 + egs/clotho/aac_word/conf/gpu.conf | 10 + egs/clotho/aac_word/conf/lm.yaml | 8 + egs/clotho/aac_word/conf/no_preprocess.yaml | 2 + egs/clotho/aac_word/conf/pitch.conf | 3 + egs/clotho/aac_word/conf/queue.conf | 10 + egs/clotho/aac_word/conf/slurm.conf | 14 + egs/clotho/aac_word/conf/specaug.yaml | 16 + egs/clotho/aac_word/conf/train.yaml | 1 + egs/clotho/aac_word/conf/tuning/decode.yaml | 7 + .../aac_word/conf/tuning/decode_rnn.yaml | 6 + .../conf/tuning/decode_transfromer.yaml | 8 + .../aac_word/conf/tuning/train_conformer.yaml | 49 +++ .../conf/tuning/train_conformer_large.yaml | 49 +++ .../conf/tuning/train_conformer_med.yaml | 49 +++ .../conf/tuning/train_conformer_small.yaml | 49 +++ .../aac_word/conf/tuning/train_rnn.yaml | 32 ++ .../conf/tuning/train_transformer.yaml | 40 +++ .../cs_train_conformer-rnn_transducer.yaml | 47 +++ .../cy_train_conformer-rnn_transducer.yaml | 47 +++ .../tuning/transducer/decode_default.yaml | 6 + .../aac_word/local/back_translate_data_dir.sh | 38 +++ .../aac_word/local/back_translate_text.py | 57 ++++ .../aac_word/local/data_prep_audiocaps.py | 56 +++ .../local/download_large_drive_file.sh | 10 + .../local/evaluate_decoded_captions.py | 63 ++++ egs/clotho/aac_word/path.sh | 25 ++ egs/clotho/aac_word/run.sh | 322 ++++++++++++++++++ egs/clotho/aac_word/steps | 1 + egs/clotho/aac_word/utils | 1 + utils/text2token.py | 16 +- 34 files changed, 1189 insertions(+), 3 deletions(-) create mode 100644 egs/clotho/aac_word/README.md create mode 100644 egs/clotho/aac_word/cmd.sh create mode 120000 egs/clotho/aac_word/conf/decode.yaml create mode 100644 egs/clotho/aac_word/conf/fbank.conf create mode 100644 egs/clotho/aac_word/conf/gpu.conf create mode 100644 egs/clotho/aac_word/conf/lm.yaml create mode 100644 
egs/clotho/aac_word/conf/no_preprocess.yaml create mode 100644 egs/clotho/aac_word/conf/pitch.conf create mode 100644 egs/clotho/aac_word/conf/queue.conf create mode 100644 egs/clotho/aac_word/conf/slurm.conf create mode 100644 egs/clotho/aac_word/conf/specaug.yaml create mode 120000 egs/clotho/aac_word/conf/train.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/decode.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/decode_rnn.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/decode_transfromer.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_conformer.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_conformer_large.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_conformer_med.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_conformer_small.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_rnn.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/train_transformer.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/transducer/cs_train_conformer-rnn_transducer.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/transducer/cy_train_conformer-rnn_transducer.yaml create mode 100644 egs/clotho/aac_word/conf/tuning/transducer/decode_default.yaml create mode 100755 egs/clotho/aac_word/local/back_translate_data_dir.sh create mode 100755 egs/clotho/aac_word/local/back_translate_text.py create mode 100644 egs/clotho/aac_word/local/data_prep_audiocaps.py create mode 100755 egs/clotho/aac_word/local/download_large_drive_file.sh create mode 100644 egs/clotho/aac_word/local/evaluate_decoded_captions.py create mode 100644 egs/clotho/aac_word/path.sh create mode 100755 egs/clotho/aac_word/run.sh create mode 120000 egs/clotho/aac_word/steps create mode 120000 egs/clotho/aac_word/utils diff --git a/egs/clotho/aac_word/README.md b/egs/clotho/aac_word/README.md new file mode 100644 index 00000000000..ac8372f20f6 --- /dev/null +++ b/egs/clotho/aac_word/README.md @@ -0,0 +1,56 @@ +# Clotho Recipe + +## Data preparation +* Data preparation during `stage 0` can be performed by appropriately setting the boolean variables in below command. By default, all variables are set to `false`. Description of each variable is also detailed below. + + ```bash + ./run.sh --stage 0 --stop_stage 0 \ + --download_clothov2 true \ + --download_audiocaps true \ + --augment_audiocaps true \ + --augment_speedperturbation false \ + --download_evalmetrics true + ``` + +#### Setting up Clotho-V2 dataset +* Download and prepare the *Clotho-V2* dataset using below command. This should prepare `data` and `clothov2_data` directories in the current recipe's root directory. The `data` directory should have `{dev,val,eval,recog_val,recog_eval}_clothov2` directories. The `clothov2_data` should have `clotho_{audio,csv}_files` directories. + + ```bash + ./run.sh --stage 0 --stop_stage 0 --download_clothov2 true + ``` +* Among the `data/{dev,val,eval,recog_val,recog_eval}_clothov2` directories, `dev_clothov2` is used for training, `val_clothov2` is used for validation, and `recog_{val,eval}_clothov2` are used for decoding captions. +* Since each audio sample in this dataset has 5 captions, the `wav.scp` and `text` files in `data/{dev,val,eval}_clothov2` directories contain 5 lines for each audio sample, mapping to its 5 captions. 
+* To avoid decoding the same audio sample 5 times during the decoding stage, the `wav.scp` and `text` files in the `data/recog_{val,eval}_clothov2` directories contain just one line per audio sample, mapping to its first caption. Additionally, a `groundtruth_captions.txt` file is created in each directory, which provides all 5 ground truth captions for each audio sample.
+* The `clothov2_data/clotho_audio_files` directory contains the audio samples from the development, validation, and evaluation sets, renamed to `{dev,val,eval}file_{file-ID}.wav`, respectively. A mapping from the renamed filenames to the original filenames can be found in `data/{dev,val,eval,recog_val,recog_eval}_clothov2/original_filenames.txt`.
+
+#### Setting up AudioCaps dataset
+* Download and prepare the *AudioCaps* dataset using the command below.
+
+  ```bash
+  ./run.sh --stage 0 --stop_stage 0 --download_audiocaps true
+  ```
+* To augment with the *AudioCaps* dataset during stages 1 to 5 (i.e. feature generation, dictionary and json creation, training, and decoding), please add `--augment_audiocaps true` when executing `./run.sh`.
+
+#### Performing speed perturbation augmentation
+* For speed-perturbation-based data augmentation, please add `--augment_speedperturbation true` during data preparation.
+
+#### Setting up COCO Evaluation Metrics
+* Download and set up the evaluation framework using the command below. **WARNING:** This performs `pip3 install scikit-image`.
+
+  ```bash
+  ./run.sh --stage 0 --stop_stage 0 --download_evalmetrics true
+  ```
+
+## Decoding
+#### Caption evaluation
+* By default, stage 5 decoding evaluates the decoded captions and saves a summary file `caption_evaluation_summary.txt` and a detailed results file `caption_evaluation_results.txt` to the experiment's decoding directory (e.g. `exp/dev_clothov2_pytorch_train_specaug/decode_recog_val_clothov2_decode_lm_last10/`).
+* Alternatively, the decoded captions can be evaluated by executing `local/evaluate_decoded_captions.py`. This script takes two inputs, `decoded_json_path` and `groundtruth_captions_path`, and writes a text file `caption_evaluation_results.txt` to the same directory as `decoded_json_path`. This output file tabulates the individual metric scores of each decoded audio sample. An example execution is provided below.
+
+  ```bash
+  python local/evaluate_decoded_captions.py \
+    exp/dev_clothov2_pytorch_train_specaug/decode_recog_val_clothov2_decode_lm_last10/data.json \
+    data/recog_eval_clothov2/groundtruth_captions.txt
+  ```
+
+#### Using best 10 validation epochs
+* By default, stage 5 decoding averages the model parameters saved during the last 10 training epochs. To instead average the model parameters from the 10 training epochs with the best validation scores, please add `--use_valbest_average true`.
diff --git a/egs/clotho/aac_word/cmd.sh b/egs/clotho/aac_word/cmd.sh
new file mode 100644
index 00000000000..4d70c9c7a79
--- /dev/null
+++ b/egs/clotho/aac_word/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time
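
For reference, below is a minimal usage sketch of the two subsampling front ends introduced in [PATCH 1/3]. It is not part of the patch itself; it assumes the patch has been applied to an ESPnet (v1) checkout, and the batch size, input length, feature dimension, `odim`, and `dropout_rate` are arbitrary illustrative values.

```python
import torch

# These imports exist only after [PATCH 1/3] is applied.
from espnet.nets.pytorch_backend.transformer.subsampling import (
    Conv2dSubsampling1,
    Conv2dSubsampling2,
)

x = torch.randn(2, 100, 80)                       # (#batch, time, idim) dummy features
x_mask = torch.ones(2, 1, 100, dtype=torch.bool)  # (#batch, 1, time)

sub1 = Conv2dSubsampling1(idim=80, odim=256, dropout_rate=0.1)  # "conv2d1": length preserved
sub2 = Conv2dSubsampling2(idim=80, odim=256, dropout_rate=0.1)  # "conv2d2": length roughly halved

y1, m1 = sub1(x, x_mask)
y2, m2 = sub2(x, x_mask)

print(y1.shape, m1.shape)  # torch.Size([2, 100, 256]) torch.Size([2, 1, 100])
print(y2.shape, m2.shape)  # torch.Size([2, 49, 256])  torch.Size([2, 1, 49])
```

In a recipe, the same front ends are selected through the extended `--transformer-input-layer` choices (`conv2d1`, `conv2d2`) added to `argument.py` in the first patch.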