diff --git a/alreadrun/test_bert_mnli.py b/alreadrun/test_bert_mnli.py index 3ea6e15..44722ea 100644 --- a/alreadrun/test_bert_mnli.py +++ b/alreadrun/test_bert_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_bert_quora.py b/alreadrun/test_bert_quora.py index 2d785b2..d6fc4c3 100644 --- a/alreadrun/test_bert_quora.py +++ b/alreadrun/test_bert_quora.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_two import validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_bert_snli.py b/alreadrun/test_bert_snli.py index 5650a08..60a5d2c 100644 --- a/alreadrun/test_bert_snli.py +++ b/alreadrun/test_bert_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_cifar10.py b/alreadrun/test_cifar10.py index 85c7a80..418aa9e 100644 --- a/alreadrun/test_cifar10.py +++ b/alreadrun/test_cifar10.py @@ -6,8 +6,8 @@ import os import torch import torch.nn.functional as F -from a3v.droped.resnet import PreActResNet18 -from a3v.droped.resnet_top import PreActResNet18Top +from vaa.droped.resnet import PreActResNet18 +from vaa.droped.resnet_top import PreActResNet18Top from torch.autograd import Variable import sys from utils.utils_base import creterion_cifar diff --git a/alreadrun/test_esim_quora.py b/alreadrun/test_esim_quora.py index 90aafc3..bb6d056 100644 --- a/alreadrun/test_esim_quora.py +++ b/alreadrun/test_esim_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_esim_quora import validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import os import argparse diff --git a/alreadrun/test_esim_snli.py b/alreadrun/test_esim_snli.py index cc587a0..86e07f5 100644 --- a/alreadrun/test_esim_snli.py +++ b/alreadrun/test_esim_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_esim_snli import validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import os import argparse diff --git a/alreadrun/top_cifar10.py b/alreadrun/top_cifar10.py index 4662fe3..0613490 100644 --- a/alreadrun/top_cifar10.py +++ b/alreadrun/top_cifar10.py @@ -5,8 +5,8 @@ import torchvision.transforms as transforms import os import torch -from a3v.droped.resnet import PreActResNet18 -from a3v.droped.resnet_top import PreActResNet18Top +from vaa.droped.resnet import PreActResNet18 +from vaa.droped.resnet_top import PreActResNet18Top from torch.autograd import Variable import sys diff --git a/bert_mnli.py b/bert_mnli.py index 789ed46..d8efb90 100644 --- a/bert_mnli.py +++ b/bert_mnli.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. 
from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_mnli_test.py b/bert_mnli_test.py index 6cd5cdc..b011cff 100644 --- a/bert_mnli_test.py +++ b/bert_mnli_test.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import test -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_quora.py b/bert_quora.py index 7ace86c..f1c4ec6 100644 --- a/bert_quora.py +++ b/bert_quora.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_quora_loss.py b/bert_quora_loss.py index 9ff85a6..e445d9a 100644 --- a/bert_quora_loss.py +++ b/bert_quora_loss.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train_loss -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_snli.py b/bert_snli.py index 038cc4f..f0d1abc 100644 --- a/bert_snli.py +++ b/bert_snli.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/esim_mnli.py b/esim_mnli.py index 6727b19..67fba88 100644 --- a/esim_mnli.py +++ b/esim_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_mnli_test.py b/esim_mnli_test.py index 9c6f224..a839168 100644 --- a/esim_mnli_test.py +++ b/esim_mnli_test.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import test -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_quora.py b/esim_quora.py index 8f4a416..bcc8804 100644 --- a/esim_quora.py +++ b/esim_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_snli.py b/esim_snli.py index 2adef59..593dfc7 100644 --- a/esim_snli.py +++ b/esim_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. 
from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/scripts/droped/cifar10.py b/scripts/droped/cifar10.py index 4830482..c6b5f1b 100644 --- a/scripts/droped/cifar10.py +++ b/scripts/droped/cifar10.py @@ -6,7 +6,7 @@ import torchvision.transforms as transforms import os import torch -from a3v.droped.resnet import PreActResNet18 +from vaa.droped.resnet import PreActResNet18 # Training def train(epoch): diff --git a/scripts/droped/top_quora_transformer.py b/scripts/droped/top_quora_transformer.py index e47751e..1a3fcf1 100644 --- a/scripts/droped/top_quora_transformer.py +++ b/scripts/droped/top_quora_transformer.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -from a3v.droped import TransformerESIM as ESIM -# from a3v.model_esim import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +from vaa.droped import TransformerESIM as ESIM +# from vaa.model_esim import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/scripts/droped/transformer_quora.py b/scripts/droped/transformer_quora.py index 079d23e..4e932f4 100644 --- a/scripts/droped/transformer_quora.py +++ b/scripts/droped/transformer_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.droped import TransformerESIM as ESIM -# from a3v.model_esim import ESIM +from vaa.droped import TransformerESIM as ESIM +# from vaa.model_esim import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/scripts/preprocessing/preprocess_mnli.py b/scripts/preprocessing/preprocess_mnli.py index d0e880b..07cf88f 100644 --- a/scripts/preprocessing/preprocess_mnli.py +++ b/scripts/preprocessing/preprocess_mnli.py @@ -10,7 +10,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_MNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_mnli_bert.py b/scripts/preprocessing/preprocess_mnli_bert.py index b4f3804..7c7c55a 100644 --- a/scripts/preprocessing/preprocess_mnli_bert.py +++ b/scripts/preprocessing/preprocess_mnli_bert.py @@ -10,7 +10,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_MNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_quora.py b/scripts/preprocessing/preprocess_quora.py index 3ec1147..f376ef3 100644 --- a/scripts/preprocessing/preprocess_quora.py +++ b/scripts/preprocessing/preprocess_quora.py @@ -7,7 +7,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_quora_data(inputdir, diff --git a/scripts/preprocessing/preprocess_quora_bert.py b/scripts/preprocessing/preprocess_quora_bert.py index 2d7b607..3a7f667 100644 --- a/scripts/preprocessing/preprocess_quora_bert.py +++ b/scripts/preprocessing/preprocess_quora_bert.py @@ -7,7 +7,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_quora_data(inputdir, diff --git a/scripts/preprocessing/preprocess_snli.py b/scripts/preprocessing/preprocess_snli.py index b103bc8..db77f26 100644 --- 
a/scripts/preprocessing/preprocess_snli.py +++ b/scripts/preprocessing/preprocess_snli.py @@ -9,7 +9,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_SNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_snli_bert.py b/scripts/preprocessing/preprocess_snli_bert.py index 805925f..389ec48 100644 --- a/scripts/preprocessing/preprocess_snli_bert.py +++ b/scripts/preprocessing/preprocess_snli_bert.py @@ -9,7 +9,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_SNLI_data(inputdir, diff --git a/scripts/testing/test_mnli.py b/scripts/testing/test_mnli.py index f67989f..ea23b69 100644 --- a/scripts/testing/test_mnli.py +++ b/scripts/testing/test_mnli.py @@ -10,8 +10,8 @@ import json from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import NLIDataset +from vaa.model import ESIM def predict(model, dataloader, labeldict): diff --git a/scripts/testing/test_quora.py b/scripts/testing/test_quora.py index 5cff3f6..c516f3f 100644 --- a/scripts/testing/test_quora.py +++ b/scripts/testing/test_quora.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM -from a3v.utils import correct_predictions +from vaa.data import NLIDataset +from vaa.model import ESIM +from vaa.utils import correct_predictions from sklearn import metrics diff --git a/scripts/testing/test_snli.py b/scripts/testing/test_snli.py index 395ff23..0510a82 100644 --- a/scripts/testing/test_snli.py +++ b/scripts/testing/test_snli.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM -from a3v.utils import correct_predictions +from vaa.data import NLIDataset +from vaa.model import ESIM +from vaa.utils import correct_predictions def test(model, dataloader): diff --git a/scripts/training/test_quora_elmo.py b/scripts/training/test_quora_elmo.py index d66d4b5..f05426c 100644 --- a/scripts/training/test_quora_elmo.py +++ b/scripts/training/test_quora_elmo.py @@ -10,10 +10,10 @@ import argparse import torch import numpy as np -from a3v.data import ElmoDataset +from vaa.data import ElmoDataset from torch.utils.data import DataLoader -from a3v.model_elmo import ESIM -from a3v.utils import correct_predictions +from vaa.model_elmo import ESIM +from vaa.utils import correct_predictions from sklearn import metrics from allennlp.modules.elmo import batch_to_ids diff --git a/scripts/training/test_snli_elmo.py b/scripts/training/test_snli_elmo.py index 8fd0396..acdaf40 100644 --- a/scripts/training/test_snli_elmo.py +++ b/scripts/training/test_snli_elmo.py @@ -10,10 +10,10 @@ import argparse import torch import numpy as np -from a3v.data import ElmoDataset +from vaa.data import ElmoDataset from torch.utils.data import DataLoader -from a3v.model_elmo2 import ESIM -from a3v.utils import correct_predictions +from vaa.model_elmo2 import ESIM +from vaa.utils import correct_predictions from sklearn import metrics from allennlp.modules.elmo import batch_to_ids diff --git a/scripts/training/train_mnli.py b/scripts/training/train_mnli.py index 3726039..a0bdf2b 100644 --- a/scripts/training/train_mnli.py +++ b/scripts/training/train_mnli.py @@ -13,8 +13,8 @@ import torch.nn as nn from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import 
NLIDataset +from vaa.model import ESIM from utils.utils_esim import train, validate diff --git a/scripts/training/train_snli.py b/scripts/training/train_snli.py index 9b946fb..fda04ab 100644 --- a/scripts/training/train_snli.py +++ b/scripts/training/train_snli.py @@ -13,8 +13,8 @@ import torch.nn as nn from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import NLIDataset +from vaa.model import ESIM from utils.utils_esim import train, validate diff --git a/test_bert_mnli.py b/test_bert_mnli.py index 3ea6e15..44722ea 100644 --- a/test_bert_mnli.py +++ b/test_bert_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/test_bert_snli.py b/test_bert_snli.py index 5650a08..60a5d2c 100644 --- a/test_bert_snli.py +++ b/test_bert_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/top_bert_mnli.py b/top_bert_mnli.py index 37e05fe..b234f06 100644 --- a/top_bert_mnli.py +++ b/top_bert_mnli.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_mnli_test.py b/top_bert_mnli_test.py index d610d22..b39f340 100644 --- a/top_bert_mnli_test.py +++ b/top_bert_mnli_test.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import test -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_quora.py b/top_bert_quora.py index a3081db..03951b8 100644 --- a/top_bert_quora.py +++ b/top_bert_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_quora_loss.py b/top_bert_quora_loss.py index dd96387..c8018eb 100644 --- a/top_bert_quora_loss.py +++ b/top_bert_quora_loss.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. 
from utils.utils_top_transformer import train_loss -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_snli.py b/top_bert_snli.py index f5a260c..23d2e72 100644 --- a/top_bert_snli.py +++ b/top_bert_snli.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_esim_mnli.py b/top_esim_mnli.py index 6c4d731..96fbc89 100644 --- a/top_esim_mnli.py +++ b/top_esim_mnli.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_mnli_test.py b/top_esim_mnli_test.py index 34123e7..5c75d1c 100644 --- a/top_esim_mnli_test.py +++ b/top_esim_mnli_test.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate, test -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_quora.py b/top_esim_quora.py index 6ba2a68..037fcb2 100644 --- a/top_esim_quora.py +++ b/top_esim_quora.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_snli.py b/top_esim_snli.py index 27e4a7f..7ecb53e 100644 --- a/top_esim_snli.py +++ b/top_esim_snli.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. 
from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/utils/droped/utils_transformer_new.py b/utils/droped/utils_transformer_new.py index 556a9df..98e68e8 100644 --- a/utils/droped/utils_transformer_new.py +++ b/utils/droped/utils_transformer_new.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient from transformers import * diff --git a/utils/runned/utils_test_esim_quora.py b/utils/runned/utils_test_esim_quora.py index a582719..3664c06 100644 --- a/utils/runned/utils_test_esim_quora.py +++ b/utils/runned/utils_test_esim_quora.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_esim_snli.py b/utils/runned/utils_test_esim_snli.py index 7d837ea..a4c9c88 100644 --- a/utils/runned/utils_test_esim_snli.py +++ b/utils/runned/utils_test_esim_snli.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_three.py b/utils/runned/utils_test_three.py index 48e73c3..301edab 100644 --- a/utils/runned/utils_test_three.py +++ b/utils/runned/utils_test_three.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_two.py b/utils/runned/utils_test_two.py index 44addb4..1c8727b 100644 --- a/utils/runned/utils_test_two.py +++ b/utils/runned/utils_test_two.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/utils_esim.py b/utils/utils_esim.py index 1e97a05..ed3e0f9 100644 --- a/utils/utils_esim.py +++ b/utils/utils_esim.py @@ -9,7 +9,7 @@ import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions def train(model, diff --git a/utils/utils_top_esim.py b/utils/utils_top_esim.py index 71183c3..39030f0 100644 --- a/utils/utils_top_esim.py +++ b/utils/utils_top_esim.py @@ -5,7 +5,7 @@ import time import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from utils.utils_base import * def train(model, diff --git a/utils/utils_top_transformer.py b/utils/utils_top_transformer.py index 244db3b..78c38f4 100644 --- a/utils/utils_top_transformer.py +++ b/utils/utils_top_transformer.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from 
vaa.utils import correct_predictions from bert_serving.client import BertClient from utils.utils_base import * diff --git a/utils/utils_transformer.py b/utils/utils_transformer.py index 3f1959a..44da590 100644 --- a/utils/utils_transformer.py +++ b/utils/utils_transformer.py @@ -6,7 +6,7 @@ import torch.nn as nn import pandas as pd from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient diff --git a/vaa/__init__.py b/vaa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vaa/data.py b/vaa/data.py new file mode 100644 index 0000000..85b1d56 --- /dev/null +++ b/vaa/data.py @@ -0,0 +1,654 @@ +""" +Preprocessor and dataset definition for NLI. +""" +# Aurelien Coet, 2018. + +import string +import torch +import numpy as np + +from collections import Counter +from torch.utils.data import Dataset +from allennlp.modules.elmo import Elmo, batch_to_ids +from sklearn.preprocessing import LabelEncoder + + +class Preprocessor(object): + """ + Preprocessor class for Natural Language Inference datasets. + + The class can be used to read NLI datasets, build worddicts for them + and transform their premises, hypotheses and labels into lists of + integer indices. + """ + + def __init__(self, + lowercase=False, + ignore_punctuation=False, + num_words=None, + stopwords=[], + labeldict={}, + bos=None, + eos=None): + """ + Args: + lowercase: A boolean indicating whether the words in the datasets + being preprocessed must be lowercased or not. Defaults to + False. + ignore_punctuation: A boolean indicating whether punctuation must + be ignored or not in the datasets preprocessed by the object. + num_words: An integer indicating the number of words to use in the + worddict of the object. If set to None, all the words in the + data are kept. Defaults to None. + stopwords: A list of words that must be ignored when building the + worddict for a dataset. Defaults to an empty list. + bos: A string indicating the symbol to use for the 'beginning of + sentence' token in the data. If set to None, the token isn't + used. Defaults to None. + eos: A string indicating the symbol to use for the 'end of + sentence' token in the data. If set to None, the token isn't + used. Defaults to None. + """ + self.lowercase = lowercase + self.ignore_punctuation = ignore_punctuation + self.num_words = num_words + self.stopwords = stopwords + self.labeldict = labeldict + self.bos = bos + self.eos = eos + + def read_data(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as SNLI's .txt files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the SNLI (and MultiNLI) dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + parentheses_table = str.maketrans({"(": None, ")": None}) + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + # Ignore the headers on the first line of the file. 
+ next(input_data) + + for line in input_data: + line = line.strip().split("\t") + + # Ignore sentences that have no gold label. + if line[0] == "-": + continue + + pair_id = line[7] + premise = line[1] + hypothesis = line[2] + + # Remove '(' and ')' from the premises and hypotheses. + premise = premise.translate(parentheses_table) + hypothesis = hypothesis.translate(parentheses_table) + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + # labels = list(LabelEncoder().fit_transform(labels)) + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_bert(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as SNLI's .txt files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the SNLI (and MultiNLI) dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + parentheses_table = str.maketrans({"(": None, ")": None}) + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + # Ignore the headers on the first line of the file. + next(input_data) + + for line in input_data: + line = line.strip().split("\t") + + # Ignore sentences that have no gold label. + if line[0] == "-": + continue + + pair_id = line[7] + premise = line[1] + hypothesis = line[2] + + # Remove '(' and ')' from the premises and hypotheses. + premise = premise.translate(parentheses_table) + hypothesis = hypothesis.translate(parentheses_table) + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append(premise.rstrip()) + hypotheses.append(hypothesis.rstrip()) + labels.append(line[0]) + ids.append(pair_id) + label_encoder = LabelEncoder() + labels = list(label_encoder.fit_transform(labels)) + # print(label_encoder.classes_) + # print(labels[0]) + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. 
+ """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora_bert(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append(premise.rstrip()) + hypotheses.append(hypothesis.rstrip()) + labels.append(line[0]) + ids.append(pair_id) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora_balance(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. 
+ punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + + array_labels = np.array([int(x) for x in labels]) + index = np.arange(len(array_labels))[array_labels == 1] + np.random.shuffle(index) + for i in range((array_labels == 0).sum() - (array_labels == 1).sum()): + idx = index[i] + ids.append(ids[idx]) + premises.append(hypotheses[idx]) + hypotheses.append(premises[idx]) + labels.append(labels[idx]) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def build_worddict(self, data): + """ + Build a dictionary associating words to unique integer indices for + some dataset. The worddict can then be used to transform the words + in datasets to their indices. + + Args: + data: A dictionary containing the premises, hypotheses and + labels of some NLI dataset, in the format returned by the + 'read_data' method of the Preprocessor class. + """ + words = [] + [words.extend(sentence) for sentence in data["premises"]] + [words.extend(sentence) for sentence in data["hypotheses"]] + + counts = Counter(words) + num_words = self.num_words + if self.num_words is None: + num_words = len(counts) + + self.worddict = {} + + # Special indices are used for padding, out-of-vocabulary words, and + # beginning and end of sentence tokens. + self.worddict["_PAD_"] = 0 + self.worddict["_OOV_"] = 1 + + offset = 2 + if self.bos: + self.worddict["_BOS_"] = 2 + offset += 1 + if self.eos: + self.worddict["_EOS_"] = 3 + offset += 1 + + for i, word in enumerate(counts.most_common(num_words)): + self.worddict[word[0]] = i + offset + + if self.labeldict == {}: + label_names = set(data["labels"]) + if len(label_names)==3: + label_names = ['entailment', 'neutral', 'contradiction'] + self.labeldict = {label_name: i + for i, label_name in enumerate(label_names)} + print('label_dict',self.labeldict) + + def words_to_indices(self, sentence): + """ + Transform the words in a sentence to their corresponding integer + indices. + + Args: + sentence: A list of words that must be transformed to indices. + + Returns: + A list of indices. + """ + indices = [] + # Include the beggining of sentence token at the start of the sentence + # if one is defined. + if self.bos: + indices.append(self.worddict["_BOS_"]) + + for word in sentence: + if word in self.worddict: + index = self.worddict[word] + else: + # Words absent from 'worddict' are treated as a special + # out-of-vocabulary word (OOV). + index = self.worddict["_OOV_"] + indices.append(index) + # Add the end of sentence token at the end of the sentence if one + # is defined. + if self.eos: + indices.append(self.worddict["_EOS_"]) + + return indices + + def indices_to_words(self, indices): + """ + Transform the indices in a list to their corresponding words in + the object's worddict. + + Args: + indices: A list of integer indices corresponding to words in + the Preprocessor's worddict. 
+ + Returns: + A list of words. + """ + return [list(self.worddict.keys())[list(self.worddict.values()) + .index(i)] + for i in indices] + + def transform_to_indices(self, data): + """ + Transform the words in the premises and hypotheses of a dataset, as + well as their associated labels, to integer indices. + + Args: + data: A dictionary containing lists of premises, hypotheses + and labels, in the format returned by the 'read_data' + method of the Preprocessor class. + + Returns: + A dictionary containing the transformed premises, hypotheses and + labels. + """ + transformed_data = {"ids": [], + "premises": [], + "hypotheses": [], + "labels": []} + + for i, premise in enumerate(data["premises"]): + # Ignore sentences that have a label for which no index was + # defined in 'labeldict'. + label = data["labels"][i] + if label not in self.labeldict and label != "hidden": + continue + + transformed_data["ids"].append(data["ids"][i]) + + if label == "hidden": + transformed_data["labels"].append(-1) + else: + transformed_data["labels"].append(self.labeldict[label]) + + indices = self.words_to_indices(premise) + transformed_data["premises"].append(indices) + + indices = self.words_to_indices(data["hypotheses"][i]) + transformed_data["hypotheses"].append(indices) + return transformed_data + + def build_embedding_matrix(self, embeddings_file): + """ + Build an embedding matrix with pretrained weights for object's + worddict. + + Args: + embeddings_file: A file containing pretrained word embeddings. + + Returns: + A numpy matrix of size (num_words+n_special_tokens, embedding_dim) + containing pretrained word embeddings (the +n_special_tokens is for + the padding and out-of-vocabulary tokens, as well as BOS and EOS if + they're used). + """ + # Load the word embeddings in a dictionnary. + embeddings = {} + with open(embeddings_file, "r", encoding="utf8") as input_data: + for line in input_data: + line = line.split() + + try: + # Check that the second element on the line is the start + # of the embedding and not another word. Necessary to + # ignore multiple word lines. + float(line[1]) + word = line[0] + if word in self.worddict: + embeddings[word] = line[1:] + + # Ignore lines corresponding to multiple words separated + # by spaces. + except ValueError: + continue + + num_words = len(self.worddict) + embedding_dim = len(list(embeddings.values())[0]) + embedding_matrix = np.zeros((num_words, embedding_dim)) + + # Actual building of the embedding matrix. + missed = 0 + for word, i in self.worddict.items(): + if word in embeddings: + embedding_matrix[i] = np.array(embeddings[word], dtype=float) + else: + if word == "_PAD_": + continue + missed += 1 + # Out of vocabulary words are initialised with random gaussian + # samples. + embedding_matrix[i] = np.random.normal(size=(embedding_dim)) + print("Missed words: ", missed) + + return embedding_matrix + + def build_embedding_matrix_elmo(self, options_file, weight_file, embedding_dim=1024): + """ + Build an embedding matrix with pretrained weights for object's + worddict. + + Args: + embeddings_file: A file containing pretrained word embeddings. + + Returns: + A numpy matrix of size (num_words+n_special_tokens, embedding_dim) + containing pretrained word embeddings (the +n_special_tokens is for + the padding and out-of-vocabulary tokens, as well as BOS and EOS if + they're used). 
+ """ + options_file = options_file + weight_file = weight_file + elmo = Elmo(options_file, weight_file, 1, dropout=0) + + num_words = len(self.worddict) + embedding_matrix = np.zeros((num_words, embedding_dim)) + + print(len(self.worddict)) + # Actual building of the embedding matrix. + for word, i in self.worddict.items(): + embedding_word = elmo(batch_to_ids([[word]]))['elmo_representations'][0].squeeze() + embedding_matrix[i] = np.array(embedding_word.detach().cpu(), dtype=float) + if (i+1) % 100 == 0: + print(i/len(self.worddict)) + return embedding_matrix + + +class ElmoDataset(Dataset): + def __init__(self, + data): + self.data = data + self.num_sequences = len(data["premises"]) + + def __len__(self): + return self.num_sequences + + def __getitem__(self, index): + return { + "premises": self.data["premises"][index], + "hypotheses": self.data["hypotheses"][index], + "labels": self.data["labels"][index]} + + +class NLIDataset(Dataset): + """ + Dataset class for Natural Language Inference datasets. + + The class can be used to read preprocessed datasets where the premises, + hypotheses and labels have been transformed to unique integer indices + (this can be done with the 'preprocess_data' script in the 'scripts' + folder of this repository). + """ + + def __init__(self, + data, + padding_idx=0, + max_premise_length=None, + max_hypothesis_length=None): + """ + Args: + data: A dictionary containing the preprocessed premises, + hypotheses and labels of some dataset. + padding_idx: An integer indicating the index being used for the + padding token in the preprocessed data. Defaults to 0. + max_premise_length: An integer indicating the maximum length + accepted for the sequences in the premises. If set to None, + the length of the longest premise in 'data' is used. + Defaults to None. + max_hypothesis_length: An integer indicating the maximum length + accepted for the sequences in the hypotheses. If set to None, + the length of the longest hypothesis in 'data' is used. + Defaults to None. 
+ """ + self.premises_lengths = [len(seq) for seq in data["premises"]] + self.max_premise_length = max_premise_length + if self.max_premise_length is None: + self.max_premise_length = max(self.premises_lengths) + + self.hypotheses_lengths = [len(seq) for seq in data["hypotheses"]] + self.max_hypothesis_length = max_hypothesis_length + if self.max_hypothesis_length is None: + self.max_hypothesis_length = max(self.hypotheses_lengths) + + self.num_sequences = len(data["premises"]) + + self.data = {"ids": [], + "premises": torch.ones((self.num_sequences, + self.max_premise_length), + dtype=torch.long) * padding_idx, + "hypotheses": torch.ones((self.num_sequences, + self.max_hypothesis_length), + dtype=torch.long) * padding_idx, + "labels": torch.tensor(data["labels"], dtype=torch.long)} + + for i, premise in enumerate(data["premises"]): + self.data["ids"].append(data["ids"][i]) + end = min(len(premise), self.max_premise_length) + self.data["premises"][i][:end] = torch.tensor(premise[:end]) + + hypothesis = data["hypotheses"][i] + end = min(len(hypothesis), self.max_hypothesis_length) + self.data["hypotheses"][i][:end] = torch.tensor(hypothesis[:end]) + + def __len__(self): + return self.num_sequences + + def __getitem__(self, index): + return {"id": self.data["ids"][index], + "premise": self.data["premises"][index], + "premise_length": min(self.premises_lengths[index], + self.max_premise_length), + "hypothesis": self.data["hypotheses"][index], + "hypothesis_length": min(self.hypotheses_lengths[index], + self.max_hypothesis_length), + "label": self.data["labels"][index]} diff --git a/vaa/droped/__init__.py b/vaa/droped/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vaa/droped/droped.py b/vaa/droped/droped.py new file mode 100644 index 0000000..511a375 --- /dev/null +++ b/vaa/droped/droped.py @@ -0,0 +1,179 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention +from vaa.utils import replace_masked +import math +from torch.nn.modules.transformer import * + +# Temporarily leave PositionalEncoding module here. Will be moved somewhere else. +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + +class TransformerESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TransformerESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + # if self.dropout: + # self._rnn_dropout = RNNDropout(p=self.dropout) + # + # + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + # self._composition = Seq2SeqEncoder(nn.LSTM, + # self.hidden_size, + # self.hidden_size, + # bidirectional=True) + + self.pos_encoder = PositionalEncoding(self.hidden_size, self.dropout) + encoder_layers = TransformerEncoderLayer(d_model=384, nhead=8) + self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=6) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. 
+ hypotheses: A batch of variable length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + premises_mask_key = (torch.sum(premises, dim=-1) == 0) # positions that should be masked are True + hypotheses_mask_key = (torch.sum(hypotheses, dim=-1) == 0) + premises_mask = 1 - premises_mask_key.float() # positions to keep (not masked) are 1 + hypotheses_mask = 1 - hypotheses_mask_key.float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + + projected_premises, projected_hypotheses = self._attention(premises, premises_mask, + hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + # projected_premises = self.pos_encoder(projected_premises.transpose(1, 0).contiguous()) + # projected_hypotheses = self.pos_encoder(projected_hypotheses.transpose(1, 0).contiguous()) + # mask1 = self._generate_square_subsequent_mask(len(projected_premises)).to(projected_premises.device) + # mask2 = self._generate_square_subsequent_mask(len(projected_hypotheses)).to(projected_hypotheses.device) + # v_ai = self.transformer_encoder(projected_premises).transpose(1, 0).contiguous() + # v_bj = self.transformer_encoder(projected_hypotheses).transpose(1, 0).contiguous() + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1).transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1).transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits + diff --git a/vaa/droped/layers.py b/vaa/droped/layers.py new file mode 100644 index 0000000..fecef0b --- /dev/null +++ b/vaa/droped/layers.py @@ -0,0 +1,421 @@ +""" +Definition of custom layers for the ESIM model. +""" +# Aurelien Coet, 2018. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.activation import MultiheadAttention +from vaa.utils import sort_by_seq_lens, masked_softmax, weighted_sum, normal_softmax + +# Class widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/modules/input_variational_dropout.py +class RNNDropout(nn.Dropout): + """ + Dropout layer for the inputs of RNNs. + + Apply the same dropout mask to all the elements of the same sequence in + a batch of sequences of size (batch, sequences_length, embedding_dim). + """ + + def forward(self, sequences_batch): + """ + Apply dropout to the input batch of sequences. + + Args: + sequences_batch: A batch of sequences of vectors that will serve + as input to an RNN.
+ Tensor of size (batch, sequences_length, emebdding_dim). + + Returns: + A new tensor on which dropout has been applied. + """ + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], + sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, + inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch + + +# class TransformerEncoder(nn.Module): +# """ +# RNN taking variable length padded sequences of vectors as input and +# encoding them into padded sequences of vectors of the same length. +# +# This module is useful to handle batches of padded sequences of vectors +# that have different lengths and that need to be passed through a RNN. +# The sequences are sorted in descending order of their lengths, packed, +# passed through the RNN, and the resulting sequences are then padded and +# permuted back to the original order of the input sequences. +# """ +# +# def __init__(self, +# input_size, +# nhead=4, +# num_layers=1): +# +# super(TransformerEncoder, self).__init__() +# +# self.input_size = input_size +# self.nhead = nhead +# self.num_layers = num_layers +# self._encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer( +# self.input_size, nhead=nhead), num_layers=num_layers) +# +# def forward(self, sequences_batch, sequences_lengths): +# sequences_batch = sequences_batch.transpose(1, 0).contiguous() +# outputs = self._encoder(sequences_batch) +# outputs = outputs.transpose(1, 0).contiguous() +# +# sorted_batch, sorted_lengths, _, restoration_idx =\ +# sort_by_seq_lens(outputs, sequences_lengths) +# packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, sorted_lengths, batch_first=True) +# outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, batch_first=True) +# return outputs + +class LinerEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + input_size, + hidden_size, + dropout=0.0): + super(LinerEncoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self._encoder = nn.Linear(input_size, hidden_size*2) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. 
+ """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + + outputs = self._encoder(outputs) + + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class LengthEncoder(nn.Module): + + def forward(self, sequences_batch, sequences_lengths): + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. + dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoder, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. 
+ """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, _ = self._encoder(packed_batch, None) + + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + + +class SoftmaxAttention(nn.Module): + + def __init__(self, hidden_size, dropout=0.5): + super(SoftmaxAttention, self).__init__() + # self.multi_head_attn = MultiheadAttention(hidden_size*2, 8) + self.liner1 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner2 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner3 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner4 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner5 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner6 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner7 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner8 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner = nn.Sequential(nn.Linear(hidden_size * 4, hidden_size * 4), nn.ReLU(), RNNDropout(p=dropout)) + + # self._enhance = nn.Sequential(nn.Linear(2*7*2*hidden_size, 7*2*hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + self._projection = nn.Sequential(nn.Linear(7*2*hidden_size, hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + + # self.Wb_inter = torch.nn.Parameter(torch.randn(hidden_size*2, hidden_size*2), requires_grad=True) + # self.Wb_intra = torch.nn.Parameter(torch.randn(hidden_size * 2, hidden_size * 2), requires_grad=True) + + """ + Attention layer taking premises and hypotheses encoded by an RNN as input + and computing the soft attention between their elements. + The dot product of the encoded vectors in the premises and hypotheses is + first computed. The softmax of the result is then used in a weighted sum + of the vectors of the premises for each element of the hypotheses, and + conversely for the elements of the premises. + """ + + def forward(self, + premise_batch, + premise_mask, + hypothesis_batch, + hypothesis_mask): + """ + Args: + premise_batch: A batch of sequences of vectors representing the + premises in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + premise_mask: A mask for the sequences in the premise batch, to + ignore padding data in the sequences during the computation of + the attention. + hypothesis_batch: A batch of sequences of vectors representing the + hypotheses in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + hypothesis_mask: A mask for the sequences in the hypotheses batch, + to ignore padding data in the sequences during the computation + of the attention. + + Returns: + attended_premises: The sequences of attention vectors for the + premises in the input batch. + attended_hypotheses: The sequences of attention vectors for the + hypotheses in the input batch. 
+ """ + # dot attn + enhanced_premises0, enhanced_hypotheses0 = self.dot_attn(premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask) + # # bilinear attn + # enhanced_premises1, enhanced_hypotheses1 = self.bilinear_attn(premise_batch, premise_mask, + # hypothesis_batch, hypothesis_mask) + # + # enhanced_premises = self._enhance(torch.cat((enhanced_premises0, enhanced_premises1), dim=-1)) + # enhanced_hypotheses = self._enhance(torch.cat((enhanced_hypotheses0, enhanced_hypotheses1), dim=-1)) + + projected_premises = self._projection(enhanced_premises0) + projected_hypotheses = self._projection(enhanced_hypotheses0) + + return projected_premises, projected_hypotheses + + def bilinear_attn(self, premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask): + # inter-attention Softmax attention weights. + Wb_inter = self.Wb_inter.repeat(premise_batch.size()[0], 1, 1) + Wb_intra = self.Wb_intra.repeat(premise_batch.size()[0], 1, 1) + + similarity_matrix = premise_batch.bmm(Wb_inter).bmm(hypothesis_batch.transpose(2, 1).contiguous()) + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + self_premises_matrix = premise_batch.bmm(Wb_intra).bmm(premise_batch.transpose(2, 1).contiguous()) + self_hypotheses_matrix = hypothesis_batch.bmm(Wb_intra).bmm(hypothesis_batch.transpose(2, 1).contiguous()) + self_premises_attn = normal_softmax(self_premises_matrix) + self_hypotheses_attn = normal_softmax(self_hypotheses_matrix) + self_premises = self_premises_attn.bmm(premise_batch) + self_hypotheses = self_hypotheses_attn.bmm(hypothesis_batch) + + # attn_importance + premise_importance = torch.sum(self_premises_attn, dim=-2).unsqueeze(-1) + hypotheses_importance = torch.sum(self_hypotheses_attn, dim=-2).unsqueeze(-1) + inter_hypotheses_importance = torch.sum(prem_hyp_attn, dim=-2).unsqueeze(-1) + inter_premise_importance = torch.sum(hyp_prem_attn, dim=-2).unsqueeze(-1) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch, attended_premises, + attended_hypotheses, self_premises, self_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + def dot_attn(self, premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask): + sqrt_dim = np.sqrt(premise_batch.size()[2]) + # inter-attention Softmax attention weights. 
+ similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + self_premises_matrix = premise_batch.bmm(premise_batch.transpose(2, 1).contiguous()) / sqrt_dim + self_hypotheses_matrix = hypothesis_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + + self_premises_attn = normal_softmax(self_premises_matrix) + self_hypotheses_attn = normal_softmax(self_hypotheses_matrix) + self_premises = self_premises_attn.bmm(premise_batch) + self_hypotheses = self_hypotheses_attn.bmm(hypothesis_batch) + + # attn_importance max + premise_importance = torch.sum(self_premises_attn, dim=-2).unsqueeze(-1) + hypotheses_importance = torch.sum(self_hypotheses_attn, dim=-2).unsqueeze(-1) + inter_hypotheses_importance = torch.sum(prem_hyp_attn, dim=-2).unsqueeze(-1) + inter_premise_importance = torch.sum(hyp_prem_attn, dim=-2).unsqueeze(-1) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch, attended_premises, + attended_hypotheses, self_premises, self_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + + def multi_importance(self, premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch,attended_premises, + attended_hypotheses, self_premises, self_hypotheses): + # attn1 + prem_all_attn1 = premise_importance * inter_premise_importance + hyp_all_attn1 = hypotheses_importance * inter_hypotheses_importance + attended_premises1 = self.liner1(premise_batch * prem_all_attn1) + attended_hypotheses1 = self.liner1(hypothesis_batch * hyp_all_attn1) + + # attn2 + prem_all_attn2 = premise_importance + inter_premise_importance + hyp_all_attn2 = hypotheses_importance + inter_hypotheses_importance + attended_premises2 = self.liner2(premise_batch * prem_all_attn2) + attended_hypotheses2 = self.liner2(hypothesis_batch * hyp_all_attn2) + # attn3 + prem_all_attn3 = torch.max(premise_importance, inter_premise_importance) + hyp_all_attn3 = torch.max(hypotheses_importance, inter_hypotheses_importance) + attended_premises3 = self.liner3(premise_batch * prem_all_attn3) + attended_hypotheses3 = self.liner3(hypothesis_batch * hyp_all_attn3) + # attn4 + attended_premises4_1 = premise_batch * premise_importance + attended_premises4_2 = premise_batch * inter_premise_importance + attended_premises4 = self.liner4(torch.max(attended_premises4_1, attended_premises4_2)) + attended_hypotheses4_1 = hypothesis_batch * hypotheses_importance + attended_hypotheses4_2 = hypothesis_batch * inter_hypotheses_importance + attended_hypotheses4 = self.liner4(torch.max(attended_hypotheses4_1, attended_hypotheses4_2)) + # attn5 + attended_premises5 = self.liner5(premise_batch * (prem_all_attn1 + 1)) + attended_hypotheses5 = self.liner5(hypothesis_batch * (hyp_all_attn1 + 1)) + # attn6 + attended_premises6 = self.liner6(premise_batch * (prem_all_attn2 + 1)) + attended_hypotheses6 = self.liner6(hypothesis_batch * (hyp_all_attn2 + 1)) + # attn7 + attended_premises7 = self.liner7(premise_batch * (prem_all_attn3 + 1)) + attended_hypotheses7 = 
self.liner7(hypothesis_batch * (hyp_all_attn3 + 1)) + # attn8 + attended_premises8 = self.liner8(torch.max(attended_premises4_1, attended_premises4_2) + premise_batch) + attended_hypotheses8 = self.liner8(torch.max(attended_hypotheses4_1, attended_hypotheses4_2) + hypothesis_batch) + + premise_all = self.liner(torch.cat([attended_premises1, attended_premises2, attended_premises3, + attended_premises4, attended_premises5, attended_premises6, + attended_premises7, attended_premises8], dim=-1)) + hypotheses_all = self.liner(torch.cat([attended_hypotheses1, attended_hypotheses2, attended_hypotheses3, + attended_hypotheses4, attended_hypotheses5, attended_hypotheses6, + attended_hypotheses7, attended_hypotheses8], dim=-1)) + + enhanced_premises = torch.cat([premise_batch, attended_premises, self_premises, + premise_batch - attended_premises, premise_batch * attended_premises, + premise_all + ], + dim=-1) + enhanced_hypotheses = torch.cat([hypothesis_batch, attended_hypotheses, self_hypotheses, + hypothesis_batch - attended_hypotheses, hypothesis_batch * attended_hypotheses, + hypotheses_all + ], + dim=-1) + return enhanced_premises, enhanced_hypotheses \ No newline at end of file diff --git a/vaa/droped/model_new.py b/vaa/droped/model_new.py new file mode 100644 index 0000000..8f6a1f9 --- /dev/null +++ b/vaa/droped/model_new.py @@ -0,0 +1,146 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from vaa.utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(ESIM, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + + self.transformer_model = nn.Transformer(d_model=self.embedding_dim, nhead=4, + num_encoder_layers=3, num_decoder_layers=3) + + self._composition = nn.LSTM(self.embedding_dim, self.hidden_size, bidirectional=True, batch_first=True) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size*2, self.num_classes)) + + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + embedd=False, + premises_mask=None, + hypotheses_mask=None): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + if premises_mask is None: + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + if embedd: + embedded_premises = premises + embedded_hypotheses = hypotheses + else: + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + # encoded_premises = self._encoding(embedded_premises, premises_lengths) + # encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + v = self.transformer_model(embedded_premises.transpose(0, 1), embedded_hypotheses.transpose(0,1)).transpose(0,1) + _, (hn, cn) = self._composition(v) + hn = hn.transpose(0, 1).contiguous() + logits = self._classification(hn.view(hn.size()[0], -1)) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, hn, \ + (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. 
+ """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/droped/model_top.py b/vaa/droped/model_top.py new file mode 100644 index 0000000..36032f3 --- /dev/null +++ b/vaa/droped/model_top.py @@ -0,0 +1,174 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from vaa.utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + # self._word_embedding = nn.Embedding(self.vocab_size, + # self.embedding_dim, + # padding_idx=padding_idx, + # _weight=embeddings) + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + for p in self.parameters(): + p.requires_grad = False + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.hidden_size) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4+1)*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + else: + embedded_premises = premises + embedded_hypotheses = hypotheses + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + vulnerability = self.linear_vulnerability(vulnerability) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. 
+ """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/droped/model_transformer_new.py b/vaa/droped/model_transformer_new.py new file mode 100644 index 0000000..9bb008a --- /dev/null +++ b/vaa/droped/model_transformer_new.py @@ -0,0 +1,104 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, LengthEncoder +from vaa.utils import replace_masked +from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer, LayerNorm +import math + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(ESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.transformer_model = nn.Transformer(d_model=self.embedding_dim, nhead=8, + num_encoder_layers=6, num_decoder_layers=6) + + # self._composition = nn.LSTM(self.embedding_dim, self.hidden_size, bidirectional=True, batch_first=True) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size*2, self.num_classes)) + + + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + v = self.transformer_model(premises.transpose(0, 1), hypotheses.transpose(0,1)).transpose(0,1)[:,0] + + logits = self._classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, v + diff --git a/vaa/droped/resnet.py b/vaa/droped/resnet.py new file mode 100644 index 0000000..219d8e8 --- /dev/null +++ b/vaa/droped/resnet.py @@ -0,0 +1,94 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class PreActBlock(nn.Module): + '''Pre-activation version of the BasicBlock.''' + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out += shortcut + return out + + +class PreActBottleneck(nn.Module): + '''Pre-activation version of the original Bottleneck module.''' + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = 
F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out = self.conv3(F.relu(self.bn3(out))) + out += shortcut + return out + + +class PreActResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(PreActResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.conv1(x) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + logits = out.view(out.size(0), -1) + out = self.linear(logits) + # out = F.log_softmax(out, dim=1) + return out, logits + + +def PreActResNet18(): + return PreActResNet(PreActBlock, [2,2,2,2]) + + diff --git a/vaa/droped/resnet_top.py b/vaa/droped/resnet_top.py new file mode 100644 index 0000000..77a5832 --- /dev/null +++ b/vaa/droped/resnet_top.py @@ -0,0 +1,110 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PreActBlock(nn.Module): + '''Pre-activation version of the BasicBlock.''' + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out += shortcut + return out + + +class PreActBottleneck(nn.Module): + '''Pre-activation version of the original Bottleneck module.''' + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out = self.conv3(F.relu(self.bn3(out))) + out += 
shortcut + return out + + +class PreActResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(PreActResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + + for p in self.parameters(): + p.requires_grad = False + + self.linear = nn.Sequential( + nn.Dropout(p=0.5), + nn.Linear(512*block.expansion*3, 16*block.expansion), + #nn.ReLU() + ) + + self.classification = nn.Linear((512+16)*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x, vulnerability): + out = self.conv1(x) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + logits = out.view(out.size(0), -1) + + vulnerability = self.linear(vulnerability) + logits = torch.cat([logits, vulnerability], dim=1) + + # out = self.linear(logits) + out = self.classification(logits) + # out = F.log_softmax(out, dim=1) + return out + + +def PreActResNet18Top(): + return PreActResNet(PreActBlock, [2,2,2,2]) + + diff --git a/vaa/layers.py b/vaa/layers.py new file mode 100644 index 0000000..7b3ea20 --- /dev/null +++ b/vaa/layers.py @@ -0,0 +1,420 @@ +""" +Definition of custom layers for the ESIM model. +""" +# Aurelien Coet, 2018. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from .utils import sort_by_seq_lens, masked_softmax, weighted_sum, get_mask + +# Class widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/modules/input_variational_dropout.py +class RNNDropout(nn.Dropout): + """ + Dropout layer for the inputs of RNNs. + + Apply the same dropout mask to all the elements of the same sequence in + a batch of sequences of size (batch, sequences_length, embedding_dim). + """ + + def forward(self, sequences_batch): + """ + Apply dropout to the input batch of sequences. + + Args: + sequences_batch: A batch of sequences of vectors that will serve + as input to an RNN. + Tensor of size (batch, sequences_length, emebdding_dim). + + Returns: + A new tensor on which dropout has been applied. + """ + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], + sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, + inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch + + +# class TransformerEncoder(nn.Module): +# """ +# RNN taking variable length padded sequences of vectors as input and +# encoding them into padded sequences of vectors of the same length. +# +# This module is useful to handle batches of padded sequences of vectors +# that have different lengths and that need to be passed through a RNN. +# The sequences are sorted in descending order of their lengths, packed, +# passed through the RNN, and the resulting sequences are then padded and +# permuted back to the original order of the input sequences. 
+# """ +# +# def __init__(self, +# input_size, +# nhead=4, +# num_layers=1): +# +# super(TransformerEncoder, self).__init__() +# +# self.input_size = input_size +# self.nhead = nhead +# self.num_layers = num_layers +# self._encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer( +# self.input_size, nhead=nhead), num_layers=num_layers) +# +# def forward(self, sequences_batch, sequences_lengths): +# sequences_batch = sequences_batch.transpose(1, 0).contiguous() +# outputs = self._encoder(sequences_batch) +# outputs = outputs.transpose(1, 0).contiguous() +# +# sorted_batch, sorted_lengths, _, restoration_idx =\ +# sort_by_seq_lens(outputs, sequences_lengths) +# packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, sorted_lengths, batch_first=True) +# outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, batch_first=True) +# return outputs + +class LinerEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + input_size, + hidden_size, + dropout=0.0): + super(LinerEncoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self._encoder = nn.Linear(input_size, hidden_size*2) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + + outputs = self._encoder(outputs) + + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class LengthEncoder(nn.Module): + + def forward(self, sequences_batch, sequences_lengths): + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. 
+ """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. + dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoder, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, _ = self._encoder(packed_batch, None) + + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoderLast(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. 
+ dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoderLast, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, (hidden, cell) = self._encoder(packed_batch, None) + hidden = hidden.transpose(0, 1).contiguous() + hidden = hidden.view(hidden.size()[0], -1) + hidden = hidden.index_select(0, restoration_idx) + + return hidden + + +class SoftmaxAttention(nn.Module): + + def __init__(self, hidden_size, dropout=0.5): + super(SoftmaxAttention, self).__init__() + + self._projection = nn.Sequential(nn.Linear(4*2*hidden_size, hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + + + """ + Attention layer taking premises and hypotheses encoded by an RNN as input + and computing the soft attention between their elements. + The dot product of the encoded vectors in the premises and hypotheses is + first computed. The softmax of the result is then used in a weighted sum + of the vectors of the premises for each element of the hypotheses, and + conversely for the elements of the premises. + """ + + def forward(self, + premise_batch, + premise_mask, + hypothesis_batch, + hypothesis_mask): + """ + Args: + premise_batch: A batch of sequences of vectors representing the + premises in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + premise_mask: A mask for the sequences in the premise batch, to + ignore padding data in the sequences during the computation of + the attention. + hypothesis_batch: A batch of sequences of vectors representing the + hypotheses in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + hypothesis_mask: A mask for the sequences in the hypotheses batch, + to ignore padding data in the sequences during the computation + of the attention. + + Returns: + attended_premises: The sequences of attention vectors for the + premises in the input batch. + attended_hypotheses: The sequences of attention vectors for the + hypotheses in the input batch. 
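Unlike Seq2SeqEncoder, the Seq2SeqEncoderLast variant above discards the per-timestep outputs and keeps only the encoder's final hidden state, flattened across layers and directions (it also restores the original batch order after the sort-and-pack step, which is omitted below for brevity). A standalone sketch with a single-layer bidirectional LSTM:

import torch
import torch.nn as nn

batch_size, max_len, dim, hidden = 3, 6, 8, 16
x = torch.randn(batch_size, max_len, dim)
encoder = nn.LSTM(dim, hidden, batch_first=True, bidirectional=True)

_, (h_n, _) = encoder(x)                 # h_n: (num_layers * num_directions, batch, hidden)
h_n = h_n.transpose(0, 1).contiguous()   # (batch, num_layers * num_directions, hidden)
sentence_vec = h_n.view(batch_size, -1)  # (batch, num_layers * num_directions * hidden)
print(sentence_vec.shape)                # torch.Size([3, 32])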
+ """ + # dot attn + enhanced_premises0, enhanced_hypotheses0 = self.dot_attn(premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask) + projected_premises = self._projection(enhanced_premises0) + projected_hypotheses = self._projection(enhanced_hypotheses0) + + return projected_premises, projected_hypotheses + + def dot_attn(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask): + sqrt_dim = np.sqrt(premise_batch.size()[2]) + # inter-attention Softmax attention weights. + similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_batch, hypothesis_batch, + attended_premises, attended_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + + def multi_importance(self, premise_batch, hypothesis_batch,attended_premises, + attended_hypotheses): + enhanced_premises = torch.cat([premise_batch, attended_premises, + premise_batch * attended_premises, premise_batch - attended_premises + ], + dim=-1) + enhanced_hypotheses = torch.cat([hypothesis_batch, attended_hypotheses, + hypothesis_batch * attended_hypotheses, hypothesis_batch - attended_hypotheses + ], + dim=-1) + return enhanced_premises, enhanced_hypotheses + + +class WordEmbedding(nn.Module): + def __init__(self, + vocab_size, + embedding_dim, + embeddings=None, + padding_idx=0): + + super(WordEmbedding, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths): + + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + return (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) \ No newline at end of file diff --git a/vaa/model.py b/vaa/model.py new file mode 100644 index 0000000..7f336be --- /dev/null +++ b/vaa/model.py @@ -0,0 +1,190 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. 
+ padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(ESIM, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + # self._encoding = LinerEncoder(self.embedding_dim, self.hidden_size) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + # self._projection = nn.Sequential(nn.Linear(7*2*self.hidden_size, + # self.hidden_size), + # nn.ReLU()) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + # self._classification = nn.Sequential(nn.Dropout(p=self.dropout), + # nn.Linear(2*4*self.hidden_size, + # self.hidden_size), + # nn.Tanh(), + # nn.Dropout(p=self.dropout), + # nn.Linear(self.hidden_size, + # self.num_classes)) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + embedd=False, + premises_mask=None, + hypotheses_mask=None): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + if premises_mask is None: + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + if embedd: + embedded_premises = premises + embedded_hypotheses = hypotheses + else: + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits, \ + (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_top.py b/vaa/model_top.py new file mode 100644 index 0000000..314f99b --- /dev/null +++ b/vaa/model_top.py @@ -0,0 +1,177 @@ +""" +vulnerability vector pad on the embedding left +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. 
+ embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + # for p in self.parameters(): + # p.requires_grad = False + + self.classification_v = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(((2*4)*self.hidden_size+self.embedding_dim), + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + device = premises.device + vulnerability = self.linear_vulnerability(vulnerability)#.unsqueeze(dim=1) + + # premises = torch.cat((vulnerability, premises), dim=1) + # hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + # premises_lengths = premises_lengths+1 + # hypotheses_lengths= hypotheses_lengths+1 + # premises_mask = torch.cat((torch.ones(premises_mask.size()[0],1).to(device), premises_mask), dim=1) + # hypotheses_mask = torch.cat((torch.ones(hypotheses_mask.size()[0],1).to(device), hypotheses_mask), dim=1) + + embedded_premises = premises + embedded_hypotheses = hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + # print(vulnerability.size(), v_a_avg.size()) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], dim=1) + + logits = self.classification_v(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_top_embed.py b/vaa/model_top_embed.py new file mode 100644 index 0000000..1700471 --- /dev/null +++ b/vaa/model_top_embed.py @@ -0,0 +1,173 @@ +""" +vulnerability vector pad on the embedding left +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. 
+ """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4)*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + device = premises.device + vulnerability = self.linear_vulnerability(vulnerability).unsqueeze(dim=1) + premises = torch.cat((vulnerability, premises), dim=1) + hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + premises_lengths = premises_lengths+1 + hypotheses_lengths= hypotheses_lengths+1 + premises_mask = torch.cat((torch.ones(premises_mask.size()[0],1).to(device), premises_mask), dim=1) + hypotheses_mask = torch.cat((torch.ones(hypotheses_mask.size()[0],1).to(device), hypotheses_mask), dim=1) + + embedded_premises = premises + embedded_hypotheses = hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_transformer.py b/vaa/model_transformer.py new file mode 100644 index 0000000..ad61b16 --- /dev/null +++ b/vaa/model_transformer.py @@ -0,0 +1,134 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked +from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer +import math + + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. 
+ hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(ESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + premises_mask = (torch.sum(premises, dim=-1) != 0).float() + hypotheses_mask = (torch.sum(hypotheses, dim=-1) != 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits + diff --git a/vaa/model_transformer_top.py b/vaa/model_transformer_top.py new file mode 100644 index 0000000..04ff365 --- /dev/null +++ b/vaa/model_transformer_top.py @@ -0,0 +1,143 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(TOP, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3*self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + # for p in self.parameters(): + # p.requires_grad = False + + self.classification_v = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4*self.hidden_size+self.embedding_dim), + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + + + def forward(self, premises, hypotheses, vulnerability): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + vulnerability = self.linear_vulnerability(vulnerability)#.unsqueeze(dim=1) + # premises = torch.cat((vulnerability, premises), dim=1) + # hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + + # max min 不为0 + premises_mask = ((torch.max(premises, dim=-1)[0]-torch.min(premises, dim=-1)[0]) != 0).float() + hypotheses_mask = ((torch.max(hypotheses, dim=-1)[0]-torch.min(hypotheses, dim=-1)[0])!= 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + # print(premises_mask) + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], 
dim=1) + + logits = self.classification_v(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + diff --git a/vaa/model_transformer_top_embed.py b/vaa/model_transformer_top_embed.py new file mode 100644 index 0000000..a6ec457 --- /dev/null +++ b/vaa/model_transformer_top_embed.py @@ -0,0 +1,140 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3*self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + + + def forward(self, premises, hypotheses, vulnerability): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + vulnerability = self.linear_vulnerability(vulnerability).unsqueeze(dim=1) + premises = torch.cat((vulnerability, premises), dim=1) + hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + + # max min 不为0 + premises_mask = ((torch.max(premises, dim=-1)[0]-torch.min(premises, dim=-1)[0]) != 0).float() + hypotheses_mask = ((torch.max(hypotheses, dim=-1)[0]-torch.min(hypotheses, dim=-1)[0])!= 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + # print(premises_mask) + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + diff --git a/vaa/utils.py b/vaa/utils.py new file mode 100644 index 0000000..37c9054 --- /dev/null +++ b/vaa/utils.py @@ -0,0 +1,179 @@ +""" +Utility functions for the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def sort_by_seq_lens(batch, sequences_lengths, descending=True): + """ + Sort a batch of padded variable length sequences by their length. + + Args: + batch: A batch of padded variable length sequences. The batch should + have the dimensions (batch_size x max_sequence_length x *). + sequences_lengths: A tensor containing the lengths of the sequences in the + input batch. The tensor should be of size (batch_size). + descending: A boolean value indicating whether to sort the sequences + by their lengths in descending order. Defaults to True. + + Returns: + sorted_batch: A tensor containing the input batch reordered by + sequences lengths. + sorted_seq_lens: A tensor containing the sorted lengths of the + sequences in the input batch. + sorting_idx: A tensor containing the indices used to permute the input + batch in order to get 'sorted_batch'. + restoration_idx: A tensor containing the indices that can be used to + restore the order of the sequences in 'sorted_batch' so that it + matches the input batch. 
+ """ + sorted_seq_lens, sorting_index =\ + sequences_lengths.sort(0, descending=descending) + + sorted_batch = batch.index_select(0, sorting_index) + + # idx_range = sequences_lengths.new_tensor(torch.arange(0, len(sequences_lengths))) + idx_range = torch.arange(0, len(sequences_lengths)).to(sequences_lengths.device) + _, reverse_mapping = sorting_index.sort(0, descending=False) + restoration_index = idx_range.index_select(0, reverse_mapping) + + return sorted_batch, sorted_seq_lens, sorting_index, restoration_index + + +def get_mask(sequences_batch, sequences_lengths): + """ + Get the mask for a batch of padded variable length sequences. + + Args: + sequences_batch: A batch of padded variable length sequences + containing word indices. Must be a 2-dimensional tensor of size + (batch, sequence). + sequences_lengths: A tensor containing the lengths of the sequences in + 'sequences_batch'. Must be of size (batch). + + Returns: + A mask of size (batch, max_sequence_length), where max_sequence_length + is the length of the longest sequence in the batch. + """ + batch_size = sequences_batch.size()[0] + max_length = torch.max(sequences_lengths) + mask = torch.ones(batch_size, max_length, dtype=torch.float) + mask[sequences_batch[:, :max_length] == 0] = 0.0 + return mask + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def masked_softmax(tensor, mask): + """ + Apply a masked softmax on the last dimension of a tensor. + The input tensor and mask should be of size (batch, *, sequence_length). + + Args: + tensor: The tensor on which the softmax function must be applied along + the last dimension. + mask: A mask of the same size as the tensor with 0s in the positions of + the values that must be masked and 1s everywhere else. + + Returns: + A tensor of the same size as the inputs containing the result of the + softmax. + """ + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + + # Reshape the mask so it matches the size of the input tensor. + while mask.dim() < tensor.dim(): + mask = mask.unsqueeze(1) + # print(mask.size(), tensor.size()) + mask = mask.expand_as(tensor).contiguous().float() + reshaped_mask = mask.view(-1, mask.size()[-1]) + + result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) + result = result * reshaped_mask + # 1e-13 is added to avoid divisions by zero. + result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + + return result.view(*tensor_shape) + + +def normal_softmax(tensor): + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + result = nn.functional.softmax(reshaped_tensor, dim=-1) + # 1e-13 is added to avoid divisions by zero. + result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + return result.view(*tensor_shape) + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def weighted_sum(tensor, weights, mask): + """ + Apply a weighted sum on the vectors along the last dimension of 'tensor', + and mask the vectors in the result with 'mask'. + + Args: + tensor: A tensor of vectors on which a weighted sum must be applied. + weights: The weights to use in the weighted sum. + mask: A mask to apply on the result of the weighted sum. + + Returns: + A new tensor containing the result of the weighted sum after the mask + has been applied on it. 
+ """ + weighted_sum = weights.bmm(tensor) + + while mask.dim() < weighted_sum.dim(): + mask = mask.unsqueeze(1) + mask = mask.transpose(-1, -2) + mask = mask.expand_as(weighted_sum).contiguous().float() + + return weighted_sum * mask + + +# Code inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def replace_masked(tensor, mask, value): + """ + Replace the all the values of vectors in 'tensor' that are masked in + 'masked' by 'value'. + + Args: + tensor: The tensor in which the masked vectors must have their values + replaced. + mask: A mask indicating the vectors which must have their values + replaced. + value: The value to place in the masked vectors of 'tensor'. + + Returns: + A new tensor of the same size as 'tensor' where the values of the + vectors masked in 'mask' were replaced by 'value'. + """ + mask = mask.unsqueeze(1).transpose(2, 1) + reverse_mask = 1.0 - mask + values_to_add = value * reverse_mask + return tensor * mask + values_to_add + + +def correct_predictions(output_probabilities, targets): + """ + Compute the number of predictions that match some target classes in the + output of a model. + + Args: + output_probabilities: A tensor of probabilities for different output + classes. + targets: The indices of the actual target classes. + + Returns: + The number of correct predictions in 'output_probabilities'. + """ + _, out_classes = output_probabilities.max(dim=1) + correct = (out_classes == targets).sum() + return correct.item()