diff --git a/alreadrun/test_bert_mnli.py b/alreadrun/test_bert_mnli.py index 3ea6e15..44722ea 100644 --- a/alreadrun/test_bert_mnli.py +++ b/alreadrun/test_bert_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_bert_quora.py b/alreadrun/test_bert_quora.py index 2d785b2..d6fc4c3 100644 --- a/alreadrun/test_bert_quora.py +++ b/alreadrun/test_bert_quora.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_two import validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_bert_snli.py b/alreadrun/test_bert_snli.py index 5650a08..60a5d2c 100644 --- a/alreadrun/test_bert_snli.py +++ b/alreadrun/test_bert_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/alreadrun/test_cifar10.py b/alreadrun/test_cifar10.py index 85c7a80..418aa9e 100644 --- a/alreadrun/test_cifar10.py +++ b/alreadrun/test_cifar10.py @@ -6,8 +6,8 @@ import os import torch import torch.nn.functional as F -from a3v.droped.resnet import PreActResNet18 -from a3v.droped.resnet_top import PreActResNet18Top +from vaa.droped.resnet import PreActResNet18 +from vaa.droped.resnet_top import PreActResNet18Top from torch.autograd import Variable import sys from utils.utils_base import creterion_cifar diff --git a/alreadrun/test_esim_quora.py b/alreadrun/test_esim_quora.py index 90aafc3..bb6d056 100644 --- a/alreadrun/test_esim_quora.py +++ b/alreadrun/test_esim_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_esim_quora import validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import os import argparse diff --git a/alreadrun/test_esim_snli.py b/alreadrun/test_esim_snli.py index cc587a0..86e07f5 100644 --- a/alreadrun/test_esim_snli.py +++ b/alreadrun/test_esim_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_esim_snli import validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import os import argparse diff --git a/alreadrun/top_cifar10.py b/alreadrun/top_cifar10.py index 4662fe3..0613490 100644 --- a/alreadrun/top_cifar10.py +++ b/alreadrun/top_cifar10.py @@ -5,8 +5,8 @@ import torchvision.transforms as transforms import os import torch -from a3v.droped.resnet import PreActResNet18 -from a3v.droped.resnet_top import PreActResNet18Top +from vaa.droped.resnet import PreActResNet18 +from vaa.droped.resnet_top import PreActResNet18Top from torch.autograd import Variable import sys diff --git a/bert_mnli.py b/bert_mnli.py index 789ed46..d8efb90 100644 --- a/bert_mnli.py +++ b/bert_mnli.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. 
from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_mnli_test.py b/bert_mnli_test.py index 6cd5cdc..b011cff 100644 --- a/bert_mnli_test.py +++ b/bert_mnli_test.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import test -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_quora.py b/bert_quora.py index 7ace86c..f1c4ec6 100644 --- a/bert_quora.py +++ b/bert_quora.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_quora_loss.py b/bert_quora_loss.py index 9ff85a6..e445d9a 100644 --- a/bert_quora_loss.py +++ b/bert_quora_loss.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train_loss -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/bert_snli.py b/bert_snli.py index 038cc4f..f0d1abc 100644 --- a/bert_snli.py +++ b/bert_snli.py @@ -4,7 +4,7 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.model_transformer import ESIM +from vaa.model_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/esim_mnli.py b/esim_mnli.py index 6727b19..67fba88 100644 --- a/esim_mnli.py +++ b/esim_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_mnli_test.py b/esim_mnli_test.py index 9c6f224..a839168 100644 --- a/esim_mnli_test.py +++ b/esim_mnli_test.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import test -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_quora.py b/esim_quora.py index 8f4a416..bcc8804 100644 --- a/esim_quora.py +++ b/esim_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/esim_snli.py b/esim_snli.py index 2adef59..593dfc7 100644 --- a/esim_snli.py +++ b/esim_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. 
from utils.utils_esim import train, validate -from a3v.model import ESIM -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/scripts/droped/cifar10.py b/scripts/droped/cifar10.py index 4830482..c6b5f1b 100644 --- a/scripts/droped/cifar10.py +++ b/scripts/droped/cifar10.py @@ -6,7 +6,7 @@ import torchvision.transforms as transforms import os import torch -from a3v.droped.resnet import PreActResNet18 +from vaa.droped.resnet import PreActResNet18 # Training def train(epoch): diff --git a/scripts/droped/top_quora_transformer.py b/scripts/droped/top_quora_transformer.py index e47751e..1a3fcf1 100644 --- a/scripts/droped/top_quora_transformer.py +++ b/scripts/droped/top_quora_transformer.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -from a3v.droped import TransformerESIM as ESIM -# from a3v.model_esim import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +from vaa.droped import TransformerESIM as ESIM +# from vaa.model_esim import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/scripts/droped/transformer_quora.py b/scripts/droped/transformer_quora.py index 079d23e..4e932f4 100644 --- a/scripts/droped/transformer_quora.py +++ b/scripts/droped/transformer_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_transformer import train, validate -from a3v.droped import TransformerESIM as ESIM -# from a3v.model_esim import ESIM +from vaa.droped import TransformerESIM as ESIM +# from vaa.model_esim import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/scripts/preprocessing/preprocess_mnli.py b/scripts/preprocessing/preprocess_mnli.py index d0e880b..07cf88f 100644 --- a/scripts/preprocessing/preprocess_mnli.py +++ b/scripts/preprocessing/preprocess_mnli.py @@ -10,7 +10,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_MNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_mnli_bert.py b/scripts/preprocessing/preprocess_mnli_bert.py index b4f3804..7c7c55a 100644 --- a/scripts/preprocessing/preprocess_mnli_bert.py +++ b/scripts/preprocessing/preprocess_mnli_bert.py @@ -10,7 +10,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_MNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_quora.py b/scripts/preprocessing/preprocess_quora.py index 3ec1147..f376ef3 100644 --- a/scripts/preprocessing/preprocess_quora.py +++ b/scripts/preprocessing/preprocess_quora.py @@ -7,7 +7,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_quora_data(inputdir, diff --git a/scripts/preprocessing/preprocess_quora_bert.py b/scripts/preprocessing/preprocess_quora_bert.py index 2d7b607..3a7f667 100644 --- a/scripts/preprocessing/preprocess_quora_bert.py +++ b/scripts/preprocessing/preprocess_quora_bert.py @@ -7,7 +7,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_quora_data(inputdir, diff --git a/scripts/preprocessing/preprocess_snli.py b/scripts/preprocessing/preprocess_snli.py index b103bc8..db77f26 100644 --- 
a/scripts/preprocessing/preprocess_snli.py +++ b/scripts/preprocessing/preprocess_snli.py @@ -9,7 +9,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_SNLI_data(inputdir, diff --git a/scripts/preprocessing/preprocess_snli_bert.py b/scripts/preprocessing/preprocess_snli_bert.py index 805925f..389ec48 100644 --- a/scripts/preprocessing/preprocess_snli_bert.py +++ b/scripts/preprocessing/preprocess_snli_bert.py @@ -9,7 +9,7 @@ import fnmatch import json -from a3v.data import Preprocessor +from vaa.data import Preprocessor def preprocess_SNLI_data(inputdir, diff --git a/scripts/testing/test_mnli.py b/scripts/testing/test_mnli.py index f67989f..ea23b69 100644 --- a/scripts/testing/test_mnli.py +++ b/scripts/testing/test_mnli.py @@ -10,8 +10,8 @@ import json from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import NLIDataset +from vaa.model import ESIM def predict(model, dataloader, labeldict): diff --git a/scripts/testing/test_quora.py b/scripts/testing/test_quora.py index 5cff3f6..c516f3f 100644 --- a/scripts/testing/test_quora.py +++ b/scripts/testing/test_quora.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM -from a3v.utils import correct_predictions +from vaa.data import NLIDataset +from vaa.model import ESIM +from vaa.utils import correct_predictions from sklearn import metrics diff --git a/scripts/testing/test_snli.py b/scripts/testing/test_snli.py index 395ff23..0510a82 100644 --- a/scripts/testing/test_snli.py +++ b/scripts/testing/test_snli.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM -from a3v.utils import correct_predictions +from vaa.data import NLIDataset +from vaa.model import ESIM +from vaa.utils import correct_predictions def test(model, dataloader): diff --git a/scripts/training/test_quora_elmo.py b/scripts/training/test_quora_elmo.py index d66d4b5..f05426c 100644 --- a/scripts/training/test_quora_elmo.py +++ b/scripts/training/test_quora_elmo.py @@ -10,10 +10,10 @@ import argparse import torch import numpy as np -from a3v.data import ElmoDataset +from vaa.data import ElmoDataset from torch.utils.data import DataLoader -from a3v.model_elmo import ESIM -from a3v.utils import correct_predictions +from vaa.model_elmo import ESIM +from vaa.utils import correct_predictions from sklearn import metrics from allennlp.modules.elmo import batch_to_ids diff --git a/scripts/training/test_snli_elmo.py b/scripts/training/test_snli_elmo.py index 8fd0396..acdaf40 100644 --- a/scripts/training/test_snli_elmo.py +++ b/scripts/training/test_snli_elmo.py @@ -10,10 +10,10 @@ import argparse import torch import numpy as np -from a3v.data import ElmoDataset +from vaa.data import ElmoDataset from torch.utils.data import DataLoader -from a3v.model_elmo2 import ESIM -from a3v.utils import correct_predictions +from vaa.model_elmo2 import ESIM +from vaa.utils import correct_predictions from sklearn import metrics from allennlp.modules.elmo import batch_to_ids diff --git a/scripts/training/train_mnli.py b/scripts/training/train_mnli.py index 3726039..a0bdf2b 100644 --- a/scripts/training/train_mnli.py +++ b/scripts/training/train_mnli.py @@ -13,8 +13,8 @@ import torch.nn as nn from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import 
NLIDataset +from vaa.model import ESIM from utils.utils_esim import train, validate diff --git a/scripts/training/train_snli.py b/scripts/training/train_snli.py index 9b946fb..fda04ab 100644 --- a/scripts/training/train_snli.py +++ b/scripts/training/train_snli.py @@ -13,8 +13,8 @@ import torch.nn as nn from torch.utils.data import DataLoader -from a3v.data import NLIDataset -from a3v.model import ESIM +from vaa.data import NLIDataset +from vaa.model import ESIM from utils.utils_esim import train, validate diff --git a/test_bert_mnli.py b/test_bert_mnli.py index 3ea6e15..44722ea 100644 --- a/test_bert_mnli.py +++ b/test_bert_mnli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/test_bert_snli.py b/test_bert_snli.py index 5650a08..60a5d2c 100644 --- a/test_bert_snli.py +++ b/test_bert_snli.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.runned.utils_test_three import validate -from a3v.model_transformer import ESIM -# from a3v.model_bert_transformer import ESIM +from vaa.model_transformer import ESIM +# from vaa.model_bert_transformer import ESIM import os import argparse import json diff --git a/top_bert_mnli.py b/top_bert_mnli.py index 37e05fe..b234f06 100644 --- a/top_bert_mnli.py +++ b/top_bert_mnli.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_mnli_test.py b/top_bert_mnli_test.py index d610d22..b39f340 100644 --- a/top_bert_mnli_test.py +++ b/top_bert_mnli_test.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import test -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_quora.py b/top_bert_quora.py index a3081db..03951b8 100644 --- a/top_bert_quora.py +++ b/top_bert_quora.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_quora_loss.py b/top_bert_quora_loss.py index dd96387..c8018eb 100644 --- a/top_bert_quora_loss.py +++ b/top_bert_quora_loss.py @@ -4,8 +4,8 @@ # Aurelien Coet, 2018. 
from utils.utils_top_transformer import train_loss -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_bert_snli.py b/top_bert_snli.py index f5a260c..23d2e72 100644 --- a/top_bert_snli.py +++ b/top_bert_snli.py @@ -4,10 +4,10 @@ # Aurelien Coet, 2018. from utils.utils_top_transformer import train, validate -# from a3v.model_transformer import TransformerESIM as ESIM -from a3v.model_transformer import ESIM -from a3v.model_transformer_top import TOP -# from a3v.model_bert_transformer import ESIM +# from vaa.model_transformer import TransformerESIM as ESIM +from vaa.model_transformer import ESIM +from vaa.model_transformer_top import TOP +# from vaa.model_bert_transformer import ESIM import torch.nn as nn import matplotlib.pyplot as plt import os diff --git a/top_esim_mnli.py b/top_esim_mnli.py index 6c4d731..96fbc89 100644 --- a/top_esim_mnli.py +++ b/top_esim_mnli.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_mnli_test.py b/top_esim_mnli_test.py index 34123e7..5c75d1c 100644 --- a/top_esim_mnli_test.py +++ b/top_esim_mnli_test.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate, test -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_quora.py b/top_esim_quora.py index 6ba2a68..037fcb2 100644 --- a/top_esim_quora.py +++ b/top_esim_quora.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/top_esim_snli.py b/top_esim_snli.py index 27e4a7f..7ecb53e 100644 --- a/top_esim_snli.py +++ b/top_esim_snli.py @@ -4,9 +4,9 @@ # Aurelien Coet, 2018. 
from utils.utils_top_esim import train, validate -from a3v.model import ESIM -from a3v.model_top import TOP -from a3v.data import NLIDataset +from vaa.model import ESIM +from vaa.model_top import TOP +from vaa.data import NLIDataset from torch.utils.data import DataLoader import torch.nn as nn import matplotlib.pyplot as plt diff --git a/utils/droped/utils_transformer_new.py b/utils/droped/utils_transformer_new.py index 556a9df..98e68e8 100644 --- a/utils/droped/utils_transformer_new.py +++ b/utils/droped/utils_transformer_new.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient from transformers import * diff --git a/utils/runned/utils_test_esim_quora.py b/utils/runned/utils_test_esim_quora.py index a582719..3664c06 100644 --- a/utils/runned/utils_test_esim_quora.py +++ b/utils/runned/utils_test_esim_quora.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_esim_snli.py b/utils/runned/utils_test_esim_snli.py index 7d837ea..a4c9c88 100644 --- a/utils/runned/utils_test_esim_snli.py +++ b/utils/runned/utils_test_esim_snli.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_three.py b/utils/runned/utils_test_three.py index 48e73c3..301edab 100644 --- a/utils/runned/utils_test_three.py +++ b/utils/runned/utils_test_three.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/runned/utils_test_two.py b/utils/runned/utils_test_two.py index 44addb4..1c8727b 100644 --- a/utils/runned/utils_test_two.py +++ b/utils/runned/utils_test_two.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import pandas as pd -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient import numpy as np from torch.autograd import Variable diff --git a/utils/utils_esim.py b/utils/utils_esim.py index 1e97a05..ed3e0f9 100644 --- a/utils/utils_esim.py +++ b/utils/utils_esim.py @@ -9,7 +9,7 @@ import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions def train(model, diff --git a/utils/utils_top_esim.py b/utils/utils_top_esim.py index 71183c3..39030f0 100644 --- a/utils/utils_top_esim.py +++ b/utils/utils_top_esim.py @@ -5,7 +5,7 @@ import time import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from utils.utils_base import * def train(model, diff --git a/utils/utils_top_transformer.py b/utils/utils_top_transformer.py index 244db3b..78c38f4 100644 --- a/utils/utils_top_transformer.py +++ b/utils/utils_top_transformer.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn from tqdm import tqdm -from a3v.utils import correct_predictions +from 
vaa.utils import correct_predictions from bert_serving.client import BertClient from utils.utils_base import * diff --git a/utils/utils_transformer.py b/utils/utils_transformer.py index 3f1959a..44da590 100644 --- a/utils/utils_transformer.py +++ b/utils/utils_transformer.py @@ -6,7 +6,7 @@ import torch.nn as nn import pandas as pd from tqdm import tqdm -from a3v.utils import correct_predictions +from vaa.utils import correct_predictions from bert_serving.client import BertClient diff --git a/vaa/__init__.py b/vaa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vaa/data.py b/vaa/data.py new file mode 100644 index 0000000..85b1d56 --- /dev/null +++ b/vaa/data.py @@ -0,0 +1,654 @@ +""" +Preprocessor and dataset definition for NLI. +""" +# Aurelien Coet, 2018. + +import string +import torch +import numpy as np + +from collections import Counter +from torch.utils.data import Dataset +from allennlp.modules.elmo import Elmo, batch_to_ids +from sklearn.preprocessing import LabelEncoder + + +class Preprocessor(object): + """ + Preprocessor class for Natural Language Inference datasets. + + The class can be used to read NLI datasets, build worddicts for them + and transform their premises, hypotheses and labels into lists of + integer indices. + """ + + def __init__(self, + lowercase=False, + ignore_punctuation=False, + num_words=None, + stopwords=[], + labeldict={}, + bos=None, + eos=None): + """ + Args: + lowercase: A boolean indicating whether the words in the datasets + being preprocessed must be lowercased or not. Defaults to + False. + ignore_punctuation: A boolean indicating whether punctuation must + be ignored or not in the datasets preprocessed by the object. + num_words: An integer indicating the number of words to use in the + worddict of the object. If set to None, all the words in the + data are kept. Defaults to None. + stopwords: A list of words that must be ignored when building the + worddict for a dataset. Defaults to an empty list. + bos: A string indicating the symbol to use for the 'beginning of + sentence' token in the data. If set to None, the token isn't + used. Defaults to None. + eos: A string indicating the symbol to use for the 'end of + sentence' token in the data. If set to None, the token isn't + used. Defaults to None. + """ + self.lowercase = lowercase + self.ignore_punctuation = ignore_punctuation + self.num_words = num_words + self.stopwords = stopwords + self.labeldict = labeldict + self.bos = bos + self.eos = eos + + def read_data(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as SNLI's .txt files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the SNLI (and MultiNLI) dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + parentheses_table = str.maketrans({"(": None, ")": None}) + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + # Ignore the headers on the first line of the file. 
+ next(input_data) + + for line in input_data: + line = line.strip().split("\t") + + # Ignore sentences that have no gold label. + if line[0] == "-": + continue + + pair_id = line[7] + premise = line[1] + hypothesis = line[2] + + # Remove '(' and ')' from the premises and hypotheses. + premise = premise.translate(parentheses_table) + hypothesis = hypothesis.translate(parentheses_table) + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + # labels = list(LabelEncoder().fit_transform(labels)) + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_bert(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as SNLI's .txt files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the SNLI (and MultiNLI) dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + parentheses_table = str.maketrans({"(": None, ")": None}) + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + # Ignore the headers on the first line of the file. + next(input_data) + + for line in input_data: + line = line.strip().split("\t") + + # Ignore sentences that have no gold label. + if line[0] == "-": + continue + + pair_id = line[7] + premise = line[1] + hypothesis = line[2] + + # Remove '(' and ')' from the premises and hypotheses. + premise = premise.translate(parentheses_table) + hypothesis = hypothesis.translate(parentheses_table) + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append(premise.rstrip()) + hypotheses.append(hypothesis.rstrip()) + labels.append(line[0]) + ids.append(pair_id) + label_encoder = LabelEncoder() + labels = list(label_encoder.fit_transform(labels)) + # print(label_encoder.classes_) + # print(labels[0]) + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. 
+ """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora_bert(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. + punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append(premise.rstrip()) + hypotheses.append(hypothesis.rstrip()) + labels.append(line[0]) + ids.append(pair_id) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def read_data_quora_balance(self, filepath): + """ + Read the premises, hypotheses and labels from some NLI dataset's + file and return them in a dictionary. The file should be in the same + form as quora's .tsv files. + + Args: + filepath: The path to a file containing some premises, hypotheses + and labels that must be read. The file should be formatted in + the same way as the quora dataset. + + Returns: + A dictionary containing three lists, one for the premises, one for + the hypotheses, and one for the labels in the input data. + """ + with open(filepath, "r", encoding="utf8") as input_data: + ids, premises, hypotheses, labels = [], [], [], [] + + # Translation tables to remove parentheses and punctuation from + # strings. 
+ punct_table = str.maketrans({key: " " + for key in string.punctuation}) + + for line in input_data: + line = line.strip().split("\t") + + pair_id = line[3] + premise = line[1] + hypothesis = line[2] + + if self.lowercase: + premise = premise.lower() + hypothesis = hypothesis.lower() + + if self.ignore_punctuation: + premise = premise.translate(punct_table) + hypothesis = hypothesis.translate(punct_table) + + # Each premise and hypothesis is split into a list of words. + premises.append([w for w in premise.rstrip().split() + if w not in self.stopwords]) + hypotheses.append([w for w in hypothesis.rstrip().split() + if w not in self.stopwords]) + labels.append(line[0]) + ids.append(pair_id) + + array_labels = np.array([int(x) for x in labels]) + index = np.arange(len(array_labels))[array_labels == 1] + np.random.shuffle(index) + for i in range((array_labels == 0).sum() - (array_labels == 1).sum()): + idx = index[i] + ids.append(ids[idx]) + premises.append(hypotheses[idx]) + hypotheses.append(premises[idx]) + labels.append(labels[idx]) + + return {"ids": ids, + "premises": premises, + "hypotheses": hypotheses, + "labels": labels} + + def build_worddict(self, data): + """ + Build a dictionary associating words to unique integer indices for + some dataset. The worddict can then be used to transform the words + in datasets to their indices. + + Args: + data: A dictionary containing the premises, hypotheses and + labels of some NLI dataset, in the format returned by the + 'read_data' method of the Preprocessor class. + """ + words = [] + [words.extend(sentence) for sentence in data["premises"]] + [words.extend(sentence) for sentence in data["hypotheses"]] + + counts = Counter(words) + num_words = self.num_words + if self.num_words is None: + num_words = len(counts) + + self.worddict = {} + + # Special indices are used for padding, out-of-vocabulary words, and + # beginning and end of sentence tokens. + self.worddict["_PAD_"] = 0 + self.worddict["_OOV_"] = 1 + + offset = 2 + if self.bos: + self.worddict["_BOS_"] = 2 + offset += 1 + if self.eos: + self.worddict["_EOS_"] = 3 + offset += 1 + + for i, word in enumerate(counts.most_common(num_words)): + self.worddict[word[0]] = i + offset + + if self.labeldict == {}: + label_names = set(data["labels"]) + if len(label_names)==3: + label_names = ['entailment', 'neutral', 'contradiction'] + self.labeldict = {label_name: i + for i, label_name in enumerate(label_names)} + print('label_dict',self.labeldict) + + def words_to_indices(self, sentence): + """ + Transform the words in a sentence to their corresponding integer + indices. + + Args: + sentence: A list of words that must be transformed to indices. + + Returns: + A list of indices. + """ + indices = [] + # Include the beggining of sentence token at the start of the sentence + # if one is defined. + if self.bos: + indices.append(self.worddict["_BOS_"]) + + for word in sentence: + if word in self.worddict: + index = self.worddict[word] + else: + # Words absent from 'worddict' are treated as a special + # out-of-vocabulary word (OOV). + index = self.worddict["_OOV_"] + indices.append(index) + # Add the end of sentence token at the end of the sentence if one + # is defined. + if self.eos: + indices.append(self.worddict["_EOS_"]) + + return indices + + def indices_to_words(self, indices): + """ + Transform the indices in a list to their corresponding words in + the object's worddict. + + Args: + indices: A list of integer indices corresponding to words in + the Preprocessor's worddict. 
+ + Returns: + A list of words. + """ + return [list(self.worddict.keys())[list(self.worddict.values()) + .index(i)] + for i in indices] + + def transform_to_indices(self, data): + """ + Transform the words in the premises and hypotheses of a dataset, as + well as their associated labels, to integer indices. + + Args: + data: A dictionary containing lists of premises, hypotheses + and labels, in the format returned by the 'read_data' + method of the Preprocessor class. + + Returns: + A dictionary containing the transformed premises, hypotheses and + labels. + """ + transformed_data = {"ids": [], + "premises": [], + "hypotheses": [], + "labels": []} + + for i, premise in enumerate(data["premises"]): + # Ignore sentences that have a label for which no index was + # defined in 'labeldict'. + label = data["labels"][i] + if label not in self.labeldict and label != "hidden": + continue + + transformed_data["ids"].append(data["ids"][i]) + + if label == "hidden": + transformed_data["labels"].append(-1) + else: + transformed_data["labels"].append(self.labeldict[label]) + + indices = self.words_to_indices(premise) + transformed_data["premises"].append(indices) + + indices = self.words_to_indices(data["hypotheses"][i]) + transformed_data["hypotheses"].append(indices) + return transformed_data + + def build_embedding_matrix(self, embeddings_file): + """ + Build an embedding matrix with pretrained weights for object's + worddict. + + Args: + embeddings_file: A file containing pretrained word embeddings. + + Returns: + A numpy matrix of size (num_words+n_special_tokens, embedding_dim) + containing pretrained word embeddings (the +n_special_tokens is for + the padding and out-of-vocabulary tokens, as well as BOS and EOS if + they're used). + """ + # Load the word embeddings in a dictionnary. + embeddings = {} + with open(embeddings_file, "r", encoding="utf8") as input_data: + for line in input_data: + line = line.split() + + try: + # Check that the second element on the line is the start + # of the embedding and not another word. Necessary to + # ignore multiple word lines. + float(line[1]) + word = line[0] + if word in self.worddict: + embeddings[word] = line[1:] + + # Ignore lines corresponding to multiple words separated + # by spaces. + except ValueError: + continue + + num_words = len(self.worddict) + embedding_dim = len(list(embeddings.values())[0]) + embedding_matrix = np.zeros((num_words, embedding_dim)) + + # Actual building of the embedding matrix. + missed = 0 + for word, i in self.worddict.items(): + if word in embeddings: + embedding_matrix[i] = np.array(embeddings[word], dtype=float) + else: + if word == "_PAD_": + continue + missed += 1 + # Out of vocabulary words are initialised with random gaussian + # samples. + embedding_matrix[i] = np.random.normal(size=(embedding_dim)) + print("Missed words: ", missed) + + return embedding_matrix + + def build_embedding_matrix_elmo(self, options_file, weight_file, embedding_dim=1024): + """ + Build an embedding matrix with pretrained weights for object's + worddict. + + Args: + embeddings_file: A file containing pretrained word embeddings. + + Returns: + A numpy matrix of size (num_words+n_special_tokens, embedding_dim) + containing pretrained word embeddings (the +n_special_tokens is for + the padding and out-of-vocabulary tokens, as well as BOS and EOS if + they're used). 
+ """ + options_file = options_file + weight_file = weight_file + elmo = Elmo(options_file, weight_file, 1, dropout=0) + + num_words = len(self.worddict) + embedding_matrix = np.zeros((num_words, embedding_dim)) + + print(len(self.worddict)) + # Actual building of the embedding matrix. + for word, i in self.worddict.items(): + embedding_word = elmo(batch_to_ids([[word]]))['elmo_representations'][0].squeeze() + embedding_matrix[i] = np.array(embedding_word.detach().cpu(), dtype=float) + if (i+1) % 100 == 0: + print(i/len(self.worddict)) + return embedding_matrix + + +class ElmoDataset(Dataset): + def __init__(self, + data): + self.data = data + self.num_sequences = len(data["premises"]) + + def __len__(self): + return self.num_sequences + + def __getitem__(self, index): + return { + "premises": self.data["premises"][index], + "hypotheses": self.data["hypotheses"][index], + "labels": self.data["labels"][index]} + + +class NLIDataset(Dataset): + """ + Dataset class for Natural Language Inference datasets. + + The class can be used to read preprocessed datasets where the premises, + hypotheses and labels have been transformed to unique integer indices + (this can be done with the 'preprocess_data' script in the 'scripts' + folder of this repository). + """ + + def __init__(self, + data, + padding_idx=0, + max_premise_length=None, + max_hypothesis_length=None): + """ + Args: + data: A dictionary containing the preprocessed premises, + hypotheses and labels of some dataset. + padding_idx: An integer indicating the index being used for the + padding token in the preprocessed data. Defaults to 0. + max_premise_length: An integer indicating the maximum length + accepted for the sequences in the premises. If set to None, + the length of the longest premise in 'data' is used. + Defaults to None. + max_hypothesis_length: An integer indicating the maximum length + accepted for the sequences in the hypotheses. If set to None, + the length of the longest hypothesis in 'data' is used. + Defaults to None. 
+ """ + self.premises_lengths = [len(seq) for seq in data["premises"]] + self.max_premise_length = max_premise_length + if self.max_premise_length is None: + self.max_premise_length = max(self.premises_lengths) + + self.hypotheses_lengths = [len(seq) for seq in data["hypotheses"]] + self.max_hypothesis_length = max_hypothesis_length + if self.max_hypothesis_length is None: + self.max_hypothesis_length = max(self.hypotheses_lengths) + + self.num_sequences = len(data["premises"]) + + self.data = {"ids": [], + "premises": torch.ones((self.num_sequences, + self.max_premise_length), + dtype=torch.long) * padding_idx, + "hypotheses": torch.ones((self.num_sequences, + self.max_hypothesis_length), + dtype=torch.long) * padding_idx, + "labels": torch.tensor(data["labels"], dtype=torch.long)} + + for i, premise in enumerate(data["premises"]): + self.data["ids"].append(data["ids"][i]) + end = min(len(premise), self.max_premise_length) + self.data["premises"][i][:end] = torch.tensor(premise[:end]) + + hypothesis = data["hypotheses"][i] + end = min(len(hypothesis), self.max_hypothesis_length) + self.data["hypotheses"][i][:end] = torch.tensor(hypothesis[:end]) + + def __len__(self): + return self.num_sequences + + def __getitem__(self, index): + return {"id": self.data["ids"][index], + "premise": self.data["premises"][index], + "premise_length": min(self.premises_lengths[index], + self.max_premise_length), + "hypothesis": self.data["hypotheses"][index], + "hypothesis_length": min(self.hypotheses_lengths[index], + self.max_hypothesis_length), + "label": self.data["labels"][index]} diff --git a/vaa/droped/__init__.py b/vaa/droped/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vaa/droped/droped.py b/vaa/droped/droped.py new file mode 100644 index 0000000..511a375 --- /dev/null +++ b/vaa/droped/droped.py @@ -0,0 +1,179 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention +from vaa.utils import replace_masked +import math +from torch.nn.modules.transformer import * + +# Temporarily leave PositionalEncoding module here. Will be moved somewhere else. +class PositionalEncoding(nn.Module): + r"""Inject some information about the relative or absolute position of the tokens + in the sequence. The positional encodings have the same dimension as + the embeddings, so that the two can be summed. Here, we use sine and cosine + functions of different frequencies. + .. math:: + \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model)) + \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model)) + \text{where pos is the word position and i is the embed idx) + Args: + d_model: the embed dim (required). + dropout: the dropout value (default=0.1). + max_len: the max. length of the incoming sequence (default=5000). 
+ Examples: + >>> pos_encoder = PositionalEncoding(d_model) + """ + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + r"""Inputs of forward function + Args: + x: the sequence fed to the positional encoder model (required). + Shape: + x: [sequence length, batch size, embed dim] + output: [sequence length, batch size, embed dim] + Examples: + >>> output = pos_encoder(x) + """ + + x = x + self.pe[:x.size(0), :] + return self.dropout(x) + +class TransformerESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TransformerESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + # if self.dropout: + # self._rnn_dropout = RNNDropout(p=self.dropout) + # + # + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + # self._composition = Seq2SeqEncoder(nn.LSTM, + # self.hidden_size, + # self.hidden_size, + # bidirectional=True) + + self.pos_encoder = PositionalEncoding(self.hidden_size, self.dropout) + encoder_layers = TransformerEncoderLayer(d_model=384, nhead=8) + self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers=6) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + def _generate_square_subsequent_mask(self, sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. 
+ hypotheses: A batch of variable length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + premises_mask_key = (torch.sum(premises, dim=-1) == 0) # positions that should be masked are True + hypotheses_mask_key = (torch.sum(hypotheses, dim=-1) == 0) + premises_mask = 1 - premises_mask_key.float() # positions to keep (not masked) are 1 + hypotheses_mask = 1 - hypotheses_mask_key.float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + + projected_premises, projected_hypotheses = self._attention(premises, premises_mask, + hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + # projected_premises = self.pos_encoder(projected_premises.transpose(1, 0).contiguous()) + # projected_hypotheses = self.pos_encoder(projected_hypotheses.transpose(1, 0).contiguous()) + # mask1 = self._generate_square_subsequent_mask(len(projected_premises)).to(projected_premises.device) + # mask2 = self._generate_square_subsequent_mask(len(projected_hypotheses)).to(projected_hypotheses.device) + # v_ai = self.transformer_encoder(projected_premises).transpose(1, 0).contiguous() + # v_bj = self.transformer_encoder(projected_hypotheses).transpose(1, 0).contiguous() + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1).transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1).transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits + diff --git a/vaa/droped/layers.py b/vaa/droped/layers.py new file mode 100644 index 0000000..fecef0b --- /dev/null +++ b/vaa/droped/layers.py @@ -0,0 +1,421 @@ +""" +Definition of custom layers for the ESIM model. +""" +# Aurelien Coet, 2018. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.activation import MultiheadAttention +from vaa.utils import sort_by_seq_lens, masked_softmax, weighted_sum, normal_softmax + +# Class widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/modules/input_variational_dropout.py +class RNNDropout(nn.Dropout): + """ + Dropout layer for the inputs of RNNs. + + Apply the same dropout mask to all the elements of the same sequence in + a batch of sequences of size (batch, sequences_length, embedding_dim). + """ + + def forward(self, sequences_batch): + """ + Apply dropout to the input batch of sequences. + + Args: + sequences_batch: A batch of sequences of vectors that will serve + as input to an RNN.
+ Tensor of size (batch, sequences_length, emebdding_dim). + + Returns: + A new tensor on which dropout has been applied. + """ + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], + sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, + inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch + + +# class TransformerEncoder(nn.Module): +# """ +# RNN taking variable length padded sequences of vectors as input and +# encoding them into padded sequences of vectors of the same length. +# +# This module is useful to handle batches of padded sequences of vectors +# that have different lengths and that need to be passed through a RNN. +# The sequences are sorted in descending order of their lengths, packed, +# passed through the RNN, and the resulting sequences are then padded and +# permuted back to the original order of the input sequences. +# """ +# +# def __init__(self, +# input_size, +# nhead=4, +# num_layers=1): +# +# super(TransformerEncoder, self).__init__() +# +# self.input_size = input_size +# self.nhead = nhead +# self.num_layers = num_layers +# self._encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer( +# self.input_size, nhead=nhead), num_layers=num_layers) +# +# def forward(self, sequences_batch, sequences_lengths): +# sequences_batch = sequences_batch.transpose(1, 0).contiguous() +# outputs = self._encoder(sequences_batch) +# outputs = outputs.transpose(1, 0).contiguous() +# +# sorted_batch, sorted_lengths, _, restoration_idx =\ +# sort_by_seq_lens(outputs, sequences_lengths) +# packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, sorted_lengths, batch_first=True) +# outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, batch_first=True) +# return outputs + +class LinerEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + input_size, + hidden_size, + dropout=0.0): + super(LinerEncoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self._encoder = nn.Linear(input_size, hidden_size*2) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. 
+ """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + + outputs = self._encoder(outputs) + + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class LengthEncoder(nn.Module): + + def forward(self, sequences_batch, sequences_lengths): + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. + dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoder, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. 
+ """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, _ = self._encoder(packed_batch, None) + + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + + +class SoftmaxAttention(nn.Module): + + def __init__(self, hidden_size, dropout=0.5): + super(SoftmaxAttention, self).__init__() + # self.multi_head_attn = MultiheadAttention(hidden_size*2, 8) + self.liner1 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner2 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner3 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner4 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner5 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner6 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner7 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner8 = nn.Sequential(nn.Linear(hidden_size * 2, hidden_size // 2)) + self.liner = nn.Sequential(nn.Linear(hidden_size * 4, hidden_size * 4), nn.ReLU(), RNNDropout(p=dropout)) + + # self._enhance = nn.Sequential(nn.Linear(2*7*2*hidden_size, 7*2*hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + self._projection = nn.Sequential(nn.Linear(7*2*hidden_size, hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + + # self.Wb_inter = torch.nn.Parameter(torch.randn(hidden_size*2, hidden_size*2), requires_grad=True) + # self.Wb_intra = torch.nn.Parameter(torch.randn(hidden_size * 2, hidden_size * 2), requires_grad=True) + + """ + Attention layer taking premises and hypotheses encoded by an RNN as input + and computing the soft attention between their elements. + The dot product of the encoded vectors in the premises and hypotheses is + first computed. The softmax of the result is then used in a weighted sum + of the vectors of the premises for each element of the hypotheses, and + conversely for the elements of the premises. + """ + + def forward(self, + premise_batch, + premise_mask, + hypothesis_batch, + hypothesis_mask): + """ + Args: + premise_batch: A batch of sequences of vectors representing the + premises in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + premise_mask: A mask for the sequences in the premise batch, to + ignore padding data in the sequences during the computation of + the attention. + hypothesis_batch: A batch of sequences of vectors representing the + hypotheses in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + hypothesis_mask: A mask for the sequences in the hypotheses batch, + to ignore padding data in the sequences during the computation + of the attention. + + Returns: + attended_premises: The sequences of attention vectors for the + premises in the input batch. + attended_hypotheses: The sequences of attention vectors for the + hypotheses in the input batch. 
+ """ + # dot attn + enhanced_premises0, enhanced_hypotheses0 = self.dot_attn(premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask) + # # bilinear attn + # enhanced_premises1, enhanced_hypotheses1 = self.bilinear_attn(premise_batch, premise_mask, + # hypothesis_batch, hypothesis_mask) + # + # enhanced_premises = self._enhance(torch.cat((enhanced_premises0, enhanced_premises1), dim=-1)) + # enhanced_hypotheses = self._enhance(torch.cat((enhanced_hypotheses0, enhanced_hypotheses1), dim=-1)) + + projected_premises = self._projection(enhanced_premises0) + projected_hypotheses = self._projection(enhanced_hypotheses0) + + return projected_premises, projected_hypotheses + + def bilinear_attn(self, premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask): + # inter-attention Softmax attention weights. + Wb_inter = self.Wb_inter.repeat(premise_batch.size()[0], 1, 1) + Wb_intra = self.Wb_intra.repeat(premise_batch.size()[0], 1, 1) + + similarity_matrix = premise_batch.bmm(Wb_inter).bmm(hypothesis_batch.transpose(2, 1).contiguous()) + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + self_premises_matrix = premise_batch.bmm(Wb_intra).bmm(premise_batch.transpose(2, 1).contiguous()) + self_hypotheses_matrix = hypothesis_batch.bmm(Wb_intra).bmm(hypothesis_batch.transpose(2, 1).contiguous()) + self_premises_attn = normal_softmax(self_premises_matrix) + self_hypotheses_attn = normal_softmax(self_hypotheses_matrix) + self_premises = self_premises_attn.bmm(premise_batch) + self_hypotheses = self_hypotheses_attn.bmm(hypothesis_batch) + + # attn_importance + premise_importance = torch.sum(self_premises_attn, dim=-2).unsqueeze(-1) + hypotheses_importance = torch.sum(self_hypotheses_attn, dim=-2).unsqueeze(-1) + inter_hypotheses_importance = torch.sum(prem_hyp_attn, dim=-2).unsqueeze(-1) + inter_premise_importance = torch.sum(hyp_prem_attn, dim=-2).unsqueeze(-1) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch, attended_premises, + attended_hypotheses, self_premises, self_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + def dot_attn(self, premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask): + sqrt_dim = np.sqrt(premise_batch.size()[2]) + # inter-attention Softmax attention weights. 
+ similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + self_premises_matrix = premise_batch.bmm(premise_batch.transpose(2, 1).contiguous()) / sqrt_dim + self_hypotheses_matrix = hypothesis_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + + self_premises_attn = normal_softmax(self_premises_matrix) + self_hypotheses_attn = normal_softmax(self_hypotheses_matrix) + self_premises = self_premises_attn.bmm(premise_batch) + self_hypotheses = self_hypotheses_attn.bmm(hypothesis_batch) + + # attn_importance max + premise_importance = torch.sum(self_premises_attn, dim=-2).unsqueeze(-1) + hypotheses_importance = torch.sum(self_hypotheses_attn, dim=-2).unsqueeze(-1) + inter_hypotheses_importance = torch.sum(prem_hyp_attn, dim=-2).unsqueeze(-1) + inter_premise_importance = torch.sum(hyp_prem_attn, dim=-2).unsqueeze(-1) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch, attended_premises, + attended_hypotheses, self_premises, self_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + + def multi_importance(self, premise_importance, hypotheses_importance, + inter_premise_importance, inter_hypotheses_importance, + premise_batch, hypothesis_batch,attended_premises, + attended_hypotheses, self_premises, self_hypotheses): + # attn1 + prem_all_attn1 = premise_importance * inter_premise_importance + hyp_all_attn1 = hypotheses_importance * inter_hypotheses_importance + attended_premises1 = self.liner1(premise_batch * prem_all_attn1) + attended_hypotheses1 = self.liner1(hypothesis_batch * hyp_all_attn1) + + # attn2 + prem_all_attn2 = premise_importance + inter_premise_importance + hyp_all_attn2 = hypotheses_importance + inter_hypotheses_importance + attended_premises2 = self.liner2(premise_batch * prem_all_attn2) + attended_hypotheses2 = self.liner2(hypothesis_batch * hyp_all_attn2) + # attn3 + prem_all_attn3 = torch.max(premise_importance, inter_premise_importance) + hyp_all_attn3 = torch.max(hypotheses_importance, inter_hypotheses_importance) + attended_premises3 = self.liner3(premise_batch * prem_all_attn3) + attended_hypotheses3 = self.liner3(hypothesis_batch * hyp_all_attn3) + # attn4 + attended_premises4_1 = premise_batch * premise_importance + attended_premises4_2 = premise_batch * inter_premise_importance + attended_premises4 = self.liner4(torch.max(attended_premises4_1, attended_premises4_2)) + attended_hypotheses4_1 = hypothesis_batch * hypotheses_importance + attended_hypotheses4_2 = hypothesis_batch * inter_hypotheses_importance + attended_hypotheses4 = self.liner4(torch.max(attended_hypotheses4_1, attended_hypotheses4_2)) + # attn5 + attended_premises5 = self.liner5(premise_batch * (prem_all_attn1 + 1)) + attended_hypotheses5 = self.liner5(hypothesis_batch * (hyp_all_attn1 + 1)) + # attn6 + attended_premises6 = self.liner6(premise_batch * (prem_all_attn2 + 1)) + attended_hypotheses6 = self.liner6(hypothesis_batch * (hyp_all_attn2 + 1)) + # attn7 + attended_premises7 = self.liner7(premise_batch * (prem_all_attn3 + 1)) + attended_hypotheses7 = 
self.liner7(hypothesis_batch * (hyp_all_attn3 + 1)) + # attn8 + attended_premises8 = self.liner8(torch.max(attended_premises4_1, attended_premises4_2) + premise_batch) + attended_hypotheses8 = self.liner8(torch.max(attended_hypotheses4_1, attended_hypotheses4_2) + hypothesis_batch) + + premise_all = self.liner(torch.cat([attended_premises1, attended_premises2, attended_premises3, + attended_premises4, attended_premises5, attended_premises6, + attended_premises7, attended_premises8], dim=-1)) + hypotheses_all = self.liner(torch.cat([attended_hypotheses1, attended_hypotheses2, attended_hypotheses3, + attended_hypotheses4, attended_hypotheses5, attended_hypotheses6, + attended_hypotheses7, attended_hypotheses8], dim=-1)) + + enhanced_premises = torch.cat([premise_batch, attended_premises, self_premises, + premise_batch - attended_premises, premise_batch * attended_premises, + premise_all + ], + dim=-1) + enhanced_hypotheses = torch.cat([hypothesis_batch, attended_hypotheses, self_hypotheses, + hypothesis_batch - attended_hypotheses, hypothesis_batch * attended_hypotheses, + hypotheses_all + ], + dim=-1) + return enhanced_premises, enhanced_hypotheses \ No newline at end of file diff --git a/vaa/droped/model_new.py b/vaa/droped/model_new.py new file mode 100644 index 0000000..8f6a1f9 --- /dev/null +++ b/vaa/droped/model_new.py @@ -0,0 +1,146 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from vaa.utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(ESIM, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + + self.transformer_model = nn.Transformer(d_model=self.embedding_dim, nhead=4, + num_encoder_layers=3, num_decoder_layers=3) + + self._composition = nn.LSTM(self.embedding_dim, self.hidden_size, bidirectional=True, batch_first=True) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size*2, self.num_classes)) + + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + embedd=False, + premises_mask=None, + hypotheses_mask=None): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + if premises_mask is None: + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + if embedd: + embedded_premises = premises + embedded_hypotheses = hypotheses + else: + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + # encoded_premises = self._encoding(embedded_premises, premises_lengths) + # encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + v = self.transformer_model(embedded_premises.transpose(0, 1), embedded_hypotheses.transpose(0,1)).transpose(0,1) + _, (hn, cn) = self._composition(v) + hn = hn.transpose(0, 1).contiguous() + logits = self._classification(hn.view(hn.size()[0], -1)) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, hn, \ + (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. 
+ """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/droped/model_top.py b/vaa/droped/model_top.py new file mode 100644 index 0000000..36032f3 --- /dev/null +++ b/vaa/droped/model_top.py @@ -0,0 +1,174 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from vaa.utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + # self._word_embedding = nn.Embedding(self.vocab_size, + # self.embedding_dim, + # padding_idx=padding_idx, + # _weight=embeddings) + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + for p in self.parameters(): + p.requires_grad = False + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.hidden_size) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4+1)*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + else: + embedded_premises = premises + embedded_hypotheses = hypotheses + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + vulnerability = self.linear_vulnerability(vulnerability) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. 
+ """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/droped/model_transformer_new.py b/vaa/droped/model_transformer_new.py new file mode 100644 index 0000000..9bb008a --- /dev/null +++ b/vaa/droped/model_transformer_new.py @@ -0,0 +1,104 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from vaa.layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, LengthEncoder +from vaa.utils import replace_masked +from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer, LayerNorm +import math + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(ESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.transformer_model = nn.Transformer(d_model=self.embedding_dim, nhead=8, + num_encoder_layers=6, num_decoder_layers=6) + + # self._composition = nn.LSTM(self.embedding_dim, self.hidden_size, bidirectional=True, batch_first=True) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size*2, self.num_classes)) + + + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + v = self.transformer_model(premises.transpose(0, 1), hypotheses.transpose(0,1)).transpose(0,1)[:,0] + + logits = self._classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, v + diff --git a/vaa/droped/resnet.py b/vaa/droped/resnet.py new file mode 100644 index 0000000..219d8e8 --- /dev/null +++ b/vaa/droped/resnet.py @@ -0,0 +1,94 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class PreActBlock(nn.Module): + '''Pre-activation version of the BasicBlock.''' + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out += shortcut + return out + + +class PreActBottleneck(nn.Module): + '''Pre-activation version of the original Bottleneck module.''' + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = 
F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out = self.conv3(F.relu(self.bn3(out))) + out += shortcut + return out + + +class PreActResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(PreActResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.conv1(x) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + logits = out.view(out.size(0), -1) + out = self.linear(logits) + # out = F.log_softmax(out, dim=1) + return out, logits + + +def PreActResNet18(): + return PreActResNet(PreActBlock, [2,2,2,2]) + + diff --git a/vaa/droped/resnet_top.py b/vaa/droped/resnet_top.py new file mode 100644 index 0000000..77a5832 --- /dev/null +++ b/vaa/droped/resnet_top.py @@ -0,0 +1,110 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PreActBlock(nn.Module): + '''Pre-activation version of the BasicBlock.''' + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out += shortcut + return out + + +class PreActBottleneck(nn.Module): + '''Pre-activation version of the original Bottleneck module.''' + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out = self.conv3(F.relu(self.bn3(out))) + out += 
shortcut + return out + + +class PreActResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(PreActResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + + for p in self.parameters(): + p.requires_grad = False + + self.linear = nn.Sequential( + nn.Dropout(p=0.5), + nn.Linear(512*block.expansion*3, 16*block.expansion), + #nn.ReLU() + ) + + self.classification = nn.Linear((512+16)*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x, vulnerability): + out = self.conv1(x) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + logits = out.view(out.size(0), -1) + + vulnerability = self.linear(vulnerability) + logits = torch.cat([logits, vulnerability], dim=1) + + # out = self.linear(logits) + out = self.classification(logits) + # out = F.log_softmax(out, dim=1) + return out + + +def PreActResNet18Top(): + return PreActResNet(PreActBlock, [2,2,2,2]) + + diff --git a/vaa/layers.py b/vaa/layers.py new file mode 100644 index 0000000..7b3ea20 --- /dev/null +++ b/vaa/layers.py @@ -0,0 +1,420 @@ +""" +Definition of custom layers for the ESIM model. +""" +# Aurelien Coet, 2018. +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from .utils import sort_by_seq_lens, masked_softmax, weighted_sum, get_mask + +# Class widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/modules/input_variational_dropout.py +class RNNDropout(nn.Dropout): + """ + Dropout layer for the inputs of RNNs. + + Apply the same dropout mask to all the elements of the same sequence in + a batch of sequences of size (batch, sequences_length, embedding_dim). + """ + + def forward(self, sequences_batch): + """ + Apply dropout to the input batch of sequences. + + Args: + sequences_batch: A batch of sequences of vectors that will serve + as input to an RNN. + Tensor of size (batch, sequences_length, emebdding_dim). + + Returns: + A new tensor on which dropout has been applied. + """ + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], + sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, + inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch + + +# class TransformerEncoder(nn.Module): +# """ +# RNN taking variable length padded sequences of vectors as input and +# encoding them into padded sequences of vectors of the same length. +# +# This module is useful to handle batches of padded sequences of vectors +# that have different lengths and that need to be passed through a RNN. +# The sequences are sorted in descending order of their lengths, packed, +# passed through the RNN, and the resulting sequences are then padded and +# permuted back to the original order of the input sequences. 
+# """ +# +# def __init__(self, +# input_size, +# nhead=4, +# num_layers=1): +# +# super(TransformerEncoder, self).__init__() +# +# self.input_size = input_size +# self.nhead = nhead +# self.num_layers = num_layers +# self._encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer( +# self.input_size, nhead=nhead), num_layers=num_layers) +# +# def forward(self, sequences_batch, sequences_lengths): +# sequences_batch = sequences_batch.transpose(1, 0).contiguous() +# outputs = self._encoder(sequences_batch) +# outputs = outputs.transpose(1, 0).contiguous() +# +# sorted_batch, sorted_lengths, _, restoration_idx =\ +# sort_by_seq_lens(outputs, sequences_lengths) +# packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, sorted_lengths, batch_first=True) +# outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, batch_first=True) +# return outputs + +class LinerEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + input_size, + hidden_size, + dropout=0.0): + super(LinerEncoder, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self._encoder = nn.Linear(input_size, hidden_size*2) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + + outputs = self._encoder(outputs) + + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class LengthEncoder(nn.Module): + + def forward(self, sequences_batch, sequences_lengths): + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_batch, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoder(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. 
+ """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. + dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoder, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, _ = self._encoder(packed_batch, None) + + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, + batch_first=True) + reordered_outputs = outputs.index_select(0, restoration_idx) + + return reordered_outputs + +class Seq2SeqEncoderLast(nn.Module): + """ + RNN taking variable length padded sequences of vectors as input and + encoding them into padded sequences of vectors of the same length. + + This module is useful to handle batches of padded sequences of vectors + that have different lengths and that need to be passed through a RNN. + The sequences are sorted in descending order of their lengths, packed, + passed through the RNN, and the resulting sequences are then padded and + permuted back to the original order of the input sequences. + """ + + def __init__(self, + rnn_type, + input_size, + hidden_size, + num_layers=1, + bias=True, + dropout=0.0, + bidirectional=False): + """ + Args: + rnn_type: The type of RNN to use as encoder in the module. + Must be a class inheriting from torch.nn.RNNBase + (such as torch.nn.LSTM for example). + input_size: The number of expected features in the input of the + module. + hidden_size: The number of features in the hidden state of the RNN + used as encoder by the module. + num_layers: The number of recurrent layers in the encoder of the + module. Defaults to 1. + bias: If False, the encoder does not use bias weights b_ih and + b_hh. Defaults to True. 
+ dropout: If non-zero, introduces a dropout layer on the outputs + of each layer of the encoder except the last one, with dropout + probability equal to 'dropout'. Defaults to 0.0. + bidirectional: If True, the encoder of the module is bidirectional. + Defaults to False. + """ + assert issubclass(rnn_type, nn.RNNBase),\ + "rnn_type must be a class inheriting from torch.nn.RNNBase" + + super(Seq2SeqEncoderLast, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.dropout = dropout + self.bidirectional = bidirectional + + self._encoder = rnn_type(input_size, + hidden_size, + num_layers=num_layers, + bias=bias, + batch_first=True, + dropout=dropout, + bidirectional=bidirectional) + + def forward(self, sequences_batch, sequences_lengths): + """ + Args: + sequences_batch: A batch of variable length sequences of vectors. + The batch is assumed to be of size + (batch, sequence, vector_dim). + sequences_lengths: A 1D tensor containing the sizes of the + sequences in the input batch. + + Returns: + reordered_outputs: The outputs (hidden states) of the encoder for + the sequences in the input batch, in the same order. + """ + sorted_batch, sorted_lengths, _, restoration_idx =\ + sort_by_seq_lens(sequences_batch, sequences_lengths) + packed_batch = nn.utils.rnn.pack_padded_sequence(sorted_batch, + sorted_lengths, + batch_first=True) + + outputs, (hidden, cell) = self._encoder(packed_batch, None) + hidden = hidden.transpose(0, 1).contiguous() + hidden = hidden.view(hidden.size()[0], -1) + hidden = hidden.index_select(0, restoration_idx) + + return hidden + + +class SoftmaxAttention(nn.Module): + + def __init__(self, hidden_size, dropout=0.5): + super(SoftmaxAttention, self).__init__() + + self._projection = nn.Sequential(nn.Linear(4*2*hidden_size, hidden_size), nn.ReLU(), RNNDropout(p=dropout)) + + + """ + Attention layer taking premises and hypotheses encoded by an RNN as input + and computing the soft attention between their elements. + The dot product of the encoded vectors in the premises and hypotheses is + first computed. The softmax of the result is then used in a weighted sum + of the vectors of the premises for each element of the hypotheses, and + conversely for the elements of the premises. + """ + + def forward(self, + premise_batch, + premise_mask, + hypothesis_batch, + hypothesis_mask): + """ + Args: + premise_batch: A batch of sequences of vectors representing the + premises in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + premise_mask: A mask for the sequences in the premise batch, to + ignore padding data in the sequences during the computation of + the attention. + hypothesis_batch: A batch of sequences of vectors representing the + hypotheses in some NLI task. The batch is assumed to have the + size (batch, sequences, vector_dim). + hypothesis_mask: A mask for the sequences in the hypotheses batch, + to ignore padding data in the sequences during the computation + of the attention. + + Returns: + attended_premises: The sequences of attention vectors for the + premises in the input batch. + attended_hypotheses: The sequences of attention vectors for the + hypotheses in the input batch. 
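Unlike Seq2SeqEncoder, the Seq2SeqEncoderLast variant above discards the per-timestep outputs and keeps only the encoder's final hidden state, flattened across layers and directions (it also restores the original batch order after the sort-and-pack step, which is omitted below for brevity). A standalone sketch with a single-layer bidirectional LSTM:

import torch
import torch.nn as nn

batch_size, max_len, dim, hidden = 3, 6, 8, 16
x = torch.randn(batch_size, max_len, dim)
encoder = nn.LSTM(dim, hidden, batch_first=True, bidirectional=True)

_, (h_n, _) = encoder(x)                 # h_n: (num_layers * num_directions, batch, hidden)
h_n = h_n.transpose(0, 1).contiguous()   # (batch, num_layers * num_directions, hidden)
sentence_vec = h_n.view(batch_size, -1)  # (batch, num_layers * num_directions * hidden)
print(sentence_vec.shape)                # torch.Size([3, 32])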
+ """ + # dot attn + enhanced_premises0, enhanced_hypotheses0 = self.dot_attn(premise_batch, premise_mask, + hypothesis_batch, hypothesis_mask) + projected_premises = self._projection(enhanced_premises0) + projected_hypotheses = self._projection(enhanced_hypotheses0) + + return projected_premises, projected_hypotheses + + def dot_attn(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask): + sqrt_dim = np.sqrt(premise_batch.size()[2]) + # inter-attention Softmax attention weights. + similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1).contiguous()) / sqrt_dim + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + attended_premises = weighted_sum(hypothesis_batch, prem_hyp_attn, premise_mask) + attended_hypotheses = weighted_sum(premise_batch, hyp_prem_attn, hypothesis_mask) + + enhanced_premises, enhanced_hypotheses = self.multi_importance(premise_batch, hypothesis_batch, + attended_premises, attended_hypotheses) + + return enhanced_premises, enhanced_hypotheses + + + def multi_importance(self, premise_batch, hypothesis_batch,attended_premises, + attended_hypotheses): + enhanced_premises = torch.cat([premise_batch, attended_premises, + premise_batch * attended_premises, premise_batch - attended_premises + ], + dim=-1) + enhanced_hypotheses = torch.cat([hypothesis_batch, attended_hypotheses, + hypothesis_batch * attended_hypotheses, hypothesis_batch - attended_hypotheses + ], + dim=-1) + return enhanced_premises, enhanced_hypotheses + + +class WordEmbedding(nn.Module): + def __init__(self, + vocab_size, + embedding_dim, + embeddings=None, + padding_idx=0): + + super(WordEmbedding, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths): + + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + return (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) \ No newline at end of file diff --git a/vaa/model.py b/vaa/model.py new file mode 100644 index 0000000..7f336be --- /dev/null +++ b/vaa/model.py @@ -0,0 +1,190 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. 
+ padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(ESIM, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self._word_embedding = nn.Embedding(self.vocab_size, + self.embedding_dim, + padding_idx=padding_idx, + _weight=embeddings) + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + # self._encoding = LinerEncoder(self.embedding_dim, self.hidden_size) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + # self._projection = nn.Sequential(nn.Linear(7*2*self.hidden_size, + # self.hidden_size), + # nn.ReLU()) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + # self._classification = nn.Sequential(nn.Dropout(p=self.dropout), + # nn.Linear(2*4*self.hidden_size, + # self.hidden_size), + # nn.Tanh(), + # nn.Dropout(p=self.dropout), + # nn.Linear(self.hidden_size, + # self.num_classes)) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + embedd=False, + premises_mask=None, + hypotheses_mask=None): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + if premises_mask is None: + premises_mask = get_mask(premises, premises_lengths).to(self.device) + hypotheses_mask = get_mask(hypotheses, hypotheses_lengths).to(self.device) + + if embedd: + embedded_premises = premises + embedded_hypotheses = hypotheses + else: + embedded_premises = self._word_embedding(premises) + embedded_hypotheses = self._word_embedding(hypotheses) + + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits, \ + (embedded_premises, embedded_hypotheses, premises_mask, hypotheses_mask) + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_top.py b/vaa/model_top.py new file mode 100644 index 0000000..314f99b --- /dev/null +++ b/vaa/model_top.py @@ -0,0 +1,177 @@ +""" +vulnerability vector pad on the embedding left +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. 
+ embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + # for p in self.parameters(): + # p.requires_grad = False + + self.classification_v = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(((2*4)*self.hidden_size+self.embedding_dim), + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + device = premises.device + vulnerability = self.linear_vulnerability(vulnerability)#.unsqueeze(dim=1) + + # premises = torch.cat((vulnerability, premises), dim=1) + # hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + # premises_lengths = premises_lengths+1 + # hypotheses_lengths= hypotheses_lengths+1 + # premises_mask = torch.cat((torch.ones(premises_mask.size()[0],1).to(device), premises_mask), dim=1) + # hypotheses_mask = torch.cat((torch.ones(hypotheses_mask.size()[0],1).to(device), hypotheses_mask), dim=1) + + embedded_premises = premises + embedded_hypotheses = hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + # print(vulnerability.size(), v_a_avg.size()) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], dim=1) + + logits = self.classification_v(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_top_embed.py b/vaa/model_top_embed.py new file mode 100644 index 0000000..1700471 --- /dev/null +++ b/vaa/model_top_embed.py @@ -0,0 +1,173 @@ +""" +vulnerability vector pad on the embedding left +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoder, SoftmaxAttention, LinerEncoder +from .utils import get_mask, replace_masked +# from allennlp.modules.elmo import Elmo, batch_to_ids + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. 
+ """ + + def __init__(self, + vocab_size, + embedding_dim, + hidden_size, + embeddings=None, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.vocab_size = vocab_size + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3 * self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + num_layers=1, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4)*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + # Initialize all weights and biases in the model. + self.apply(_init_esim_weights) + + def forward(self, + premises, + premises_lengths, + hypotheses, + hypotheses_lengths, + vulnerability, + premises_mask, + hypotheses_mask): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + device = premises.device + vulnerability = self.linear_vulnerability(vulnerability).unsqueeze(dim=1) + premises = torch.cat((vulnerability, premises), dim=1) + hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + premises_lengths = premises_lengths+1 + hypotheses_lengths= hypotheses_lengths+1 + premises_mask = torch.cat((torch.ones(premises_mask.size()[0],1).to(device), premises_mask), dim=1) + hypotheses_mask = torch.cat((torch.ones(hypotheses_mask.size()[0],1).to(device), hypotheses_mask), dim=1) + + embedded_premises = premises + embedded_hypotheses = hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(premises) + embedded_hypotheses = self._rnn_dropout(hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + + +def _init_esim_weights(module): + """ + Initialise the weights of the ESIM model. + """ + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight.data) + nn.init.constant_(module.bias.data, 0.0) + + elif isinstance(module, nn.LSTM): + nn.init.xavier_uniform_(module.weight_ih_l0.data) + nn.init.orthogonal_(module.weight_hh_l0.data) + nn.init.constant_(module.bias_ih_l0.data, 0.0) + nn.init.constant_(module.bias_hh_l0.data, 0.0) + hidden_size = module.bias_hh_l0.data.shape[0] // 4 + module.bias_hh_l0.data[hidden_size:(2*hidden_size)] = 1.0 + + if (module.bidirectional): + nn.init.xavier_uniform_(module.weight_ih_l0_reverse.data) + nn.init.orthogonal_(module.weight_hh_l0_reverse.data) + nn.init.constant_(module.bias_ih_l0_reverse.data, 0.0) + nn.init.constant_(module.bias_hh_l0_reverse.data, 0.0) + module.bias_hh_l0_reverse.data[hidden_size:(2*hidden_size)] = 1.0 diff --git a/vaa/model_transformer.py b/vaa/model_transformer.py new file mode 100644 index 0000000..ad61b16 --- /dev/null +++ b/vaa/model_transformer.py @@ -0,0 +1,134 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked +from torch.nn.modules import TransformerEncoder, TransformerEncoderLayer +import math + + +class ESIM(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. 
+ hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(ESIM, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self._combine = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout)) + + self._classification = nn.Sequential(nn.Linear(self.hidden_size, self.num_classes)) + + def forward(self, premises, hypotheses): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + premises_mask = (torch.sum(premises, dim=-1) != 0).float() + hypotheses_mask = (torch.sum(hypotheses, dim=-1) != 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + adv_logits = self._combine(v) + logits = self._classification(adv_logits) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities, adv_logits + diff --git a/vaa/model_transformer_top.py b/vaa/model_transformer_top.py new file mode 100644 index 0000000..04ff365 --- /dev/null +++ b/vaa/model_transformer_top.py @@ -0,0 +1,143 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. 
+ """ + super(TOP, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3*self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + # for p in self.parameters(): + # p.requires_grad = False + + self.classification_v = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear((2*4*self.hidden_size+self.embedding_dim), + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + + + def forward(self, premises, hypotheses, vulnerability): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. + """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + vulnerability = self.linear_vulnerability(vulnerability)#.unsqueeze(dim=1) + # premises = torch.cat((vulnerability, premises), dim=1) + # hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + + # max min 不为0 + premises_mask = ((torch.max(premises, dim=-1)[0]-torch.min(premises, dim=-1)[0]) != 0).float() + hypotheses_mask = ((torch.max(hypotheses, dim=-1)[0]-torch.min(hypotheses, dim=-1)[0])!= 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + # print(premises_mask) + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max, vulnerability], 
dim=1) + + logits = self.classification_v(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + diff --git a/vaa/model_transformer_top_embed.py b/vaa/model_transformer_top_embed.py new file mode 100644 index 0000000..a6ec457 --- /dev/null +++ b/vaa/model_transformer_top_embed.py @@ -0,0 +1,140 @@ +""" +Definition of the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn +from .layers import RNNDropout, Seq2SeqEncoderLast, SoftmaxAttention, Seq2SeqEncoder +from .utils import replace_masked + +class TOP(nn.Module): + """ + Implementation of the ESIM model presented in the paper "Enhanced LSTM for + Natural Language Inference" by Chen et al. + """ + + def __init__(self, + embedding_dim, + hidden_size, + padding_idx=0, + dropout=0.5, + num_classes=3, + device="cpu"): + """ + Args: + vocab_size: The size of the vocabulary of embeddings in the model. + embedding_dim: The dimension of the word embeddings. + hidden_size: The size of all the hidden layers in the network. + embeddings: A tensor of size (vocab_size, embedding_dim) containing + pretrained word embeddings. If None, word embeddings are + initialised randomly. Defaults to None. + padding_idx: The index of the padding token in the premises and + hypotheses passed as input to the model. Defaults to 0. + dropout: The dropout rate to use between the layers of the network. + A dropout rate of 0 corresponds to using no dropout at all. + Defaults to 0.5. + num_classes: The number of classes in the output of the network. + Defaults to 3. + device: The name of the device on which the model is being + executed. Defaults to 'cpu'. + """ + super(TOP, self).__init__() + + self.embedding_dim = embedding_dim + self.hidden_size = hidden_size + self.num_classes = num_classes + self.dropout = dropout + self.device = device + + self.linear_vulnerability = nn.Linear(3*self.hidden_size, self.embedding_dim) + + if self.dropout: + self._rnn_dropout = RNNDropout(p=self.dropout) + + self._encoding = Seq2SeqEncoder(nn.LSTM, + self.embedding_dim, + self.hidden_size, + bidirectional=True) + + self._attention = SoftmaxAttention(self.hidden_size, dropout=self.dropout) + + self._composition = Seq2SeqEncoder(nn.LSTM, + self.hidden_size, + self.hidden_size, + bidirectional=True) + + self.classification = nn.Sequential(nn.Dropout(p=self.dropout), + nn.Linear(2*4*self.hidden_size, + self.hidden_size), + nn.Tanh(), + nn.Dropout(p=self.dropout), + nn.Linear(self.hidden_size, + self.num_classes)) + + + + def forward(self, premises, hypotheses, vulnerability): + """ + Args: + premises: A batch of varaible length sequences of word indices + representing premises. The batch is assumed to be of size + (batch, premises_length). + premises_lengths: A 1D tensor containing the lengths of the + premises in 'premises'. + hypothesis: A batch of varaible length sequences of word indices + representing hypotheses. The batch is assumed to be of size + (batch, hypotheses_length). + hypotheses_lengths: A 1D tensor containing the lengths of the + hypotheses in 'hypotheses'. + + Returns: + logits: A tensor of size (batch, num_classes) containing the + logits for each output class of the model. + probabilities: A tensor of size (batch, num_classes) containing + the probabilities of each output class in the model. 
+ """ + premises = premises[:, :min(128,premises.size()[1]), :] + hypotheses = hypotheses[:, :min(128, hypotheses.size()[1]), :] + + vulnerability = self.linear_vulnerability(vulnerability).unsqueeze(dim=1) + premises = torch.cat((vulnerability, premises), dim=1) + hypotheses = torch.cat((vulnerability, hypotheses), dim=1) + + # max min 不为0 + premises_mask = ((torch.max(premises, dim=-1)[0]-torch.min(premises, dim=-1)[0]) != 0).float() + hypotheses_mask = ((torch.max(hypotheses, dim=-1)[0]-torch.min(hypotheses, dim=-1)[0])!= 0).float() + premises_lengths = premises_mask.sum(dim=-1).long() + hypotheses_lengths = hypotheses_mask.sum(dim=-1).long() + # print(premises_mask) + + embedded_premises, embedded_hypotheses = premises, hypotheses + if self.dropout: + embedded_premises = self._rnn_dropout(embedded_premises) + embedded_hypotheses = self._rnn_dropout(embedded_hypotheses) + + encoded_premises = self._encoding(embedded_premises, premises_lengths) + encoded_hypotheses = self._encoding(embedded_hypotheses, hypotheses_lengths) + + projected_premises, projected_hypotheses = self._attention(encoded_premises, premises_mask, + encoded_hypotheses, hypotheses_mask) + v_ai = self._composition(projected_premises, premises_lengths) + v_bj = self._composition(projected_hypotheses, hypotheses_lengths) + + v_a_avg = torch.sum(v_ai * premises_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(premises_mask, dim=1, keepdim=True) + v_b_avg = torch.sum(v_bj * hypotheses_mask.unsqueeze(1) + .transpose(2, 1), dim=1)\ + / torch.sum(hypotheses_mask, dim=1, keepdim=True) + + v_a_max, _ = replace_masked(v_ai, premises_mask, -1e7).max(dim=1) + v_b_max, _ = replace_masked(v_bj, hypotheses_mask, -1e7).max(dim=1) + + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + logits = self.classification(v) + probabilities = nn.functional.softmax(logits, dim=-1) + + return logits, probabilities + diff --git a/vaa/utils.py b/vaa/utils.py new file mode 100644 index 0000000..37c9054 --- /dev/null +++ b/vaa/utils.py @@ -0,0 +1,179 @@ +""" +Utility functions for the ESIM model. +""" +# Aurelien Coet, 2018. + +import torch +import torch.nn as nn + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def sort_by_seq_lens(batch, sequences_lengths, descending=True): + """ + Sort a batch of padded variable length sequences by their length. + + Args: + batch: A batch of padded variable length sequences. The batch should + have the dimensions (batch_size x max_sequence_length x *). + sequences_lengths: A tensor containing the lengths of the sequences in the + input batch. The tensor should be of size (batch_size). + descending: A boolean value indicating whether to sort the sequences + by their lengths in descending order. Defaults to True. + + Returns: + sorted_batch: A tensor containing the input batch reordered by + sequences lengths. + sorted_seq_lens: A tensor containing the sorted lengths of the + sequences in the input batch. + sorting_idx: A tensor containing the indices used to permute the input + batch in order to get 'sorted_batch'. + restoration_idx: A tensor containing the indices that can be used to + restore the order of the sequences in 'sorted_batch' so that it + matches the input batch. 
+ """ + sorted_seq_lens, sorting_index =\ + sequences_lengths.sort(0, descending=descending) + + sorted_batch = batch.index_select(0, sorting_index) + + # idx_range = sequences_lengths.new_tensor(torch.arange(0, len(sequences_lengths))) + idx_range = torch.arange(0, len(sequences_lengths)).to(sequences_lengths.device) + _, reverse_mapping = sorting_index.sort(0, descending=False) + restoration_index = idx_range.index_select(0, reverse_mapping) + + return sorted_batch, sorted_seq_lens, sorting_index, restoration_index + + +def get_mask(sequences_batch, sequences_lengths): + """ + Get the mask for a batch of padded variable length sequences. + + Args: + sequences_batch: A batch of padded variable length sequences + containing word indices. Must be a 2-dimensional tensor of size + (batch, sequence). + sequences_lengths: A tensor containing the lengths of the sequences in + 'sequences_batch'. Must be of size (batch). + + Returns: + A mask of size (batch, max_sequence_length), where max_sequence_length + is the length of the longest sequence in the batch. + """ + batch_size = sequences_batch.size()[0] + max_length = torch.max(sequences_lengths) + mask = torch.ones(batch_size, max_length, dtype=torch.float) + mask[sequences_batch[:, :max_length] == 0] = 0.0 + return mask + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def masked_softmax(tensor, mask): + """ + Apply a masked softmax on the last dimension of a tensor. + The input tensor and mask should be of size (batch, *, sequence_length). + + Args: + tensor: The tensor on which the softmax function must be applied along + the last dimension. + mask: A mask of the same size as the tensor with 0s in the positions of + the values that must be masked and 1s everywhere else. + + Returns: + A tensor of the same size as the inputs containing the result of the + softmax. + """ + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + + # Reshape the mask so it matches the size of the input tensor. + while mask.dim() < tensor.dim(): + mask = mask.unsqueeze(1) + # print(mask.size(), tensor.size()) + mask = mask.expand_as(tensor).contiguous().float() + reshaped_mask = mask.view(-1, mask.size()[-1]) + + result = nn.functional.softmax(reshaped_tensor * reshaped_mask, dim=-1) + result = result * reshaped_mask + # 1e-13 is added to avoid divisions by zero. + result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + + return result.view(*tensor_shape) + + +def normal_softmax(tensor): + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + result = nn.functional.softmax(reshaped_tensor, dim=-1) + # 1e-13 is added to avoid divisions by zero. + result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + return result.view(*tensor_shape) + + +# Code widely inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def weighted_sum(tensor, weights, mask): + """ + Apply a weighted sum on the vectors along the last dimension of 'tensor', + and mask the vectors in the result with 'mask'. + + Args: + tensor: A tensor of vectors on which a weighted sum must be applied. + weights: The weights to use in the weighted sum. + mask: A mask to apply on the result of the weighted sum. + + Returns: + A new tensor containing the result of the weighted sum after the mask + has been applied on it. 
+ """ + weighted_sum = weights.bmm(tensor) + + while mask.dim() < weighted_sum.dim(): + mask = mask.unsqueeze(1) + mask = mask.transpose(-1, -2) + mask = mask.expand_as(weighted_sum).contiguous().float() + + return weighted_sum * mask + + +# Code inspired from: +# https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py. +def replace_masked(tensor, mask, value): + """ + Replace the all the values of vectors in 'tensor' that are masked in + 'masked' by 'value'. + + Args: + tensor: The tensor in which the masked vectors must have their values + replaced. + mask: A mask indicating the vectors which must have their values + replaced. + value: The value to place in the masked vectors of 'tensor'. + + Returns: + A new tensor of the same size as 'tensor' where the values of the + vectors masked in 'mask' were replaced by 'value'. + """ + mask = mask.unsqueeze(1).transpose(2, 1) + reverse_mask = 1.0 - mask + values_to_add = value * reverse_mask + return tensor * mask + values_to_add + + +def correct_predictions(output_probabilities, targets): + """ + Compute the number of predictions that match some target classes in the + output of a model. + + Args: + output_probabilities: A tensor of probabilities for different output + classes. + targets: The indices of the actual target classes. + + Returns: + The number of correct predictions in 'output_probabilities'. + """ + _, out_classes = output_probabilities.max(dim=1) + correct = (out_classes == targets).sum() + return correct.item()