diff --git a/AlignReq.h b/AlignReq.h
new file mode 100644
index 0000000..ea50686
--- /dev/null
+++ b/AlignReq.h
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _ALIGNREQ_H_
+#define _ALIGNREQ_H_
+
+using namespace std;
+
+#include <vector>
+#include "Hypo.h"
+
+// One request to score a target phrase for a given source alignment of a hypothesis.
+struct AlignReq {
+  int sb, se;		// requested alignment, we can use the word indices only since the source is constant for all hyps
+			// (presumably first and last source word index -- TODO confirm against the callers)
+  vector<string> tgph;	// target phrase
+  WordID tgwid[16];	// mapped target wordIDs (room for 16 entries only); TODO: this is a hack, we map the same target phrase many times
+  Hypo *hyp;		// corresponding hypothesis
+  int bs;		// index into bunch that will be processed by NN
+  float *logP;	 	// log proba (may be several scores)
+};
+
+#endif
diff --git a/Gpu.cu b/Gpu.cu
new file mode 100644
index 0000000..673197b
--- /dev/null
+++ b/Gpu.cu
@@ -0,0 +1,1799 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+using namespace std;
+#include <algorithm>
+#include <map>
+#include <sstream>
+#include <signal.h>
+#define RAISE raise(SIGINT);
+
+typedef float REAL;
+#define NULL_WORD (-1)		// from WordList.h
+#define LOG_PROBA_NONE 999	// from ErrFact.h
+#define LOCK_FNAME "/tmp/gpu_lock.pid%d.gpu%d"
+#define LOCK_FNAME_LEN 256	// Hack ;-)
+
+#include <npps.h>
+#include <cublas.h>
+#include <cuda_runtime_api.h>
+#include <nppcore.h>
+#include "nvml.h"
+#include "Gpu.cuh"
+#include "Tools.h" //For Error()
+
+
+// global variables
+curandGenerator_t cuda_gen;	// random generator, created in Gpu::Init()
+string cuda_user_list;	// user specified list of GPUs
+			// (":n:m..." names the devices explicitly, anything else is read by Gpu::Init() as a number of devices)
+static REAL *gpu_result;  // one REAL on the GPU, allocated on first use by the error functions below
+#define GPU_BUF_DIM 65536
+static REAL *gpu_buf;  // internal buffer on the GPU, allocated in Gpu::Init()
+
+// definitions of the static members of class Gpu
+size_t Gpu::curDevIndex  = (size_t)-1;   ///< current device index
+size_t Gpu::curConfIndex = (size_t)-1;   ///< current configuration index
+cudaStream_t Gpu::curStream = NULL;      ///< current stream
+bool Gpu::useConcurrentStreams = false;  ///< status of concurrent streams
+#ifdef GPU_CUBLAS_V2
+cublasHandle_t Gpu::curCbHandle = NULL;  ///< current Cublas handle
+#endif
+cudaDeviceProp* Gpu::curDevProps = NULL; ///< device properties
+vector<Gpu::Device> Gpu::vDevices; ///< vector of Gpu devices to be used
+vector<Gpu::Config> Gpu::vConfigs; ///< vector of Gpu configurations
+
+/**
+ * signal handler installed by Gpu::Init(): removes the lock-files and terminates
+ * @param s number of the received signal (not used)
+ */
+void HandlerSigTERM(int s)
+{
+  (void) s;	// all handled signals are treated the same way
+  fputs("Catched signal: removing lock-files\n", stdout);
+  Gpu::Unlock();
+  exit(1);
+}
+
+/**
+ * initializes Cuda and creates lock files
+ *
+ * Does nothing when a configuration already exists. Otherwise the devices are
+ * listed and selected according to the global cuda_user_list:
+ *  - a list starting with ':' (for instance ":0:2") names the devices explicitly
+ *  - anything else is parsed as the number of devices to select automatically,
+ *    the least utilized ones first (utilization is queried through NVML)
+ * When no device gets selected, device 0 is used.
+ * Then Cublas, the random generator and the internal buffer are set up, one
+ * lock-file per selected device is written and the signal handlers that remove
+ * these files are installed.
+ *
+ * @note selects first device and stream
+ * @returns configuration index 0
+ */
+size_t Gpu::Init()
+{
+  const size_t stId = 0;
+  if (!Gpu::vConfigs.empty())
+    return stId;	// already initialized
+
+  Gpu::vConfigs.resize(1);
+
+  cout << "Initializing Nvidia GPU card" << endl;
+  int dev_max = 0;
+  cudaGetDeviceCount(&dev_max);
+  // test for an empty list first so that the string is never indexed past its end
+  bool bSelAuto = (cuda_user_list.empty() || (':' != cuda_user_list[0]));
+  Gpu::Device dev;
+  if (0 < dev_max) {
+    if (1 == dev_max)
+      cout << " - found 1 card:" << endl;
+    else
+      cout << " - found " << dev_max << " cards:" << endl;
+    if (bSelAuto)
+      nvmlInit();
+    nvmlDevice_t nd;
+    nvmlUtilization_t nu;
+    multimap<uint,Gpu::Device> mSelDev;	// devices ordered by increasing utilization
+    for (dev.number = 0 ; dev.number < dev_max ; dev.number++) {
+      cudaGetDeviceProperties(&dev.props, dev.number);
+      int nb_cores_per_multiprocessor = -1;	// stays -1 for compute capabilities not listed below
+      if(dev.props.major == 1 && (dev.props.minor == 0||dev.props.minor == 1||dev.props.minor == 2||dev.props.minor == 3))
+          nb_cores_per_multiprocessor = 8;
+      else if(dev.props.major == 2 && dev.props.minor == 0)
+          nb_cores_per_multiprocessor = 32;
+      else if(dev.props.major == 2 && dev.props.minor == 1)
+          nb_cores_per_multiprocessor = 48;
+      else if(dev.props.major == 3 && (dev.props.minor == 0||dev.props.minor == 5))
+          nb_cores_per_multiprocessor = 192;
+
+      printf("    %d: %s with %d CPUs x %d threads running at %4.2f Ghz, %d MBytes of memory, use -arch=sm_%d%d",
+        dev.number, dev.props.name, dev.props.multiProcessorCount, nb_cores_per_multiprocessor,
+        dev.props.clockRate/1000000.0, (int) (dev.props.totalGlobalMem/1024/1024),
+        dev.props.major, dev.props.minor);
+      if (bSelAuto) {
+        // A device whose utilization can't be queried gets the largest possible key,
+        // i.e. it is selected last (nu must not be read when the NVML calls failed).
+        uint utilization = ~0u;
+        if (   (nvmlDeviceGetHandleByIndex(dev.number, &nd) == NVML_SUCCESS)
+            && (nvmlDeviceGetUtilizationRates( nd    , &nu) == NVML_SUCCESS) ) {
+          utilization = nu.gpu;
+          printf(", utilization %u%%", nu.gpu);
+        }
+        mSelDev.insert(make_pair(utilization, dev));
+      }
+      printf("\n");
+    }
+    if (bSelAuto) { // select devices automatically
+      nvmlShutdown();
+      // iMaxDev is bounded by dev_max which is the number of entries in mSelDev
+      int iMaxDev = std::min(std::max(atoi(cuda_user_list.c_str()), 0), dev_max);
+      for (multimap<uint,Gpu::Device>::const_iterator mmci = mSelDev.begin() ; 0 < iMaxDev-- ; mmci++)
+        Gpu::vDevices.push_back(mmci->second);
+    }
+  }
+
+  if (!bSelAuto) { // read devices specified by user
+    char c;
+    istringstream iss;
+    iss.str(cuda_user_list);
+    while (iss.good()) {
+      if (!(iss >> c >> dev.number))
+        break;	// malformed entry: reported below, don't register a bogus device
+      Gpu::vDevices.push_back(dev);
+      cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number);
+    }
+    if (iss.fail())
+      ErrorN("format error in the selection of CUDA devices \"%s\"", cuda_user_list.c_str() + 1);
+  }
+  size_t dev_sel = Gpu::vDevices.size();
+  switch (dev_sel) {
+    case 0: printf(" - no GPU device selected\n");
+            dev.number = 0;
+            Gpu::vDevices.push_back(dev);
+            dev_sel = 1;
+            cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number);
+            // no break: continue with the single device that was just added
+    case 1: printf(" - using device %d\n", Gpu::vDevices[0].number);
+            cudaSetDevice(Gpu::vDevices[0].number);
+            break;
+    default:
+      if (dev_sel > (size_t)dev_max) {
+        printf(" - requested more GPU devices than available, using %d first ones\n", dev_max);
+        dev_sel = dev_max;
+        Gpu::vDevices.resize(dev_sel);
+      }
+      printf(" - using %zu devices in parallel:", dev_sel);
+      for (size_t d = 0 ; d < dev_sel ; d++) {
+        int n = Gpu::vDevices[d].number;
+        printf(" %d", n);
+        if ((n < 0) || (n >= dev_max))
+          Error("illegal device identifier");
+      }
+      printf("\n");
+      cudaSetDevice(Gpu::vDevices[0].number);
+  }
+
+  // initialize cublas and random generator
+  cublasInit();
+  Gpu::CheckError("initialization of card\n");
+  curandCreateGenerator(&cuda_gen, CURAND_RNG_PSEUDO_DEFAULT);
+  // curandSetPseudoRandomGeneratorSeed(cuda_gen, CUDA_SEED);
+  Gpu::CheckError("initialization of random generator\n");
+
+  // allocate buffers
+  // NOTE(review): Gpu::Alloc() takes a number of elements, so this looks like it reserves
+  // sizeof(REAL) times more than GPU_BUF_DIM values -- left unchanged, confirm before shrinking
+  gpu_buf = Gpu::Alloc(GPU_BUF_DIM*sizeof(REAL),"internal buffer on GPU");
+
+  // locking devices
+  ofstream lfs;
+  char lfname[LOCK_FNAME_LEN] = LOCK_FNAME;
+  for (size_t d = 0 ; d < dev_sel ; d++) {
+    snprintf(lfname, sizeof(lfname), LOCK_FNAME, getpid(), Gpu::vDevices[d].number);
+    lfs.open(lfname,ios::out);
+    CHECK_FILE(lfs, lfname);
+    lfs << "Runing job " << getpid() << " on GPU " << Gpu::vDevices[d].number << endl;
+    lfs.close();
+  }
+
+  // catch signals to clean up lock-files
+  signal(SIGINT , HandlerSigTERM);
+  signal(SIGHUP , HandlerSigTERM);
+  signal(SIGFPE , HandlerSigTERM);
+  signal(SIGSEGV, HandlerSigTERM);
+  signal(SIGTERM, HandlerSigTERM);
+
+  // create default configuration
+  Gpu::Config& newConfig = Gpu::vConfigs.back();
+  Gpu::curDevIndex = newConfig.devId = 0;
+  Gpu::curConfIndex = stId;
+  newConfig.stream = NULL;
+#ifdef GPU_CUBLAS_V2
+  cublasCreate(&newConfig.cbHandle);
+  Gpu::curCbHandle = newConfig.cbHandle;
+#endif
+  Gpu::curDevProps = &Gpu::vDevices[0].props;
+  return stId;
+}
+
+/**
+ * removes lock-files and deletes all configurations
+ * @note resets all the current device, stream and configuration settings
+ */
+void Gpu::Unlock()
+{
+  // delete the lock-file of each device
+  Gpu::curDevIndex = (size_t)-1;
+  char lockName[LOCK_FNAME_LEN] = LOCK_FNAME;
+  for (size_t d = 0 ; d < Gpu::vDevices.size() ; d++) {
+    sprintf(lockName, LOCK_FNAME, getpid(), Gpu::vDevices[d].number);
+    if (0 != unlink(lockName))
+      cerr << " - ERROR: removing lock file " << lockName << endl;
+  }
+
+  // forget the current settings
+  Gpu::curConfIndex = (size_t)-1;
+  Gpu::curStream = NULL;
+  Gpu::useConcurrentStreams = false;
+#ifdef GPU_CUBLAS_V2
+  Gpu::curCbHandle = NULL;
+#endif
+  Gpu::curDevProps = NULL;
+  Gpu::vDevices.clear();
+
+  // release the stream (and the Cublas handle) of each configuration
+  for (size_t c = 0 ; c < Gpu::vConfigs.size() ; c++) {
+    Gpu::Config& cfg = Gpu::vConfigs[c];
+    if (cfg.stream != NULL)
+      cudaStreamDestroy(cfg.stream);
+#ifdef GPU_CUBLAS_V2
+    if (cfg.cbHandle != NULL)
+      cublasDestroy(cfg.cbHandle);
+#endif
+  }
+  Gpu::vConfigs.clear();
+}
+
+
+/**
+ * creates a new Gpu stream on next device
+ * @note selects the next device and the new stream;
+ *       the very first call is handled by Gpu::Init()
+ * @returns new configuration index
+ */
+size_t Gpu::NewConfig()
+{
+  const size_t newId = Gpu::vConfigs.size();
+  if (0 == newId)
+    return Gpu::Init();
+
+  // switch to concurrent streams once the devices get shared by the configurations
+  if (Gpu::vDevices.size() <= (0.8 * (newId + 1)))
+    Gpu::useConcurrentStreams = true;
+
+  Gpu::vConfigs.resize(newId + 1);
+  Gpu::Config& added = Gpu::vConfigs.back();
+  added.devId = ((Gpu::curDevIndex + 1) % Gpu::vDevices.size());
+  added.stream = NULL;
+#ifdef GPU_CUBLAS_V2
+  added.cbHandle = NULL;
+#endif
+  Gpu::ChangeConfig(newId);
+  return newId;
+}
+
+/**
+ * changes current configuration
+ * @param stCfg index of configuration to use (not checked against the number of configurations)
+ * @note with GPU_CUBLAS_V2, the Cublas handle and, when concurrent streams are
+ *       in use, the stream of the configuration are created on first use
+ */
+void Gpu::ChangeConfig(size_t stCfg)
+{
+  Gpu::curConfIndex = stCfg;
+  Gpu::Config& cfg = Gpu::vConfigs[stCfg];
+
+  // switch device only if the configuration lives on another one
+  const bool otherDevice = (Gpu::curDevIndex != cfg.devId);
+  if (otherDevice) {
+    Gpu::curDevIndex = cfg.devId;
+    Gpu::Device& device = Gpu::vDevices[Gpu::curDevIndex];
+    cudaSetDevice(device.number);
+    Gpu::curDevProps = &device.props;
+  }
+#ifdef GPU_CUBLAS_V2
+  if (cfg.cbHandle == NULL)
+    cublasCreate(&cfg.cbHandle);
+
+  const bool needStream = (Gpu::useConcurrentStreams && (cfg.stream == NULL));
+  if (needStream) {
+    cudaStreamSynchronize(NULL);
+    cudaStreamCreate(&cfg.stream);
+    cublasSetStream(cfg.cbHandle, cfg.stream);
+  }
+
+  if (cfg.stream != Gpu::curStream) {
+    Gpu::curStream = cfg.stream;
+    nppSetStream(Gpu::curStream);
+  }
+  Gpu::curCbHandle = cfg.cbHandle;
+  debug4("Gpu::ChangeConfig cfg=%zu dev=%d str=%x cbh=%x\n", Gpu::curConfIndex, Gpu::vDevices[Gpu::curDevIndex].number, Gpu::curStream, Gpu::curCbHandle);
+#endif
+}
+
+/**
+ * sets current device with default stream
+ * @param stDevId device index (taken modulo the number of devices)
+ * @note no configuration is current afterwards
+ */
+void Gpu::SetDevice(size_t stDevId)
+{
+  Gpu::curConfIndex = (size_t)-1;
+  if (stDevId != Gpu::curDevIndex) {
+    const size_t wrapped = stDevId % Gpu::vDevices.size();
+    Gpu::curDevIndex = wrapped;
+    Gpu::Device& device = Gpu::vDevices[wrapped];
+    cudaSetDevice(device.number);
+    Gpu::curDevProps = &device.props;
+  }
+#ifdef GPU_CUBLAS_V2
+  // go back to the default stream
+  if (Gpu::curStream != NULL) {
+    Gpu::curStream = NULL;
+    nppSetStream(Gpu::curStream);
+  }
+  Gpu::curCbHandle = NULL;
+#endif
+}
+
+/**
+ * allocates memory on Gpu and checks error
+ * @param dim number of elements to allocate (each of CUDA_SIZE bytes)
+ * @param msg message to print in case of error
+ * @returns pointer to the allocated memory, NULL if dim is not positive
+ */
+REAL* Gpu::Alloc(int dim, const char* msg) {
+  if (dim <= 0)
+    return NULL;
+
+  // prepare the error message beforehand; snprintf() since the length of msg is not known
+  char err_msg[1024];
+  snprintf(err_msg, sizeof(err_msg), "CUDA: can't allocate memory (%dMB) for %s",
+           (int)((size_t)dim * sizeof(REAL) / 1024 / 1024), msg);
+
+  void* gpu_mem = NULL;	// stays NULL if the allocation doesn't set it
+  cublasAlloc(dim, CUDA_SIZE, &gpu_mem);
+#ifdef DEBUG
+  int dev = -1;
+  cudaGetDevice(&dev);
+  debug3("allocated %ld at %p on device %d\n",  dim * CUDA_SIZE, gpu_mem, dev);
+#endif
+  Gpu::CheckError(err_msg);
+  if (NULL == gpu_mem)
+    Error(err_msg);
+  return (CUDA*)gpu_mem;
+}
+
+/**
+ * checks whether the last Cuda call failed and reports the error
+ * @param msg message to print in case of error
+ */
+void Gpu::CheckError(const char* msg) {
+  const cudaError_t status = cudaGetLastError();
+  if (cudaSuccess == status)
+    return;
+  ErrorN("CUDA: ERROR %d in %s: %s\n", cublasGetError(), msg, cudaGetErrorString(status));
+}
+
+
+// Corresponds to 2.0*numeric_limits<float>::min()
+__device__ REAL GPU_LOG_LOWER_BOUND = 2.35099e-38;
+
+// log() whose argument is clipped from below at GPU_LOG_LOWER_BOUND
+__device__ REAL gpu_safelog(REAL x)
+{
+  if (x < GPU_LOG_LOWER_BOUND)
+    return log(GPU_LOG_LOWER_BOUND);
+  return log(x);
+}
+
+
+//-----------------------------------------------
+// forward pass for MachTab
+//-----------------------------------------------
+
+// Table look-up: copies for each example of the bunch the row of gpu_t selected by
+// the word index in gpu_data_in (stored as REAL) to gpu_data_out; the output of a
+// NULL_WORD is set to zero.
+// Blocks loop over the examples, threads over the odim values of one row.
+__global__
+void KernelMachTabForw(const int bsize, const int odim, REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_data_out)
+{
+  for (int ex = blockIdx.x ; ex < bsize ; ex += gridDim.x) {
+    const int word = (int) gpu_data_in[ex];
+    REAL *dst = gpu_data_out + ex*odim;
+    if (word == NULL_WORD) {
+      for (int k = threadIdx.x ; k < odim ; k += blockDim.x)
+        dst[k] = 0.0;
+    }
+    else {
+      const REAL *src = gpu_t + word*odim;
+      for (int k = threadIdx.x ; k < odim ; k += blockDim.x)
+        dst[k] = src[k];
+    }
+  }
+}
+
+// Launches KernelMachTabForw on the current stream with (at most) one block per
+// example and one thread per output value, both limited by the device properties.
+void Gpu::MachTabForw(const int bsize, const int odim,
+                      REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_data_out)
+{
+  const cudaDeviceProp& props = *Gpu::curDevProps;
+  const int threads = std::min(props.maxThreadsDim[0], odim);
+  const int blocks = std::min(props.maxGridSize[0], bsize);
+  KernelMachTabForw<<<blocks, threads, 0, Gpu::curStream>>>(bsize, odim, gpu_data_in, gpu_t, gpu_data_out);
+}
+
+
+//-----------------------------------------------
+// backward pass for MachTab
+//-----------------------------------------------
+
+// Table update: adds lrate * gradient of each example to the row of gpu_t selected
+// by its word index in gpu_data_in; examples with a NULL_WORD are skipped.
+// Blocks loop over the examples, threads over the odim values of one row.
+__global__
+void KernelMachTabBackw(const REAL lrate, const int bsize, const int odim,
+                        REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_grad_out)
+{
+  for (int ex = blockIdx.x; ex < bsize; ex += gridDim.x) {
+    const int word = (int) gpu_data_in[ex];
+    if (word == NULL_WORD)
+      continue;
+    REAL *row = gpu_t + word*odim;
+    const REAL *grad = gpu_grad_out + ex*odim;
+    // atomicAdd instead of += since several examples may select the same row
+    for (int k = threadIdx.x; k < odim; k += blockDim.x)
+      atomicAdd(row + k, lrate * grad[k]);
+  }
+}
+
+// Launches KernelMachTabBackw on the current stream with (at most) one block per
+// example and one thread per gradient value, both limited by the device properties.
+void Gpu::MachTabBackw(const REAL lrate, const int bsize, const int odim,
+                       REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_grad_out)
+{
+  const cudaDeviceProp& props = *Gpu::curDevProps;
+  const int threads = std::min(props.maxThreadsDim[0], odim);
+  const int blocks = std::min(props.maxGridSize[0], bsize);
+  KernelMachTabBackw<<<blocks, threads, 0, Gpu::curStream>>>(lrate, bsize, odim, gpu_data_in, gpu_t, gpu_grad_out);
+}
+
+
+//-----------------------------------------------
+// Softmax normalization
+//-----------------------------------------------
+
+// Row-wise softmax: sm[r][i] = exp(x[r][i]) / sum_j exp(x[r][j]) for the M rows of N values.
+// Element (r,i) of x is x[r*sx0 + i*sx1], element (r,i) of sm is sm[r*sm_s0 + i*sm_s1].
+// Each block loops over rows, the threads of a block share the values of one row.
+// buf is dynamic shared memory indexed by threadIdx.x, i.e. it needs blockDim.x REALs.
+// The maximum is not subtracted before exp() is applied, see KernelSoftmaxStable for that.
+__global__ void KernelSoftmax(int M, int N,
+			      const REAL * x, const int sx0, const int sx1,
+ 			      REAL * sm, const int sm_s0, const int sm_s1)
+{
+  extern __shared__ REAL buf[];
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    // each thread sums the exponentials of its part of the row
+    REAL sum = 0;
+#pragma unroll 16
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sum += exp(x[blockIDX * sx0 + i * sx1]);
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // This reduction trashes buf[1..warpsize], leaving the reduction result in buf[0].
+    if (threadIdx.x < warpSize){
+      // first fold all the partial sums onto the first warpSize entries
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize){
+                buf[threadIdx.x] += buf[i];
+      }
+      // NOTE(review): the steps below are not separated by any synchronization and the
+      // offsets 16..1 are hard-coded, this presumably relies on a warp of 32 threads
+      // running in lock-step -- confirm for the targeted architectures.
+      // NOTE(review): the guards compare with N and not with blockDim.x; this looks
+      // equivalent only as long as the kernel is launched with min(N, 512) threads.
+      if (threadIdx.x < 16){
+                //reduce so that threadIdx.x 0 has the sum of everything
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+    // normalize; each thread handles the same elements as in the first loop
+#pragma unroll 16
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sm[blockIDX * sm_s0 + i * sm_s1] = exp(x[blockIDX * sx0 + i * sx1]) / row_sum;
+    }
+    // buf is overwritten when the next row is processed
+    __syncthreads();
+  }
+}
+// Applies the softmax in place to each of the bsize rows of odim values in
+// gpu_data_out by launching KernelSoftmax on the current stream.
+void Gpu::MachSoftmaxForw(const int bsize, const int odim, REAL *gpu_data_out)
+{
+  if (0) {
+    // Disabled reference implementation: it is known to work correctly in all cases,
+    // but it is slower.
+    nppsExp_32f_I(gpu_data_out, bsize*odim);
+
+    REAL *row = gpu_data_out;
+    for (int b = 0; b < bsize; b++, row += odim) {
+      // exp(x) is always positive -> we can use the sum_i (ABS(x_i))
+      REAL sum = Gpu::CublasSasum(odim, row, 1);
+      nppsMulC_32f_I(1.0/sum, row, odim);
+    }
+    return;
+  }
+
+  // The reduction in KernelSoftmax supposes a warp size of 32. Checking this here
+  // would need an access to the GPU properties at each call, the check is therefore
+  // done in MachSoftmax.cpp:
+  //  if(warpSize != 32){
+  //    Error("Gpu::MachSoftmaxForw suppose the warpSize is 32. If run with a GPU with other warpSize"
+  //	  " like the current GPU, it will return wrong Results. You must update the reduction in KernelSoftmax");
+  //  }
+  if (bsize <= 0)
+    return;
+
+  const int n_blocks = std::min(bsize, 32 * 1024);
+  const int n_threads = std::min(odim, 512);
+  const int n_shared_bytes = n_threads * sizeof(REAL);	// one REAL per thread
+  // input and output are the same matrix: row stride odim, element stride 1
+  KernelSoftmax<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(
+      bsize, odim,
+      gpu_data_out, odim, 1,
+      gpu_data_out, odim, 1);
+
+  const cudaError_t err = cudaGetLastError();
+  if (cudaSuccess == err)
+    return;
+  printf("KernelSoftmax: n_blockn=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+         n_blocks, n_threads, n_shared_bytes, odim);
+  Error(cudaGetErrorString(err));
+}
+
+//-----------------------------------------------
+// Softmax stable normalization
+//-----------------------------------------------
+
+// Row-wise softmax with subtraction of the row maximum before exp() is applied:
+// sm[r][i] = exp(x[r][i] - max_r) / sum_j exp(x[r][j] - max_r) for the M rows of N values.
+// Element (r,i) of x is x[r*sx0 + i*sx1], element (r,i) of sm is sm[r*sm_s0 + i*sm_s1].
+// Each block loops over rows, the threads of a block share the values of one row.
+// buf is dynamic shared memory indexed by threadIdx.x, i.e. it needs blockDim.x REALs;
+// it is used first for the maximum and then for the sum.
+__global__ void KernelSoftmaxStable(int M, int N,
+                                     const REAL * x, const int sx0, const int sx1,
+                                     REAL * sm, const int sm_s0, const int sm_s1)
+{
+  extern __shared__ REAL buf[];
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    // each thread takes the maximum over its part of the row
+    // NOTE(review): the first read is not guarded by threadIdx.x < N, this presumably
+    // relies on the kernel never being launched with more than N threads -- confirm.
+    REAL max_ = x[blockIDX * sx0 + threadIdx.x * sx1];
+    for (int i = threadIdx.x + blockDim.x; i< N; i += blockDim.x) {
+      max_ = max(max_, x[blockIDX * sx0 + i * sx1]);
+    };
+    buf[threadIdx.x] = max_;
+    __syncthreads();
+
+    // This reduction trashes buf[1..n_threads], leaving the reduction result in buf[0].
+    // Find the max to stabilize the softmax
+    // NOTE(review): as for the sum below, the steps with offsets 16..1 are not separated
+    // by any synchronization (presumably relies on a warp of 32 threads running in
+    // lock-step) and their guards compare with N and not with blockDim.x.
+    if (threadIdx.x < warpSize)
+    {
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize) {
+                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
+      }
+      if (threadIdx.x < 16) {
+                //reduce so that threadIdx.x 0 has the max of everything
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+16]);
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+8]);
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+4]);
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+2]);
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+1]);
+            }
+    }
+
+    __syncthreads();
+    REAL row_max = buf[0];
+    // all threads must have read the maximum before buf is reused for the sum
+    __syncthreads();
+    REAL sum = 0;
+    for(int i=threadIdx.x; i<N; i+=blockDim.x){
+      sum += exp(x[blockIDX * sx0 + i * sx1] - row_max);
+    };
+    buf[threadIdx.x] = sum; 
+    __syncthreads();
+
+    // This reduction trashes buf[1..N], leaving the reduction result in buf[0].
+    if (threadIdx.x < warpSize){
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize){
+                buf[threadIdx.x] += buf[i];
+      }
+      if (threadIdx.x < 16){
+                //reduce so that threadIdx.x 0 has the sum of everything
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+    // normalize; each thread handles the same elements as in the loops above
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sm[blockIDX * sm_s0 + i * sm_s1] = exp(x[blockIDX * sx0 + i * sx1] - row_max) / row_sum;
+    }
+    // buf is overwritten when the next row is processed
+    __syncthreads();
+  }
+}
+
+// Applies the stable softmax in place to each of the bsize rows of odim values in
+// gpu_data_out by launching KernelSoftmaxStable on the current stream.
+void Gpu::MachSoftmaxStableForw(const int bsize, const int odim, REAL *gpu_data_out)
+{
+  if (0) {
+    Error("Not implemented!");
+    // Disabled reference implementation: it is known to work correctly in all cases,
+    // but it is slower.
+    nppsExp_32f_I(gpu_data_out, bsize*odim);
+
+    REAL *row = gpu_data_out;
+    for (int b = 0; b < bsize; b++, row += odim) {
+      // exp(x) is always positive -> we can use the sum_i (ABS(x_i))
+      REAL sum = Gpu::CublasSasum(odim, row, 1);
+      nppsMulC_32f_I(1.0/sum, row, odim);
+    }
+    return;
+  }
+
+  // The reduction in KernelSoftmaxStable supposes a warp size of 32. Checking this
+  // here would need an access to the GPU properties at each call, the check is
+  // therefore done in MachSoftmaxStable.cpp:
+  //  if(warpSize != 32){
+  //    Error("Gpu::MachSoftmaxStableForw suppose the warpSize is 32. If run with a GPU with other warpSize"
+  //        " like the current GPU, it will return wrong Results. You must update the reduction in KernelSoftmaxStable");
+  //  }
+  if (bsize <= 0)
+    return;
+
+  const int n_blocks = std::min(bsize, 32 * 1024);
+  const int n_threads = std::min(odim, 512);
+  const int n_shared_bytes = n_threads * sizeof(REAL);	// one REAL per thread
+  // input and output are the same matrix: row stride odim, element stride 1
+  KernelSoftmaxStable<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(
+      bsize, odim,
+      gpu_data_out, odim, 1,
+      gpu_data_out, odim, 1);
+
+  const cudaError_t err = cudaGetLastError();
+  if (cudaSuccess == err)
+    return;
+  printf("n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+         n_blocks, n_threads, n_shared_bytes, odim);
+  Error(cudaGetErrorString(err));
+}
+
+//-----------------------------------------------
+// Linear Rectifier units
+//-----------------------------------------------
+
+// Sets all the negative values of gpu_data_out[0..n-1] to zero.
+// Thread t of block b starts at element b*blockDim.x + t and advances by the total
+// number of threads, so that each element is handled by exactly one thread whatever
+// the size of the grid is.
+__global__
+void KernelLinRectifForw(const int n, REAL *gpu_data_out)
+{
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for(int i = id; i < n; i += n_threads){
+    if (gpu_data_out[i]<0) gpu_data_out[i]=0;
+  }
+}
+
+/**
+ * applies the linear rectifier in place: negative values are set to zero
+ * @param n number of values, nothing is done if it is not positive
+ * @param gpu_data_out values on the GPU
+ */
+void Gpu::LinRectifForw(const int n, REAL *gpu_data_out)
+{
+  if (n <= 0)
+    return;	// a kernel can't be launched with an empty grid
+  int nb_thread = std::min(n, 256);
+  // round up so that less than 256 values still get one block
+  int nb_block = std::min((n - 1) / nb_thread + 1, Gpu::curDevProps->maxGridSize[0]);
+  KernelLinRectifForw<<<nb_block, nb_thread, 0, Gpu::curStream>>>(n, gpu_data_out);
+}
+
+// Sets gpu_grad_out[i] to 0 where gpu_data_out[i] is negative and to 1 elsewhere,
+// for i in 0..n-1.
+// Thread t of block b starts at element b*blockDim.x + t and advances by the total
+// number of threads, so that each element is handled by exactly one thread whatever
+// the size of the grid is.
+// NOTE(review): the previous content of gpu_grad_out is overwritten, not scaled --
+// presumably the caller combines this with the incoming gradient, confirm.
+__global__
+void KernelLinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out)
+{
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for(int i = id; i < n; i += n_threads){
+    if (gpu_data_out[i]<0) gpu_grad_out[i]=0; else gpu_grad_out[i]=1;
+  }
+}
+
+/**
+ * calculates the derivative of the linear rectifier
+ * @param n number of values, nothing is done if it is not positive
+ * @param gpu_data_out values on the GPU for which the derivative is calculated
+ * @param gpu_grad_out result on the GPU (0 or 1 for each value)
+ */
+void Gpu::LinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out)
+{
+  if (n <= 0)
+    return;	// a kernel can't be launched with an empty grid
+  int nb_thread = std::min(n, 256);
+  // round up so that less than 256 values still get one block
+  int nb_block = std::min((n - 1) / nb_thread + 1, Gpu::curDevProps->maxGridSize[0]);
+  KernelLinRectifBackw<<<nb_block, nb_thread, 0, Gpu::curStream>>>(n, gpu_data_out, gpu_grad_out);
+}
+
+//-----------------------------------------------
+// Helper functions for drop-out
+//-----------------------------------------------
+
+// Sets gpu_vect[i] to zero wherever rand[i] is below thresh, for i in 0..n-1.
+// Thread t of block b starts at element b*blockDim.x + t and advances by the total
+// number of threads, so that each element is handled by exactly one thread whatever
+// the size of the grid is.
+__global__
+void KernelDropOut(const int n, REAL *gpu_vect, REAL *rand, REAL thresh)
+{
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = id; i < n; i += n_threads) {
+    if (rand[i]<thresh) gpu_vect[i]=0.0;
+  }
+}
+
+/**
+ * drop-out: zeroes the values whose random number is below a threshold
+ * @param n number of values, nothing is done if it is not positive
+ * @param gpu_vect values on the GPU, modified in place
+ * @param rand n random numbers on the GPU
+ * @param thresh values with rand[i] < thresh are set to zero
+ */
+void Gpu::DropOut(const int n, REAL *gpu_vect, REAL *rand, REAL thresh)
+{
+  if (n <= 0)
+    return;	// a kernel can't be launched with an empty grid
+  int nb_thread = std::min(n, 256);
+  // round up so that less than 256 values still get one block
+  int nb_block = std::min((n - 1) / nb_thread + 1, Gpu::curDevProps->maxGridSize[0]);
+  KernelDropOut<<<nb_block, nb_thread, 0, Gpu::curStream>>>(n, gpu_vect, rand, thresh);
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValue
+//-----------------------------------------------
+
+// Adds to *gpu_res the sum over the bunch of safelog(output of the target word).
+// gpu_target holds the target word indices stored as REAL.
+// buf is dynamic shared memory with one REAL per thread.
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValue(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target,
+                                             REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+
+  // partial sum of this thread
+  REAL partial = 0.0;
+  for (int ex = threadIdx.x ; ex < bsize ; ex += blockDim.x) {
+    const uint tgt = (uint) gpu_target[ex];
+    partial += gpu_safelog(gpu_data_out[ex*odim + tgt]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all the other threads to its own one
+  if (threadIdx.x != 0)
+    return;
+  REAL total = partial;
+  for (int t = 1 ; t < blockDim.x ; t++)
+    total += buf[t];
+  atomicAdd(gpu_res, total);
+}
+
+
+// Returns the sum over the bunch of the log of the outputs of the target words.
+// The sum is calculated by one block on the current stream; the call waits for the result.
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValue(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  // the accumulator on the GPU is allocated at the first call
+  if (NULL == gpu_result)
+    cudaMalloc(&gpu_result, sizeof(REAL));
+  cudaMemsetAsync(gpu_result, 0, sizeof(REAL), Gpu::curStream);	// the kernel will atomicAdd into it
+
+  const int nb_thread = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  KernelErrFctSoftmCrossEntNgramCalcValue<<<1, nb_thread, nb_thread*sizeof(REAL), Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_target, gpu_result);
+
+  REAL host_sum;
+  cudaMemcpyAsync(&host_sum, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return host_sum;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValueNull
+//-----------------------------------------------
+
+// Adds to *gpu_res the sum over the bunch of safelog(output of the target word),
+// examples whose target is NULL_WORD are skipped.
+// gpu_target holds the target word indices stored as REAL.
+// buf is dynamic shared memory with one REAL per thread.
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueNull(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target,
+                                                 REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+
+  // partial sum of this thread
+  REAL partial = 0.0;
+  for (int ex = threadIdx.x ; ex < bsize ; ex += blockDim.x) {
+    const int tgt = gpu_target[ex]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+    if (tgt == NULL_WORD)
+      continue;
+    partial += gpu_safelog(gpu_data_out[ex*odim + tgt]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all the other threads to its own one
+  if (threadIdx.x != 0)
+    return;
+  REAL total = partial;
+  for (int t = 1 ; t < blockDim.x ; t++)
+    total += buf[t];
+  atomicAdd(gpu_res, total);
+}
+
+
+// Returns the sum over the bunch of the log of the outputs of the target words,
+// ignoring the examples with a NULL_WORD target.
+// The sum is calculated by one block on the current stream; the call waits for the result.
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValueNull(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  // the accumulator on the GPU is allocated at the first call
+  if (NULL == gpu_result)
+    cudaMalloc(&gpu_result, sizeof(REAL));
+  cudaMemsetAsync(gpu_result, 0, sizeof(REAL), Gpu::curStream);	// the kernel will atomicAdd into it
+
+  const int nb_thread = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  KernelErrFctSoftmCrossEntNgramCalcValueNull<<<1, nb_thread, nb_thread*sizeof(REAL), Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_target, gpu_result);
+
+  REAL host_sum;
+  cudaMemcpyAsync(&host_sum, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return host_sum;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValueBatch
+//-----------------------------------------------
+
+// Stores for each example of the bunch safelog(output of the target word) in tmp_buf,
+// or LOG_PROBA_NONE if the target is NULL_WORD.
+// gpu_target holds the target word indices stored as REAL.
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueBatch(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *tmp_buf)
+{
+  for (int ex = threadIdx.x ; ex < bsize ; ex += blockDim.x) {
+    const int tgt = gpu_target[ex]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+    if (tgt != NULL_WORD) {
+      tmp_buf[ex] = gpu_safelog(gpu_data_out[ex*odim + tgt]);
+      continue;
+    }
+    tmp_buf[ex] = LOG_PROBA_NONE;	// handle NULL_WORD
+  }
+}
+
+
+/**
+ * calculates the log of the output of the target word for each example of the bunch
+ * @param bsize number of examples in the bunch
+ * @param odim number of outputs per example
+ * @param gpu_data_out outputs on the GPU (bsize x odim)
+ * @param gpu_target target word indices on the GPU (stored as REAL)
+ * @param res_vect result on the host (bsize values, LOG_PROBA_NONE for NULL_WORD targets)
+ * @note the values go through the internal GPU buffer, the call waits for the result
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcValueBatch(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *res_vect)
+{
+  // the kernel writes one value per example into gpu_buf: it is the size of the
+  // bunch that must fit, not the number of outputs
+  if (bsize > GPU_BUF_DIM)
+    ErrorN("Gpu::ErrFctSoftmCrossEntNgramCalcValueBatch(): bsize (%d) is larger than internal buffer (%d)", bsize, GPU_BUF_DIM);
+  if (bsize <= 0)
+    return;	// nothing to calculate, and a kernel can't be launched without threads
+  int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  KernelErrFctSoftmCrossEntNgramCalcValueBatch<<<1, n_threads, 0, Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_buf);
+  cudaMemcpyAsync(res_vect, gpu_buf, bsize*sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcMax
+//-----------------------------------------------
+
+void Gpu::ErrFctSoftmCrossEntNgramCalcMax(const int eff_bsize, const int dim, REAL *output, REAL *target, REAL *res, int *pos)
+{
+  Error("TODO: Gpu::ErrFctSoftmCrossEntNgramCalcMax()");
+}
+
+#if 0 // not used anymore, use CalcvalueBatch() instead
+// Disabled code: log of the output of the target word for the example idx only.
+// NOTE(review): this would not compile if enabled again, see the two notes below.
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *gpu_res)
+{
+  int tidx = (int) gpu_target[idx]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+  // NOTE(review): "tdx" is not declared, "tidx" is probably meant
+  if (tdx<0) // NULL_WORD 
+    *gpu_res=-1;
+  else
+    *gpu_res = gpu_safelog(gpu_data_out[idx*odim + tidx]);
+}
+
+
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  REAL res;
+  // the result on the GPU is allocated at the first call
+  if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL));
+  KernelErrFctSoftmCrossEntNgramCalcValueNth<<<1, 1, 1*sizeof(REAL), Gpu::curStream>>>(idx, odim, gpu_data_out, gpu_target, gpu_result);
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return res;
+  // NOTE(review): the closing brace of this function is missing
+#endif
+
+
+//-----------------------------------------------
+// ErrFctSoftmClassCrossEntNgram::CalcWordClassError
+//-----------------------------------------------
+
+/**
+ * Counts the examples whose most active class differs from the target class.
+ * For each of the bsize rows of gpu_class_out (n_classes values per row) the
+ * index of the largest value is compared with the corresponding entry of
+ * gpu_class_target (truncated to int); the number of mismatches is written to
+ * *gpu_res. On ties the lowest index wins.
+ * The kernel does not use any thread or block index: every launched thread
+ * performs the whole computation.
+ */
+__global__
+void KernelErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target,
+                                 REAL *gpu_res)
+{
+  int n_wrong = 0;
+  REAL *row = gpu_class_out;
+  for (int ex = 0; ex < bsize; ex++, row += n_classes) {
+    // arg-max over the classes of this example
+    int best = 0;
+    REAL best_val = row[0];
+    for (int c = 1; c < n_classes; c++) {
+      const REAL val = row[c];
+      if (val > best_val) {
+        best_val = val;
+        best = c;
+      }
+    }
+    if ((int) gpu_class_target[ex] != best)
+      n_wrong++;
+  }
+  *gpu_res = (REAL) n_wrong;
+}
+
+/**
+ * Parallel count of the examples whose most active class differs from the
+ * target class. Thread t handles examples t, t + blockDim.x, ...; the
+ * per-thread counts are then summed and the total is written to *gpu_res.
+ *
+ * Requires blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+ * The block index is not used: every launched block computes and writes the
+ * same total.
+ *
+ * The previous reduction bounded its last steps by n_classes, which is
+ * unrelated to the number of partial sums (one per thread): it could skip
+ * partial sums or read past the shared buffer. The partial sums are now added
+ * by thread 0 after a block-wide barrier, the same scheme the other reduction
+ * kernels of this file use.
+ */
+__global__ void KernelErrFctSoftmClassError2(const int bsize, const int n_classes,
+    REAL *gpu_class_out, REAL *gpu_class_target, REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+  REAL n_err = 0;
+  for (int i = threadIdx.x; i < bsize; i += blockDim.x) {
+    // arg-max over the classes of example i (lowest index wins on ties)
+    int argmax = 0;
+    REAL max_oclass = gpu_class_out[i*n_classes];
+    for (int j = 1; j < n_classes; j++) {
+      REAL oclass_j = gpu_class_out[i*n_classes + j];
+      if (oclass_j > max_oclass) {
+        argmax = j;
+        max_oclass = oclass_j;
+      }
+    }
+    if ((int) gpu_class_target[i] != argmax)
+      n_err += 1;
+  }
+  buf[threadIdx.x] = n_err;
+  // all partial counts must be visible before thread 0 adds them up
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    REAL total = 0;
+    for (int t = 0; t < blockDim.x; t++)
+      total += buf[t];
+    *gpu_res = total;
+  }
+}
+
+/**
+ * Returns the number of examples of the batch whose most active class differs
+ * from the target class, as computed on the GPU by
+ * KernelErrFctSoftmClassError2.
+ *
+ * @param bsize            number of examples
+ * @param n_classes        number of class outputs per example
+ * @param gpu_class_out    device pointer to the class outputs
+ * @param gpu_class_target device pointer to the target class indices
+ * @return the error count; 0 for an empty batch
+ */
+REAL Gpu::ErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target)
+{
+  // An empty batch has no errors. This also avoids a launch with zero threads
+  // (the block-count computation used previously divided by that zero).
+  if (bsize <= 0)
+    return 0;
+
+  REAL res;
+  if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL));
+  int n_threads = std::min(bsize, 512);
+  int n_shared_bytes = n_threads * sizeof(REAL);
+  // The kernel strides over the whole batch with blockDim.x and ignores the
+  // block index, so a single block covers every example; additional blocks
+  // would only repeat the same work and write the same result.
+  KernelErrFctSoftmClassError2<<<1, n_threads, n_shared_bytes, Gpu::curStream>>>(
+      bsize, n_classes, gpu_class_out, gpu_class_target, gpu_result);
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  // res is host memory: wait for the copy before returning it
+  cudaStreamSynchronize(Gpu::curStream);
+  return res;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGrad
+//-----------------------------------------------
+/**
+ * @note This kernel needs many blocks to compute the gradient, but it also
+ * needs to do a reduction. The first block does the reduction and computes the
+ * gradient at the target position of every example; all the other blocks
+ * compute the gradient for the other words.
+ *
+ * gpu_grad is updated in place: the value at the target index becomes
+ * (1 - value), every other value is negated. The sum over the batch of the log
+ * of the output at the target index is written to *gpu_res.
+ * Block 0 needs blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGrad(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+					    REAL *gpu_res)
+{
+  if (blockIdx.x == 0) {
+    // the first block computes the error and grad for used words
+    extern __shared__ REAL buf[];
+    REAL err=0.0;
+    // thread t handles examples t, t + blockDim.x, ...
+    for (int b=threadIdx.x; b<bsize; b+=blockDim.x) {
+      unsigned int tidx=(uint) gpu_target[b];
+      gpu_grad[b*odim + tidx] = (1.0f - gpu_grad[b*odim + tidx]);
+      err += gpu_safelog(gpu_data_out[b*odim + tidx]);
+    }
+    buf[threadIdx.x] = err;
+    __syncthreads();
+    // thread 0 adds the partial sums of the other threads to its own
+    if (threadIdx.x == 0) {
+      for (int i=1; i<blockDim.x; i++)
+        err += buf[i];
+      *gpu_res=err;
+    }
+  }
+  else
+    // the next blocks computes the grad for all other words
+    // (examples are distributed over the gridDim.x-1 remaining blocks; the
+    // target position is skipped since block 0 takes care of it)
+    for (int b=blockIdx.x-1; b<bsize; b+=gridDim.x-1) {
+      unsigned int tidx=(uint) gpu_target[b];
+      for (int i=threadIdx.x; i<odim; i+=blockDim.x)
+        if (tidx != (uint)i)
+          gpu_grad[b*odim + i] *= -1.0f;
+    }
+}
+
+/**
+ * Host side of the cross-entropy gradient: copies the outputs into gpu_grad
+ * and launches KernelErrFctSoftmCrossEntNgramCalcGrad on the current stream,
+ * which turns the copy into the gradient in place and stores the error value
+ * at gpu_res. The call does not wait for the kernel to finish.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         number of outputs per example
+ * @param gpu_data_out device pointer to the outputs (bsize x odim)
+ * @param gpu_grad     device pointer receiving the gradient (bsize x odim)
+ * @param gpu_target   device pointer to the target indices
+ * @param gpu_res      device pointer receiving the error value
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcGrad(const int bsize, const int odim, REAL *gpu_data_out,
+                                         REAL *gpu_grad, REAL *gpu_target, REAL * gpu_res)
+{
+  // the kernel works in place on a copy of the outputs
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  // one block for the reduction plus (at most) one block per example
+  const int grid_size = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1);
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  // one partial sum per thread
+  const int shmem_bytes = block_size * sizeof(REAL);
+  KernelErrFctSoftmCrossEntNgramCalcGrad<<<grid_size, block_size, shmem_bytes, Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGrad: %s", cudaGetErrorString(launch_status));
+  }
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGradNull
+//-----------------------------------------------
+/**
+ * @note This kernel needs many blocks to compute the gradient, but it also
+ * needs to do a reduction. The first block does the reduction and computes the
+ * gradient at the target position of every example; all the other blocks
+ * compute the gradient for the other words.
+ *
+ * Same scheme as KernelErrFctSoftmCrossEntNgramCalcGrad, with support for
+ * targets equal to NULL_WORD: such examples do not contribute to the error
+ * and their whole gradient row is set to 0.
+ * Block 0 needs blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGradNull(const int bsize, const int odim,
+     REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+                                            REAL *gpu_res)
+{
+  if (blockIdx.x == 0) {
+    // the first block computes the error and grad for non NULL words
+    extern __shared__ REAL buf[];
+    REAL err=0.0;
+    for (int b=threadIdx.x; b<bsize; b+=blockDim.x) {
+      //Do not cast or use unsigned for tidx. Otherwise, nvcc will transform the -1 to 0!
+      //NOTE(review): the original remark here read "This is a difference compared
+      //to the GPU!"; presumably the CPU implementation is meant - confirm.
+      int tidx = gpu_target[b];
+      //NOTE(review): the debug arguments index the arrays with tidx before the
+      //NULL_WORD test below; whether they are evaluated depends on how debug5
+      //is defined (not visible here).
+      debug5(" -batch=%d target=%d -> output at %p is %f, update grad at %p\n", b, tidx, &(gpu_data_out[b*odim + tidx]), gpu_data_out[b*odim + tidx], &(gpu_grad[b*odim+tidx]));
+      if (tidx != NULL_WORD) {
+        gpu_grad[b*odim + tidx] = (1.0f - gpu_grad[b*odim + tidx]);
+        err += gpu_safelog(gpu_data_out[b*odim + tidx]);
+      }
+    }
+    buf[threadIdx.x] = err;
+    __syncthreads();
+    // thread 0 adds the partial sums of the other threads to its own
+    if (threadIdx.x == 0) {
+      for (int i=1; i<blockDim.x; i++)
+        err += buf[i];
+      *gpu_res=err;
+    }
+  }
+  else
+    // the next blocks computes the grad for all other words
+    // (examples are distributed over the gridDim.x-1 remaining blocks)
+    for (int b=blockIdx.x-1; b<bsize; b+=gridDim.x-1) {
+      int tidx = gpu_target[b];
+      for (int i=threadIdx.x; i<odim; i+=blockDim.x) {
+        if (tidx == NULL_WORD)
+          gpu_grad[b*odim + i] = 0;	// no gradient at all for a NULL target
+        else if (tidx != i)
+          gpu_grad[b*odim + i] *= -1.0f;
+      }
+    }
+}
+
+/**
+ * Host side of the cross-entropy gradient with NULL_WORD support: copies the
+ * outputs into gpu_grad and launches
+ * KernelErrFctSoftmCrossEntNgramCalcGradNull on the current stream, which
+ * turns the copy into the gradient in place and stores the error value at
+ * gpu_res. The call does not wait for the kernel to finish.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         number of outputs per example
+ * @param gpu_data_out device pointer to the outputs (bsize x odim)
+ * @param gpu_grad     device pointer receiving the gradient (bsize x odim)
+ * @param gpu_target   device pointer to the target indices
+ * @param gpu_res      device pointer receiving the error value
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcGradNull(const int bsize, const int odim, REAL *gpu_data_out,
+                                         REAL *gpu_grad, REAL *gpu_target, REAL * gpu_res)
+{
+  // the kernel works in place on a copy of the outputs
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  // one block for the reduction plus (at most) one block per example
+  const int grid_size = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1);
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  // one partial sum per thread
+  const int shmem_bytes = block_size * sizeof(REAL);
+  KernelErrFctSoftmCrossEntNgramCalcGradNull<<<grid_size, block_size, shmem_bytes, Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGradNull: %s", cudaGetErrorString(launch_status));
+  }
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGradCumul
+//-----------------------------------------------
+/**
+ * @note This kernel needs many blocks to compute the gradient, but it also
+ * needs to do a reduction. The first block does the reduction and computes the
+ * gradient at the target position of every example; all the other blocks
+ * compute the gradient for the other words.
+ *
+ * Same scheme as KernelErrFctSoftmCrossEntNgramCalcGrad, except that the error
+ * value is added to *gpu_res (atomicAdd) instead of overwriting it.
+ * Block 0 needs blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGradCumul(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+					    REAL *gpu_res)
+{
+  if (blockIdx.x == 0) {
+    // the first block computes the error and grad for used words
+    extern __shared__ REAL buf[];
+    REAL err=0.0;
+    unsigned int tidx;	// here: flat index of the target value inside the bsize x odim arrays
+
+    for (int b=threadIdx.x ; b<bsize ; b+=blockDim.x) {
+      tidx=(b*odim + (uint) gpu_target[b]);
+      gpu_grad[tidx] = (1.0f - gpu_grad[tidx]);
+      err += gpu_safelog(gpu_data_out[tidx]);
+    }
+    buf[threadIdx.x] = err;
+    __syncthreads();
+    // thread 0 adds the partial sums of the other threads to its own
+    if(threadIdx.x == 0) {
+      for(int i=1 ; i<blockDim.x ; i++)
+        err += buf[i];
+      atomicAdd(gpu_res, err);
+    }
+  }
+  else
+    // the next blocks computes the grad for all other words
+    // (examples are distributed over the gridDim.x-1 remaining blocks; the
+    // target position is skipped since block 0 takes care of it)
+    for (int b=blockIdx.x-1; b<bsize; b+=gridDim.x-1) {
+      unsigned int tidx = gpu_target[b];	// here: target index within the row
+      for (int i=threadIdx.x; i<odim; i+=blockDim.x)
+        if (tidx != (uint)i)
+          gpu_grad[b*odim + i] *= -1.0f;
+    }
+}
+
+
+/**
+ * Unfinished host wrapper around KernelErrFctSoftmCrossEntNgramCalcGradCumul.
+ * It clears the device-side accumulator gpu_result, copies the outputs into
+ * gpu_grad, launches the kernel and then unconditionally reports through
+ * Error() that the function is not finished. Nothing is returned.
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcGradCumul(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target)
+{
+  // lazily allocate the device-side scalar receiving the error value
+  if (gpu_result==NULL)
+    cudaMalloc(&gpu_result,sizeof(REAL));
+
+  // one block for the reduction plus (at most) one block per example
+  const int grid_size = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1);
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+
+  // the kernel adds into gpu_result with atomicAdd, so it has to start at zero
+  cudaMemsetAsync(gpu_result, 0, sizeof(REAL), Gpu::curStream);
+  // the kernel works in place on a copy of the outputs
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);
+  KernelErrFctSoftmCrossEntNgramCalcGradCumul<<<grid_size, block_size, block_size*sizeof(REAL), Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_result);
+  Error("Gpu::ErrFctSoftmCrossEntNgramCalcGradCumul not finished!");
+
+  // Disabled read-back of the accumulated value:
+  //REAL res;
+  //cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  //cudaStreamSynchronize(Gpu::curStream);
+  //return res;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgramMulti::CalcGrad
+//-----------------------------------------------
+/**
+ * @note This kernel needs many blocks to compute the gradient, but it also
+ * needs to do a reduction. The blocks with blockIdx.y == 0 do the reduction
+ * and compute the gradient at the target positions; all the other blocks
+ * compute the gradient for the other words.
+ *
+ * Each example holds nb consecutive output vectors of size dim, with one
+ * target per vector. gpu_grad is updated in place: the value at a target
+ * index becomes (1 - value), the other values are negated, and vectors whose
+ * target is NULL_WORD are set to 0. The log of the output at every non-NULL
+ * target is accumulated into *gpu_res with atomicAdd, so the caller has to
+ * clear *gpu_res before the launch.
+ *
+ * The blocks with blockIdx.y == 0 need min(nb, blockDim.x) * sizeof(REAL)
+ * bytes of dynamic shared memory.
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramMultiCalcGrad(const int bsize, const int dim, const int nb,
+						 REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+	                                         REAL *gpu_res)
+{
+  if (blockIdx.y == 0) {
+    // the first part of blocks computes the error and grad for non NULL words
+    extern __shared__ REAL buf[];
+    REAL err=0.0;
+    // Only the first nb threads have work and a slot in buf[]. The barrier
+    // below is nevertheless executed by every thread of the block: it must not
+    // be placed inside this thread-dependent condition.
+    if (threadIdx.x < nb) {
+      for (int b=blockIdx.x; b<bsize; b+=gridDim.x)
+        for (int n=threadIdx.x; n<nb; n+=blockDim.x) {
+          int tidx=(int) gpu_target[b*nb + n];
+          if (tidx != NULL_WORD) {
+            gpu_grad[(b*nb+n)*dim + tidx] = (1.0 - gpu_grad[(b*nb+n)*dim + tidx]);
+            err += gpu_safelog(gpu_data_out[(b*nb+n)*dim + tidx]);
+            debug6("grad ngram-multi:  b=%d, n=%d, tidx=%u, out=%f -> err=%e, grad@target=%e\n", b, n, tidx, gpu_data_out[(b*nb+n)*dim + tidx], err, gpu_grad[(b*nb+n)*dim + tidx]);
+          }
+          else {
+            // NOTE(review): the debug argument below indexes gpu_data_out with
+            // tidx == NULL_WORD; whether it is evaluated depends on how debug4
+            // is defined (not visible here).
+            debug4("grad ngram-multi:  b=%d, n=%d, tidx=NULL, out=%f -> err=%e\n", b, n, gpu_data_out[(b*nb+n)*dim + tidx], err);
+          }
+        }
+      buf[threadIdx.x] = err;
+    }
+    __syncthreads();
+    // thread 0 adds the partial sums of the other working threads to its own
+    if (threadIdx.x == 0) {
+      for (int i=1; (i<nb) && (i<blockDim.x); i++)
+        err += buf[i];
+      atomicAdd(gpu_res, err);
+    }
+  }
+  else if (threadIdx.x < dim)
+    // the next blocks computes the grad for all other words
+    // (the nb vectors of an example are distributed over the gridDim.y-1 remaining block rows)
+    for (int b=blockIdx.x; b<bsize; b+=gridDim.x)
+      for (int n=(blockIdx.y-1); n<nb; n+=(gridDim.y-1)) {
+        int tidx=(int) gpu_target[b*nb + n];
+        for (int i=threadIdx.x; i<dim; i+=blockDim.x) {
+          if (tidx == NULL_WORD)
+            gpu_grad[(b*nb+n)*dim + i] = 0;
+          else if (tidx != i)
+            gpu_grad[(b*nb+n)*dim + i] *= -1.0;
+        }
+      }
+}
+
+/**
+ * Computes the cross-entropy gradient for nb output vectors per example and
+ * returns the accumulated error value.
+ * The outputs are copied into gpu_grad, the device-side accumulator is
+ * cleared, KernelErrFctSoftmCrossEntNgramMultiCalcGrad is launched and its
+ * result is copied back to the host. The call waits for the current stream.
+ *
+ * @param bsize        number of examples in the batch
+ * @param dim          size of one output vector
+ * @param nb           number of output vectors (and targets) per example
+ * @param gpu_data_out device pointer to the outputs (bsize x nb x dim)
+ * @param gpu_grad     device pointer receiving the gradient (same size)
+ * @param gpu_target   device pointer to the bsize x nb target indices
+ * @return the value accumulated by the kernel
+ */
+REAL Gpu::ErrFctSoftmCrossEntNgramMultiCalcGrad(const int bsize, const int dim, const int nb,
+                                              REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target)
+{
+  if (gpu_result==NULL) cudaMalloc(&gpu_result, sizeof(REAL));
+
+  // the byte count is computed in size_t so that large batches cannot overflow an int
+  const size_t n_bytes = (size_t) bsize * nb * dim * sizeof(REAL);
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, n_bytes,
+             cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  cudaMemsetAsync(gpu_result, 0, sizeof(REAL), Gpu::curStream);//Each block will atomicAdd into it.
+
+  cudaError_t sts = cudaGetLastError();
+  if (cudaSuccess != sts)
+    Error("Error before KernelErrFctSoftmCrossEntNgramMultiCalcGrad");
+  // enough threads for the larger of the two roles (nb targets or dim outputs)
+  int nb_threads = std::min(std::max(nb, dim), Gpu::curDevProps->maxThreadsDim[0]);
+  // only the first nb threads of the reducing blocks store a partial sum
+  int n_shared_bytes = std::min(nb, nb_threads) * sizeof(REAL);
+  // grid: x over the examples, y = one row of reducing blocks + one row per output vector
+  dim3 nb_blocks(std::min( bsize, Gpu::curDevProps->maxGridSize[0]),
+                 std::min(nb + 1, Gpu::curDevProps->maxGridSize[1]));
+  KernelErrFctSoftmCrossEntNgramMultiCalcGrad<<<nb_blocks, nb_threads, n_shared_bytes, Gpu::curStream>>>(
+    bsize, dim, nb, gpu_data_out, gpu_grad, gpu_target, gpu_result);
+  sts = cudaGetLastError();
+  if (cudaSuccess != sts)
+  {
+    // The CUDA message is passed as an argument, never as the format string.
+    ErrorN("KernelErrFctSoftmCrossEntNgramMultiCalcGrad cuda error: %s", cudaGetErrorString(sts));
+  }
+  REAL res;
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  // res is host memory: wait for the copy before returning it
+  cudaStreamSynchronize(Gpu::curStream);
+
+  return res;
+}
+
+
+//-----------------------------------------------
+// MachSoftmaxClass
+//-----------------------------------------------
+// Forw
+/* This function performs the equivalent of various Gemv, with different sizes
+   and offsets for each example in a minibatch.
+
+   For example i, class_info[2*i] is the offset of the first output to compute
+   and class_info[2*i+1] the number of outputs. For each of them:
+     output[i*odim + offset + j] = bias[offset + j]
+                                   + sum_k input[i*idim + k] * weights[k*odim + offset + j]
+
+   Requires idim * sizeof(REAL) bytes of dynamic shared memory. */
+__global__ void KernelLinForwOffset(const int bsize, const int idim, const int odim,
+                                    REAL* input, REAL* weights, REAL* bias, REAL* output,
+                                    int* class_info)
+{
+  // Each block corresponds to one (or more) sub-vector of the output. Each thread
+  // corresponds to one of its elements.
+  // Axis x of the grid corresponds to the output rows: if sizes takes large values,
+  // j will need to go beyond gridDim.x * blockDim.x
+  // Axis y of the grid corresponds to the batch size.
+
+  extern __shared__ REAL buf[];
+
+  for (int i = blockIdx.y; i < bsize; i += gridDim.y) {
+    int offset = class_info[2*i];
+    int size = class_info[2*i+1];
+    REAL* in_vec = input + i*idim;
+
+    // Copy in_vec into shared memory, so all threads in this block can access it faster
+    for (int k = threadIdx.x; k < idim; k += blockDim.x) {
+      buf[k] = in_vec[k];
+    }
+    __syncthreads();
+
+    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < size; j += blockDim.x * gridDim.x) {
+      // Compute one (vector-vector) dot product
+      REAL dot = bias[offset + j];
+      REAL* w_vec = weights + offset + j;
+      for (int k = 0; k < idim; k++) {
+        dot += buf[k] * w_vec[k*odim];
+      }
+      output[i*odim + offset + j] = dot;
+    }
+
+    // When a block handles several examples (bsize > gridDim.y), buf[] is
+    // overwritten at the start of the next iteration: wait until every thread
+    // is done reading it. The trip count of the loop over i only depends on
+    // blockIdx.y, so all threads of the block reach this barrier.
+    __syncthreads();
+  }
+}
+
+/**
+ * Launches KernelLinForwOffset on the current stream: for every example of
+ * the batch, the linear outputs of one sub-range described by class_info are
+ * computed. The call does not wait for the kernel to finish.
+ *
+ * @param bsize      number of examples in the batch
+ * @param idim       input dimension
+ * @param odim       total output dimension (row length of weights and output)
+ * @param input      device pointer to the inputs (bsize x idim)
+ * @param weights    device pointer to the weights
+ * @param bias       device pointer to the biases
+ * @param output     device pointer receiving the outputs (bsize x odim)
+ * @param class_info device pointer to (offset, size) pairs, one per example
+ * @param max_size   upper bound of the sizes in class_info, used to choose
+ *                   the launch configuration
+ */
+void Gpu::MachSoftmaxClassLinForw(const int bsize, const int idim, const int odim,
+                                REAL* input, REAL* weights, REAL* bias, REAL* output,
+                                int* class_info, const int max_size)
+{
+  debug4("bsize: %d, idim: %d, odim: %d, max_size: %d\n", bsize, idim, odim, max_size);
+  // Nothing to compute. This also protects the block-count computation below
+  // from a division by zero (n_threads would be 0 for max_size == 0).
+  if (bsize <= 0 || max_size <= 0)
+    return;
+
+  int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], max_size);
+  int n_blocks_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  // ceil(max_size / n_threads) blocks are needed to cover the largest sub-range
+  int n_blocks_x = std::min(Gpu::curDevProps->maxGridSize[0], max_size/n_threads + (max_size%n_threads==0?0:1));
+  // the kernel keeps one input vector in shared memory
+  int n_shared_bytes = idim*sizeof(REAL);
+  dim3 n_blocks(n_blocks_x, n_blocks_y);
+
+  debug3("n_threads: %d, n_blocks: (%d, %d)\n", n_threads, n_blocks_x, n_blocks_y);
+  KernelLinForwOffset<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(
+      bsize, idim, odim, input, weights, bias, output, class_info);
+  cudaError_t err = cudaGetLastError();
+  if(cudaSuccess != err){
+    printf("KernelLinForwOffset: n_blocks=(%d, %d), n_threads=%d, shared=%d bytes\n",
+           n_blocks_x, n_blocks_y, n_threads, n_shared_bytes);
+    Error(cudaGetErrorString(err));
+  }
+}
+
+// Softmax over one sub-range of each row of x, written to sm.
+// Row r (M rows, distributed over the blocks) uses the elements
+// offset .. offset+size-1, with offset = offsets[r * offsets_s] and
+// size = sizes[r * sizes_s]. sx0/sx1 and sm_s0/sm_s1 are the row and element
+// strides of x and sm. Elements outside the sub-range are not touched.
+// Requires blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+// No maximum is subtracted before exp(); see KernelBatchedSoftmaxStableOffset
+// for the variant that does.
+__global__ void KernelBatchedSoftmaxOffset(int M,
+    const REAL * x, const int sx0, const int sx1,
+    REAL * sm, const int sm_s0, const int sm_s1,
+    int * offsets, const int offsets_s,
+    int * sizes, const int sizes_s)
+{
+  extern __shared__ REAL buf[];
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    REAL sum = 0;
+    int offset = offsets[blockIDX * offsets_s];
+    int size = sizes[blockIDX * sizes_s];
+    // each thread sums exp() over its share of the sub-range
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      sum += exp(x[blockIDX * sx0 + (offset + i) * sx1]);
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // This function trashes buf[1..warpsize], leaving the reduction result in buf[0].
+    // NOTE(review): there is no synchronization between the steps below; this
+    // presumably relies on the threads of the first warp running in lockstep -
+    // confirm for the targeted architectures.
+    if (threadIdx.x < warpSize){
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){
+        buf[threadIdx.x] += buf[i];
+      }
+      if (threadIdx.x < 16){
+        //reduce so that threadIdx.x 0 has the sum of everything
+        if (threadIdx.x + 16 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+        if (threadIdx.x + 8 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+        if (threadIdx.x + 4 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+        if (threadIdx.x + 2 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+        if (threadIdx.x + 1 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+    // normalize
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x){
+      sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1]) / row_sum;
+    }
+    // buf[] is reused for the next row
+    __syncthreads();
+  }
+}
+
+// Softmax over one sub-range of each row of x, written to sm; same interface
+// as KernelBatchedSoftmaxOffset. The maximum of the sub-range is subtracted
+// before exp() is applied.
+// Row r (M rows, distributed over the blocks) uses the elements
+// offset .. offset+size-1, with offset = offsets[r * offsets_s] and
+// size = sizes[r * sizes_s].
+// Requires blockDim.x * sizeof(REAL) bytes of dynamic shared memory.
+__global__ void KernelBatchedSoftmaxStableOffset(int M,
+    const REAL * x, const int sx0, const int sx1,
+    REAL * sm, const int sm_s0, const int sm_s1,
+    int * offsets, const int offsets_s,
+    int * sizes, const int sizes_s)
+{
+  extern __shared__ REAL buf[];
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    int offset = offsets[blockIDX * offsets_s];
+    int size = sizes[blockIDX * sizes_s];
+    // NOTE(review): this first read is done by every thread, also by those
+    // with threadIdx.x >= size; their value is not used by the reduction
+    // below (bounded by size), but the element read lies outside the sub-range.
+    REAL max_ = x[blockIDX * sx0 + (offset + threadIdx.x) * sx1];
+    for (int i = threadIdx.x + blockDim.x; i < size; i += blockDim.x) {
+      max_ = max(max_, x[blockIDX * sx0 + (offset + i) * sx1]);
+    };
+    buf[threadIdx.x] = max_;
+    __syncthreads();
+
+    // This function trashes buf[1..n_threads], leaving the reduction result in buf[0].
+    // Find the max to stabilize the softmax
+    // NOTE(review): there is no synchronization between the steps below; this
+    // presumably relies on the threads of the first warp running in lockstep -
+    // confirm for the targeted architectures.
+    if (threadIdx.x < warpSize)
+    {
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize) {
+        buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
+      }
+      if (threadIdx.x < 16) {
+        //reduce so that threadIdx.x 0 has the max of everything
+        if (threadIdx.x + 16 < size)
+          buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+16]);
+        if (threadIdx.x + 8 < size)
+          buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+8]);
+        if (threadIdx.x + 4 < size)
+          buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+4]);
+        if (threadIdx.x + 2 < size)
+          buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+2]);
+        if (threadIdx.x + 1 < size)
+          buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+1]);
+      }
+    }
+    __syncthreads();
+    REAL row_max = buf[0];
+    // everybody must have read buf[0] before buf[] is reused for the sums
+    __syncthreads();
+
+    // each thread sums exp(x - row_max) over its share of the sub-range
+    REAL sum = 0;
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      sum += exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max);
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // This function trashes buf[1..warpsize], leaving the reduction result in buf[0].
+    // (same unsynchronized scheme as for the maximum above)
+    if (threadIdx.x < warpSize){
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){
+                buf[threadIdx.x] += buf[i];
+      }
+      if (threadIdx.x < 16) {
+        //reduce so that threadIdx.x 0 has the sum of everything
+        if (threadIdx.x + 16 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+        if (threadIdx.x + 8 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+        if (threadIdx.x + 4 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+        if (threadIdx.x + 2 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+        if (threadIdx.x + 1 < size)
+          buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+    // normalize
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x){
+      sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max) / row_sum;
+    }
+    // buf[] is reused for the next row
+    __syncthreads();
+  }
+}
+
+/**
+ * Applies, in place, a softmax to one sub-range of every row of gpu_data_out.
+ * The sub-range of example i starts at class_info[2*i] and contains
+ * class_info[2*i+1] values. The kernel is launched on the current stream; the
+ * call does not wait for it to finish. Nothing is done for bsize <= 0.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         row length of gpu_data_out
+ * @param gpu_data_out device pointer to the values (bsize x odim), overwritten
+ * @param class_info   device pointer to (offset, size) pairs, one per example
+ * @param max_size     upper bound of the sizes, used to choose the block size
+ * @param stable       if non zero, use the variant that subtracts the maximum
+ *                     before the exponential
+ */
+void Gpu::MachSoftmaxClassSoftmForw(const int bsize, const int odim, REAL* gpu_data_out,
+                                  int* class_info, const int max_size, const int stable)
+{
+  if (bsize <= 0)
+    return;
+
+  int n_blocks = std::min(bsize, 32 * 1024);
+  int n_threads = std::min(max_size, 512);
+  // one partial result per thread
+  int n_shared_bytes = n_threads * sizeof(REAL);
+  const char *kernel_name;
+  // input and output are the same buffer; offsets and sizes are interleaved in
+  // class_info, hence the stride of 2
+  if (stable) {
+    kernel_name = "KernelBatchedSoftmaxStableOffset";
+    KernelBatchedSoftmaxStableOffset<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(bsize,
+        gpu_data_out, odim, 1,
+        gpu_data_out, odim, 1,
+        class_info, 2,
+        class_info + 1, 2);
+  }
+  else {
+    kernel_name = "KernelBatchedSoftmaxOffset";
+    KernelBatchedSoftmaxOffset<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(bsize,
+        gpu_data_out, odim, 1,
+        gpu_data_out, odim, 1,
+        class_info, 2,
+        class_info + 1, 2);
+  }
+  // the launch is checked for both variants (previously only for the non stable one)
+  cudaError_t err = cudaGetLastError();
+  if(cudaSuccess != err){
+    printf("%s: n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+           kernel_name, n_blocks, n_threads, n_shared_bytes, odim);
+    Error(cudaGetErrorString(err));
+  }
+}
+
+// Cross-entropy gradient restricted to one sub-range per row.
+// For row r (M rows, distributed over the threads), the sub-range starts at
+// offsets[r * so] and contains sizes[r * ss] elements. Inside it, grad is set
+// to -x, then 1 is added at the position given by target[r * st]. The log of x
+// at the target position, summed over all rows, is written to *res.
+// sx0/sx1 and sg0/sg1 are the row and element strides of x and grad.
+// Elements of grad outside the sub-range are not touched.
+// Requires blockDim.x * sizeof(REAL) bytes of dynamic shared memory; the block
+// index is not used.
+__global__ void KernelBatchedSoftmCrossEntGradOffset(int M,
+    const REAL* x, const int sx0, const int sx1,
+    REAL* grad, const int sg0, const int sg1,
+    REAL* target, const int st,
+    int* offsets, const int so,
+    int* sizes, const int ss,
+    REAL* res)
+{
+  extern __shared__ REAL buf[];
+  REAL partial = 0.0f;
+  for (int row = threadIdx.x; row < M; row += blockDim.x) {
+    const int first = offsets[row * so];
+    const int count = sizes[row * ss];
+
+    // grad = -x on the whole sub-range
+    int j = 0;
+    while (j < count) {
+      const int col = first + j;
+      grad[row * sg0 + col * sg1] = - x[row * sx0 + col * sx1];
+      j++;
+    }
+
+    // position of the target, relative to the sub-range and then within the row
+    // (kept in unsigned arithmetic)
+    const unsigned int rel = (uint) target[row * st] - first;
+    const unsigned int tcol = first + rel;
+    grad[row * sg0 + tcol * sg1] += 1.0f;
+    partial += gpu_safelog(x[row * sx0 + tcol * sx1]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of the other threads to its own
+  if (threadIdx.x != 0)
+    return;
+  REAL total = partial;
+  for (int t = 1; t < blockDim.x; t++)
+    total += buf[t];
+  *res = total;
+}
+
+/**
+ * Launches KernelBatchedSoftmCrossEntGradOffset in a single block on the
+ * current stream. The sub-range of example i is described by
+ * class_info[2*i] (offset) and class_info[2*i+1] (size); the error value is
+ * stored at gpu_res. The call does not wait for the kernel to finish.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         row length of gpu_data_out and gpu_grad
+ * @param gpu_data_out device pointer to the outputs
+ * @param gpu_grad     device pointer receiving the gradient
+ * @param gpu_target   device pointer to the target indices (one per example)
+ * @param class_info   device pointer to (offset, size) pairs
+ * @param gpu_res      device pointer receiving the error value
+ */
+void Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad(const int bsize, const int odim,
+    REAL* gpu_data_out, REAL* gpu_grad, REAL* gpu_target, int* class_info, REAL* gpu_res)
+{
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  // one partial sum per thread
+  const int shmem_bytes = block_size * sizeof(REAL);
+  // offsets and sizes are interleaved in class_info, hence the stride of 2
+  int* const class_offsets = class_info;
+  int* const class_sizes = class_info + 1;
+  KernelBatchedSoftmCrossEntGradOffset<<<1, block_size, shmem_bytes, Gpu::curStream>>>(bsize,
+      gpu_data_out, odim, 1,
+      gpu_grad, odim, 1,
+      gpu_target, 1,
+      class_offsets, 2,
+      class_sizes, 2,
+      gpu_res);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad: %s", cudaGetErrorString(launch_status));
+  }
+}
+
+// Adds to grad_in, for every example i and input index j, the dot product of
+// a sub-range of grad_out with the matching sub-range of row j of weights.
+// The sub-range of example i starts at offsets[i * so] and contains
+// sizes[i * ss] elements; s*0 / s*1 are the row / element strides of the
+// corresponding array. grad_in is accumulated into (+=), not overwritten.
+// Requires blockDim.x * blockDim.y * sizeof(REAL) bytes of dynamic shared memory.
+__global__ void KernelLinGradInOffset(const int bsize, const int idim,
+                                      REAL* grad_out, const int sgo0, const int sgo1,
+                                      REAL* weights, const int sw0, const int sw1,
+                                      REAL* grad_in, const int sgi0, const int sgi1,
+                                      int* offsets, const int so,
+                                      int* sizes, const int ss)
+{
+  /*
+     Computes the a dot product (equivalent of gemv) on each row of grad_in,
+     using a different part of grad_out and weights each time (determined
+     from offsets and sizes).
+     Each row of grad_in (index i) corresponds to one blockIdx.y.
+     Columns of grad_in (lines of weights, index j) are split in groups
+     indexed by blockIdx.x. Each group has blockDim.y indices, each index
+     corresponds to a value of threadIdx.y.
+     For each (i, j), a scalar (vector-vector) dot product is computed, over
+     two vectors of length sizes[i], this sum is indexed by k. blockDim.x partial
+     sums are computed in parallel and stored in buf[threadIdx.y][threadIdx.x],
+     then a reduction steps computes the final dot product.
+     We use threadIdx.x as the fast-moving index to maximize coalesced memory
+     reads and writes.
+  */
+  extern __shared__ REAL buf[];
+  for (int i = blockIdx.y; i < bsize; i += gridDim.y) {
+    int offset = offsets[i * so];
+    int size = sizes[i * ss];
+
+    REAL* ograd_vec = grad_out + i * sgo0;
+    // part of buf[] holding the blockDim.x partial sums of this threadIdx.y
+    REAL* buf_y = buf + blockDim.x * threadIdx.y;
+    // NOTE(review): the trip count of this loop depends on threadIdx.y, while
+    // its body contains a __syncthreads(); when idim is not a multiple of the
+    // number of indices handled per pass, not all threads of the block reach
+    // the barrier - confirm this is safe on the targeted architectures.
+    for (int j = blockDim.y * blockIdx.x + threadIdx.y; j < idim; j += gridDim.x * blockDim.y) {
+      // Perform partially-summed dot product, stored in buf[]
+      REAL* w_vec = weights + j * sw0 + offset * sw1;
+      REAL dot = 0;
+      for (int k = threadIdx.x; k < size; k += blockDim.x) {
+        dot += ograd_vec[(offset + k) * sgo1] * w_vec[k * sw1];
+      }
+      buf_y[threadIdx.x] = dot;
+      __syncthreads();
+
+      // Perform the final summation into the first columns of buf[]
+      // and accumulate the final result in grad_in
+      // NOTE(review): the steps below start at a distance of 16, i.e. they
+      // cover 32 partial sums (the host wrapper uses blockDim.x == warpSize),
+      // and there is no synchronization between them; presumably this relies
+      // on the threads of a warp running in lockstep - confirm.
+      if (threadIdx.x < 16 && threadIdx.x + 16 < size)
+        buf_y[threadIdx.x] += buf_y[threadIdx.x + 16];
+      if (threadIdx.x <  8 && threadIdx.x +  8 < size)
+        buf_y[threadIdx.x] += buf_y[threadIdx.x + 8];
+      if (threadIdx.x <  4 && threadIdx.x +  4 < size)
+        buf_y[threadIdx.x] += buf_y[threadIdx.x + 4];
+      if (threadIdx.x <  2 && threadIdx.x +  2 < size)
+        buf_y[threadIdx.x] += buf_y[threadIdx.x + 2];
+      // the last step (distance 1) is folded into the accumulation
+      if (threadIdx.x == 0)
+        grad_in[i * sgi0 + j * sgi1] += buf_y[0] + buf_y[1];
+    }
+  }
+}
+
+/**
+ * Launches KernelLinGradInOffset on the current stream to accumulate into
+ * grad_in the gradient with respect to the input, using for every example
+ * only the sub-range of outputs described by class_info. The call does not
+ * wait for the kernel to finish.
+ *
+ * @param bsize      number of examples in the batch
+ * @param idim       input dimension (row length of grad_in)
+ * @param odim       output dimension (row length of grad_out and weights)
+ * @param grad_out   device pointer to the output gradients
+ * @param weights    device pointer to the weights
+ * @param grad_in    device pointer to the input gradients, accumulated into
+ * @param class_info device pointer to (offset, size) pairs, one per example
+ * @param max_size   not used by this function
+ */
+void Gpu::MachSoftmaxClassLinGradIn(const int bsize, const int idim, const int odim,
+                                  REAL* grad_out, REAL* weights, REAL* grad_in,
+                                  int* class_info, const int max_size)
+{
+  // block: one warp along x, as many rows along y as the device allows
+  const int warp = Gpu::curDevProps->warpSize;
+  const int rows_per_block = std::min(Gpu::curDevProps->maxThreadsPerBlock / warp, Gpu::curDevProps->maxThreadsDim[1]);
+  const dim3 block(warp, rows_per_block);
+
+  // grid: x covers the idim input indices (rounded up), y the examples
+  const int groups = idim / rows_per_block + (idim % rows_per_block ? 1 : 0);
+  const int grid_x = std::min(Gpu::curDevProps->maxGridSize[0], groups);
+  const int grid_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  const dim3 grid(grid_x, grid_y);
+
+  // one partial sum per thread
+  const int shmem_bytes = warp * rows_per_block * sizeof(REAL);
+
+  // offsets and sizes are interleaved in class_info, hence the stride of 2
+  KernelLinGradInOffset<<<grid, block, shmem_bytes, Gpu::curStream>>>(
+      bsize, idim,
+      grad_out, odim, 1,
+      weights, odim, 1,
+      grad_in, idim, 1,
+      class_info, 2,
+      class_info + 1, 2);
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess) {
+    ErrorN("Error in Gpu::MachSoftmaxClassLinGrad: %s", cudaGetErrorString(launch_status));
+  }
+}
+
+// Updates, for every example k, the sub-matrix of weights and the sub-vector
+// of bias selected by offsets[k * so] (first column) and sizes[k * ss]
+// (number of columns):
+//   weights[i][offset+j] += lrate * (input[k][i] * grad_out[k][offset+j]
+//                                    + wdecay * weights[i][offset+j])
+//   bias[offset+j]       += lrate * grad_out[k][offset+j]
+// s*0 / s*1 are the row / element strides of the corresponding array, sb the
+// element stride of bias. All updates are done with atomicAdd.
+__global__ void KernelLinGradUpdate(const int bsize, const int idim,
+                                    REAL* input, const int si0, const int si1,
+                                    REAL* grad_out, const int sg0, const int sg1,
+                                    REAL* weights, const int sw0, const int sw1,
+                                    REAL* bias, const int sb,
+                                    int* offsets, const int so,
+                                    int* sizes, const int ss,
+                                    const REAL lrate, const REAL wdecay)
+{
+  /*
+     Computes a series of rank-1 updates (equivalent of ger) on sub-matrices
+     of weights. Also performs updates on bias directly proportional to
+     the relevant sub-vectors of grad_out.
+     Each row of grad_out and of input (index k) corresponds to one blockIdx.y.
+     Rows of weights (columns of inputs, index i) split in groups indexed by
+     blockIdx.x. Each group has blockDim.y indices, each index corresponds to a
+     value of threadIdx.y.
+     Columns of weights and grad_out (index j) are iterated over with blockDim.x
+     parallel threads, indexed by threadIdx.x.
+
+     Using blockDim.x == 1 warp seems to maximize speed.
+
+     NOTE: Applying weight decay on the whole weight matrix would be too slow
+     (in the order of +50% execution time), so we apply it in this kernel,
+     only on the weights that were used for this minibatch.
+     Since there is no atomic multiplication primitive, the value of weights we
+     read before the update may have already been updated (by another example in
+     the same minibatch), or not. It should not make a large difference.
+  */
+
+
+  for (int k = blockIdx.y; k < bsize; k += gridDim.y) {
+    int offset = offsets[k * so];
+    int size = sizes[k * ss];
+    REAL* in_vec = input + k * si0;
+    REAL* grad_vec = grad_out + k * sg0 + offset * sg1;
+
+    for (int i = blockIdx.x * blockDim.y + threadIdx.y; i < idim; i += gridDim.x * blockDim.y) {
+      REAL* w_vec = weights + i * sw0 + offset * sw1;
+      for (int j = threadIdx.x; j < size; j += blockDim.x)
+      {
+         // The weight-decay term reads the same element that is updated below,
+         // i.e. with the sw1 stride (identical to the former w_vec[j] when
+         // sw1 == 1, which is what the host wrapper passes).
+         REAL update = lrate * (in_vec[i * si1] * grad_vec[j * sg1]
+         // TODO: if wdecay > 0, this "+" sign should probably be a "-",
+         // but this is the convention used in MachLin.cpp.
+                                + wdecay * w_vec[j * sw1]);
+         atomicAdd(w_vec + j * sw1, update);
+      }
+
+      // Block with i == 0 also updates the bias
+      if (i == 0)
+      {
+        for (int j = threadIdx.x; j < size; j += blockDim.x)
+          atomicAdd(bias + (offset + j) * sb, lrate * grad_vec[j * sg1]);
+      }
+    }
+  }
+}
+
+/**
+ * Launches KernelLinGradUpdate on the current stream to update the weights
+ * and biases of the output sub-range used by each example of the batch. The
+ * launch is not checked for errors and the call does not wait for the kernel.
+ *
+ * @param bsize      number of examples in the batch
+ * @param idim       input dimension (row length of input)
+ * @param odim       output dimension (row length of grad_out and weights)
+ * @param input      device pointer to the inputs
+ * @param grad_out   device pointer to the output gradients
+ * @param weights    device pointer to the weights, updated in place
+ * @param bias       device pointer to the biases, updated in place
+ * @param class_info device pointer to (offset, size) pairs, one per example
+ * @param max_size   not used by this function
+ * @param lrate      learning rate passed to the kernel
+ * @param wdecay     weight decay passed to the kernel
+ */
+void Gpu::MachSoftmaxClassLinGradUpdate(const int bsize, const int idim, const int odim,
+                                      REAL* input, REAL* grad_out,
+                                      REAL* weights, REAL* bias,
+                                      int* class_info, const int max_size,
+                                      const REAL lrate, const REAL wdecay)
+{
+  // block: one warp along x, as many rows along y as the device allows
+  const int warp = Gpu::curDevProps->warpSize;
+  const int rows_per_block = std::min(Gpu::curDevProps->maxThreadsPerBlock / warp, Gpu::curDevProps->maxThreadsDim[1]);
+  const dim3 block(warp, rows_per_block);
+
+  // grid: x covers the idim input indices (rounded up), y the examples
+  const int groups = idim / rows_per_block + (idim % rows_per_block ? 1 : 0);
+  const int grid_x = std::min(Gpu::curDevProps->maxGridSize[0], groups);
+  const int grid_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  const dim3 grid(grid_x, grid_y);
+
+  // the kernel uses no dynamic shared memory;
+  // offsets and sizes are interleaved in class_info, hence the stride of 2
+  KernelLinGradUpdate<<<grid, block, 0, Gpu::curStream>>>(
+      bsize, idim,
+      input, idim, 1,
+      grad_out, odim, 1,
+      weights, odim, 1,
+      bias, 1,
+      class_info, 2,
+      class_info + 1, 2,
+      lrate,
+      wdecay);
+}
+
+//-----------------------------------------------
+// Copy
+//-----------------------------------------------
+// Kernel: writes vec[0..N) into each of the M rows of mat (rows are N elements apart).
+__global__ void KernelCopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N) {
+  for (int row = blockIdx.x; row < M; row += gridDim.x) {
+    REAL *dst_row = mat + row * N;
+    for (int col = threadIdx.x; col < N; col += blockDim.x) dst_row[col] = vec[col];
+  }
+}
+
+/*
+ * Copies the N-element vector vec into each of the M rows of the matrix mat.
+ * Calls ErrorN() when the kernel launch reports a CUDA error.
+ */
+void Gpu::CopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N)
+{
+  debug4("Gpu::CopyVectorToMatrix(%p, %p %d %d)\n", mat, vec, M, N);
+  const int grid_size = std::min(M, Gpu::curDevProps->maxGridSize[0]);      // at most one block per row
+  const int block_size = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);   // at most one thread per column
+  KernelCopyVectorToMatrix<<<grid_size, block_size, 0, Gpu::curStream>>>(mat, vec, M, N);
+  const cudaError_t cuda_stat = cudaGetLastError();
+  if (cuda_stat == cudaSuccess) return;
+  ErrorN("CUDA: ERROR %d in Gpu::CopyVectorToMatrix(%p, %p %d %d): %s\n",
+         cuda_stat, mat, vec, M, N, cudaGetErrorString(cuda_stat));
+}
+
+// Kernel: copies a contiguous M x N matrix (src) into dst, whose rows are row_stride elements apart.
+__global__ void KernelCopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride) {
+  for (int row = blockIdx.x; row < M; row += gridDim.x) {
+    REAL *dst_row = dst + row * row_stride;
+    for (int col = threadIdx.x; col < N; col += blockDim.x) dst_row[col] = src[row * N + col];
+  }
+}
+
+// Kernel: copies src, whose rows are row_stride_src elements apart, into a contiguous M x N matrix (dst).
+__global__ void KernelCopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N,
+                                                const int row_stride_src) {
+  for (int row = blockIdx.x; row < M; row += gridDim.x) {
+    REAL *dst_row = dst + row * N;
+    for (int col = threadIdx.x; col < N; col += blockDim.x) dst_row[col] = src[row * row_stride_src + col];
+  }
+}
+
+/*
+ * Copies each row of a contiguous M x N matrix (src) into a matrix (dst) whose rows
+ * are row_stride elements apart. Calls ErrorN() when the launch reports a CUDA error.
+ */
+void Gpu::CopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  const int grid_size = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  const int block_size = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixToMatrixStrided<<<grid_size, block_size, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  const cudaError_t cuda_stat = cudaGetLastError();
+  if (cuda_stat == cudaSuccess) return;
+  ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixToMatrixStrided: %s\n",
+         cuda_stat, cudaGetErrorString(cuda_stat));
+}
+
+/*
+ * Copies each row of a matrix (src) whose rows are row_stride elements apart into a
+ * contiguous M x N matrix (dst). Calls ErrorN() when the launch reports a CUDA error.
+ */
+void Gpu::CopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  const int grid_size = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  const int block_size = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixStridedToMatrix<<<grid_size, block_size, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  const cudaError_t cuda_stat = cudaGetLastError();
+  if (cuda_stat == cudaSuccess) return;
+  ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixStridedToMatrix: %s\n",
+         cuda_stat, cudaGetErrorString(cuda_stat));
+}
+
+//-----------------------------------------------
+// Multiple AXPY input row on one output row
+//-----------------------------------------------
+
+// Each block handles a fixed set of columns for all batches.
+// This allows coalesced reads and does not need any atomic operation.
+__global__
+void KernelBatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                       REAL * y, const int incy, const int nb_batch){
+  const unsigned int step = blockDim.x*gridDim.x;
+  for(int col = blockIdx.x * blockDim.x + threadIdx.x; col < n; col += step){
+    REAL *y_elem = y + col * incy;	// the single output element owned by this thread for 'col'
+    for(int batch=0; batch<nb_batch; batch++)
+      *y_elem += a * x[batch * n * incx + col * incx];
+  }
+}
+
+// Launches KernelBatchedAXPY: y[i*incy] += a * x[b*n*incx + i*incx] for every batch b and 0 <= i < n.
+void Gpu::BatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                    REAL * y, const int incy, const int nb_batch){
+  if (n <= 0) return;  // nothing to add; nb_threads would be 0 and n/nb_threads a division by zero
+  const int nb_threads = std::min(128, n);
+  const int nb_blocks = std::min(Gpu::curDevProps->maxGridSize[0], n/nb_threads+(n%nb_threads==0?0:1));
+  KernelBatchedAXPY<<<nb_blocks,nb_threads, 0, Gpu::curStream>>>(n, a, x, incx, y, incy, nb_batch);
+}
+
+
+//-----------------------------------------------
+// Element-wise exponential
+//-----------------------------------------------
+__global__ void KernelElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  const unsigned int stride = blockDim.x * gridDim.x;  // number of threads in the whole grid
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride)
+    gpu_data_out[i] = exp(gpu_data_in[i]);
+}
+
+/* Performs gpu_data_out[i] = exp(gpu_data_in[i]) for 0 <= i < size.
+ * Returns without launching anything when size <= 0. */
+void Gpu::ElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return;  // nb_threads would be 0 and size/nb_threads a division by zero
+  const int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  const int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseExp<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+//-----------------------------------------------
+// Tanh and its gradient
+//-----------------------------------------------
+__global__ void KernelElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  const unsigned int stride = blockDim.x * gridDim.x;  // number of threads in the whole grid
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride)
+    gpu_data_out[i] = tanh(gpu_data_in[i]);
+}
+
+// Kernel: gpu_grad_in[i] = (1 - gpu_data_out[i]^2) * gpu_grad_out[i] for 0 <= i < size
+__global__ void KernelElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL *gpu_grad_out, REAL *gpu_grad_in) {
+  const unsigned int stride = blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride)
+    gpu_grad_in[i] = (1.0f - gpu_data_out[i] * gpu_data_out[i]) * gpu_grad_out[i];
+}
+
+/*
+ * Performs gpu_data_out[i] = tanh(gpu_data_in[i]) for 0 <= i < size (nothing is launched when size <= 0)
+ * where tanh(x) = sinh/cosh = (exp x - exp -x) / (exp x + exp -x) = (exp(2*x) - 1) / (exp(2*x) + 1)
+ */
+void Gpu::ElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return;  // nb_threads would be 0 and size/nb_threads a division by zero
+  const int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  const int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanh<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+/*
+ * Performs gpu_grad_in[i] = (1 - gpu_data_out[i]**2) * gpu_grad_out[i] for 0 <= i < size,
+ * which corresponds to the backpropagation of the gradient through tanh (nothing is launched when size <= 0).
+ */
+void Gpu::ElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL* gpu_grad_out, REAL *gpu_grad_in) {
+  if (size <= 0) return;  // nb_threads would be 0 and size/nb_threads a division by zero
+  const int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  const int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanhGrad<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_out, gpu_grad_out, gpu_grad_in);
+}
+
+/*
+ * Fill GPU memory with a value: adr[i] = val for 0 <= i < size. Unlike memset() on the CPU,
+ * this works on REAL elements and not on bytes.
+ */
+__global__ void KernelMemSet(const int size, REAL *adr, REAL val) {
+  const unsigned int stride = blockDim.x * gridDim.x;  // number of threads in the whole grid
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += stride)
+    adr[i] = val;
+}
+void Gpu::MemSet(REAL *adr, REAL val, int size) {  // launches KernelMemSet on Gpu::curStream: adr[i] = val for 0 <= i < size
+  if (size <= 0) return;  // nothing to set; nb_threads would be 0 and size/nb_threads a division by zero
+  const int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  const int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelMemSet<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, adr, val);
+}
+
+//-----------------------------------------------
+// Helpers
+//-----------------------------------------------
+
+void Gpu::ResSet(REAL val) {  // copies val from the host into the device variable gpu_result, enqueued on Gpu::curStream
+  cudaMemcpyAsync(gpu_result, &val, sizeof(REAL), cudaMemcpyHostToDevice, Gpu::curStream);
+}
+
+REAL Gpu::ResGet() {  // returns the device variable gpu_result; waits for Gpu::curStream before returning
+  REAL val;
+  cudaMemcpyAsync(&val, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);  // same stream as the sync below, not the default stream
+  cudaStreamSynchronize(Gpu::curStream);
+  return val;
+}
diff --git a/KENLM b/KENLM
deleted file mode 100644
index e69de29..0000000
diff --git a/NBest.cpp b/NBest.cpp
new file mode 100644
index 0000000..0b51e5a
--- /dev/null
+++ b/NBest.cpp
@@ -0,0 +1,585 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+
+#include "NBest.h"
+#include "Tools.h"
+
+#include <sstream>
+#include <algorithm>
+
+// Parses one line of an n-best list and, if it belongs to the current sentence, appends it to nbest.
+// Format: blocks separated by '|||':
+//  0: sentence id,  1: hypothesis,  2: feature functions,  3: global score,
+//  4: phrase alignments, e.g. 0-1=0-1 2-4=2-3 5=4
+// If aux_dim>0, aux_dim values are read from auxf for each line read from inpf.
+// Returns false at end of input or when a line with a new sentence id was read (it stays buffered
+// for the next call); returns true otherwise, also when the line was skipped.
+bool NBest::ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int n, const bool need_alignments, const int aux_dim)
+{
+  static string line; // used internally to buffer an input line
+  static int prev_id=-1; // used to detect a change of the n-best ID
+  int new_id;
+  vector<float> f;
+  vector<string> blocks;
+  static REAL* aux_data=NULL; // auxiliary data of the buffered line, kept between two calls
+  REAL AuxValue;
+  vector<REAL> aux_data_vec;
+
+  if (line.empty()) {
+    getline(inpf,line);
+    if (inpf.eof()) return false;
+    if (0 < aux_dim)
+    {
+      if (!auxf)  Error("Not enough auxiliary data available");
+      for (int i = 0 ; i<aux_dim ; i++)
+      {
+        auxf >> AuxValue;
+        aux_data_vec.push_back(AuxValue);
+        if (auxf.eof()) return false;
+      }
+    }
+  }
+  else {
+    if (aux_data)
+    {
+      for (int i = 0 ; i<aux_dim ; i++)
+      {
+        aux_data_vec.push_back(aux_data[i]);
+      }
+    }
+  }
+
+  debug1("NBest::ParseLine(): %s\n", line.c_str());
+    // split line into blocks
+    // positions are size_t so that they can be compared with string::npos on 64-bit systems
+  size_t pos=0, epos;
+    // (a 32-bit uint never equals string::npos there; the former "<100000" test limited the line length)
+  while ((epos=line.find(NBEST_DELIM,pos))!=string::npos) {
+    blocks.push_back(line.substr(pos,epos-pos));
+    //cerr << " block from " << pos << " to " << epos << " : " <<  blocks.back() << endl;
+    pos=epos+strlen(NBEST_DELIM);
+  }
+  blocks.push_back(line.substr(pos,line.size()));
+  // cerr << " block: " << blocks.back() << endl;
+
+  if (blocks.size()<4) {
+    cerr << "ERROR: can't parse the following line (skipped)" << endl << line << endl;
+    line.clear(); // force read of new line
+    return true;
+  }
+
+  if (need_alignments && blocks.size()<5) {
+    Error("alignments are needed when rescoring phrase-tables");
+  }
+
+    // parse ID
+  new_id=Scan<int>(blocks[0]);
+  if (prev_id>=0 && new_id!=prev_id) {
+      if (!aux_data) aux_data = new REAL[aux_dim];
+      int j=0;
+      for (vector<REAL>::iterator x = aux_data_vec.begin(); x != aux_data_vec.end(); x++) {
+         aux_data[j]= *x;
+         j++;
+      }
+      prev_id=new_id; return false;
+  } // new nbest list has started
+  prev_id=new_id;
+  id=new_id;
+  //cerr << "same ID " << id << endl;
+
+  if (n>0 && nbest.size() >= (uint) n) {
+    //cerr << "skipped" << endl;
+    line.clear();
+    return true; // skip parsing of unused hypos
+  }
+
+    // parse feature function scores
+  pos=blocks[2].find_first_not_of(' ');
+  while (pos<blocks[2].size()) {
+    epos=blocks[2].find(' ',pos);
+    if (epos==string::npos) epos=blocks[2].size(); // last feature without trailing space
+    string feat=blocks[2].substr(pos,epos-pos);
+    if (feat.find(":",0)!=string::npos || feat.find("=",0)!=string::npos) {
+      // skip feature names (old or new Moses style)
+      //cerr << "  name: " << feat << endl;
+    }
+    else { 
+      f.push_back(Scan<float>(feat));
+      //cerr << "  value: " << f.back() << endl;
+    }
+    pos=blocks[2].find_first_not_of(' ',epos+1);
+  }
+  //cerr << " FOUND " << f.size() << " features" << endl;
+
+#ifdef BOLT_NBEST
+  if (blocks.size()>4) { // copy all additional fields to the output
+    string extra_info;
+    for (size_t bb=4; bb<blocks.size(); bb++) {
+      extra_info.append(NBEST_DELIM);
+      extra_info.append(blocks[bb]);
+    }
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), extra_info, aux_data_vec, aux_dim) );
+  }
+  else {
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), aux_data_vec, aux_dim) );
+  }
+#else
+    // eventually parse segmentation
+  if (blocks.size()>4) {
+    vector<Align> a;
+    pos=blocks[4].find_first_not_of(' ');
+
+    debug1("parsing alignment in: %s\n", blocks[4].c_str());
+    blocks[4].append(" "); // simplifies parsing
+
+    // thanks to the appended space, find() succeeds for every alignment that starts at pos
+    while (pos<blocks[4].size() && (epos=blocks[4].find(" ",pos))!=string::npos)
+    {
+      string align_txt=blocks[4].substr(pos,epos-pos);
+
+      debug1(" parsing alignmnent %s:\n",align_txt.c_str());
+      uint tpos=align_txt.find('='); // uint: a failed find() is truncated to a value that is still > size()
+      if (tpos>align_txt.size()) {cerr << align_txt; Error("format error in alignment (no target phrase)"); }
+
+      uint pos2;
+      int sb,se,tb,te;
+      pos2=align_txt.rfind('-',tpos);
+      if (pos2>align_txt.size()) {
+        debug2(" src: pos %d-%d\n",0,tpos);
+        se=sb=Scan<int>(align_txt.substr(0,tpos));
+      }
+      else {
+        debug2(" sb: pos %d-%d\n",0,pos2);
+        sb=Scan<int>(align_txt.substr(0,pos2));
+        pos=pos2+1; pos2=align_txt.find('=',pos); // pos is only a scratch variable here, it is set again below
+        debug2(" se: pos %d-%d\n",(int) pos,pos2);
+        if (pos2>align_txt.size())  {cerr << align_txt; Error("format error in alignment (end of source phrase)"); }
+        se=Scan<int>(align_txt.substr(pos,pos2-pos));
+      }
+
+      tpos++;
+      pos2=align_txt.find('-',tpos);
+      if (pos2>align_txt.size()) {
+        debug1(" tgt: pos %d\n",tpos);
+        te=tb=Scan<int>(align_txt.substr(tpos));
+      }
+      else {
+        debug2(" tb: pos %d-%d\n",tpos,pos2);
+        tb=Scan<int>(align_txt.substr(tpos,pos2-tpos));
+        te=Scan<int>(align_txt.substr(pos2+1));
+      }
+
+      if (sb<0 || se<0 || tb<0 || te<0 || sb>se || tb>te)  {cerr << align_txt; Error("wrong numbers in alignment"); }
+      debug4(" result %d-%d = %d-%d\n", sb,se,tb,te);
+      a.push_back(Align(sb,se,tb,te));
+
+      pos=blocks[4].find_first_not_of(' ',epos+1);
+    }
+
+    debug1("found %d phrases\n",(int) a.size());
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), a, aux_data_vec, aux_dim) );
+  }
+  else {
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), aux_data_vec, aux_dim) );
+  }
+#endif
+
+  line.clear(); // force read of new line
+  return true;
+}
+
+
+// Allocates the request array (max_req entries) and calls ParseLine() until it returns false.
+NBest::NBest(inputfilestream &inpf, inputfilestream &auxf, const int n, const bool need_alignments, const int aux_dim) 
+  : max_req(262144), nreq(0), nb_diff_align(0)
+{
+  debug0("NBEST: constructor called\n");
+  areq = new AlignReq[max_req];
+  for (;;) if (!ParseLine(inpf, auxf, n, need_alignments, aux_dim)) break;
+}
+
+
+// Empties the hypothesis list and the source words, then releases the request array.
+NBest::~NBest()
+{
+  debug0("NBEST: destructor called\n");
+  nbest.clear();
+  srcw.clear();
+  delete [] areq;	// delete[] of a null pointer is a no-op, no test is needed
+}
+
+void NBest::Write(outputfilestream &outf, int n)
+{	// writes the first n hypotheses to outf; n<1 or n larger than the list selects all of them
+  const int limit = (n>=1 && (uint) n<=nbest.size()) ? n : (int) nbest.size();
+  for (int k=0; k<limit; ++k) nbest[k].Write(outf);
+}
+
+
+// Calls CalcGlobal(w) on every hypothesis of the list.
+void NBest::CalcGlobal(Weights &w)
+{
+  const size_t nhyp = nbest.size();
+  for (size_t h = 0; h < nhyp; ++h)
+    nbest[h].CalcGlobal(w);
+}
+
+
+void NBest::Sort() {	// sorts the hypotheses with sort(), i.e. in the order defined by operator< of Hypo
+  sort(nbest.begin(),nbest.end());
+}
+
+
+// Calls AddID(o) on every hypothesis of the list.
+void NBest::AddID(const int o)
+{
+  const size_t nhyp = nbest.size();
+  for (size_t h = 0; h < nhyp; ++h) nbest[h].AddID(o);
+}
+
+// Passes every hypothesis to lm.RescoreHyp() and then calls lm.FinishPending().
+void NBest::RescoreLM(NbestLM &lm, const int lm_pos)
+{
+  for (size_t h = 0; h < nbest.size(); ++h)
+    lm.RescoreHyp(nbest[h],lm_pos);
+  lm.FinishPending();
+}
+
+#undef OLD
+#ifdef OLD
+void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos) // old variant, only compiled when OLD is defined
+{
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  for (vector<Hypo>::iterator i = nbest.begin(); i != nbest.end(); i++) {
+     pt.RescoreHyp(*i,srcw,tm_pos); // no delayed requests here: each hypothesis is rescored directly by the phrase table
+  }
+}
+#else
+
+// Delayed phrase-table rescoring: one request per aligned phrase pair is cumulated in areq[] for BlockFinish().
+void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos)
+{
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  int nscores = pt.GetNscores();
+  debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos);
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+  vector<float> null_scores(nscores, 0.0);
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the features that will be modified in BlockFinish()
+      // we already append them here if requested
+    if (nscores>1) (*hi).SetFeature(null_scores, tm_pos);
+              else (*hi).SetFeature(0.0, tm_pos);
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      if ((*ali).tb<0 || (*ali).te>=(int) hi->trgw.size()) {	// same test as in the CSTM variant: trgw[w] below must stay in bounds
+        fprintf(stderr,"skipping line with targets out of bound in alignment %d-%d=%d-%d\n",(*ali).sb,(*ali).se,(*ali).tb,(*ali).te); continue;
+      }
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]);
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(pt,tm_pos);
+    }
+  }
+  BlockFinish(pt,tm_pos);
+}
+#endif
+
+void NBest::RescorePtableInv(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos)
+{ // NOTE(review): the first statement reports an error; the rest repeats the statements of RescorePtable() - confirm whether it is ever reached
+  Error("NBest::RescorePtableInv");
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  int nscores = pt.GetNscores();
+  debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos); // the message names RescorePtable(), not RescorePtableInv()
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+
+  vector<float> null_scores(nscores, 0.0);
+
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the features that will be modified in BlockFinish()
+      // we already append them here if requested
+    if (nscores>1) (*hi).SetFeature(null_scores, tm_pos);
+              else (*hi).SetFeature(0.0, tm_pos);
+    
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]); // tb/te are not checked against trgw.size() here
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(pt,tm_pos); // flush the requests when the array is full
+    }
+  }
+  BlockFinish(pt,tm_pos);
+}
+
+  // qsort()-style comparison of two AlignReq: orders by sb, then se, then number of
+  // target words, then the target words themselves; returns -1, 0 or 1
+int AlignReqComp(const void *v1, const void *v2)
+{
+  const AlignReq &lhs = *(const AlignReq*) v1;
+  const AlignReq &rhs = *(const AlignReq*) v2;
+
+  if (lhs.sb != rhs.sb) return (lhs.sb < rhs.sb) ? -1 : 1;
+  if (lhs.se != rhs.se) return (lhs.se < rhs.se) ? -1 : 1;
+
+  const size_t nwords = lhs.tgph.size();
+  if (nwords != rhs.tgph.size()) return (nwords < rhs.tgph.size()) ? -1 : 1;
+  for (size_t w=0; w<nwords; w++) {
+    const int c = lhs.tgph[w].compare(rhs.tgph[w]);
+    if (c != 0) return (c < 0) ? -1 : 1;
+  }
+  return 0; // both are equal
+}
+
+  // qsort()-style comparison on the source span only: orders by sb, then se;
+  // returns -1, 0 or 1
+int AlignReqCompSrc(const void *v1, const void *v2)
+{
+  const AlignReq *lhs = (const AlignReq*) v1;
+  const AlignReq *rhs = (const AlignReq*) v2;
+
+  if (lhs->sb != rhs->sb) return (lhs->sb < rhs->sb) ? -1 : 1;
+  if (lhs->se != rhs->se) return (lhs->se < rhs->se) ? -1 : 1;
+
+  return 0; // both are equal
+}
+  
+
+// Returns the log of the phrase-table score of the pair (source words sb..se of srcw, target phrase aq.tgph).
+// If logP_v is given, pt.GetProb() is called with it, all its values are replaced by their log
+// and the first one is returned.
+float NBest::GetAlignProb(PtableMosesPtree &pt, AlignReq &aq, const int tm_pos, vector<float> *logP_v) // TODO: param tm_pos is unused
+{
+  debug1("TGT: %s\n", aq.hyp->trg.c_str());
+  debug4("ALIGN %d-%d = %s-%s\n", aq.sb, aq.se, aq.tgph[0].c_str(), aq.tgph.back().c_str());
+
+  if (aq.se >= (int) srcw.size()) Error("phrase table rescoring: last source word in phrase is out of bounds\n");
+
+    // collect the words of the source phrase, TODO: switch to reference ?
+  vector<string> srcph;
+  for (int pos=aq.sb; pos<=aq.se; pos++) srcph.push_back(srcw[pos]);
+
+    // single score: no output vector was requested
+  if (!logP_v) return log(pt.GetProb(srcph,aq.tgph));
+
+    // several scores: converted to logs in place
+  pt.GetProb(srcph,aq.tgph,logP_v);
+  for (size_t s=0; s<logP_v->size(); s++) (*logP_v)[s] = log((*logP_v)[s]);
+  return (*logP_v)[0];
+}
+
+// Executes the requests cumulated in areq[0..nreq): after sorting, identical phrase pairs are adjacent, so
+// pt is queried once per distinct pair and the log score(s) are added to the hypothesis of each request.
+void NBest::BlockFinish(PtableMosesPtree &pt, int tm_pos)
+{
+  debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size());
+  if (nreq==0) return;
+
+  qsort(areq, nreq, sizeof(AlignReq), AlignReqComp);
+
+  int nscores = pt.GetNscores();
+  int cnt=1;	// number of different phrase pairs seen so far
+
+  if (tm_pos==0) tm_pos=areq[0].hyp->f.size()-nscores+1; // correct position in append mode
+  debug2("cumulating %d scores starting at position %d\n", nscores, tm_pos);
+
+    // request phrase probas for the first alignment
+  if (nscores>1) {
+    vector<float> logP_scores(nscores, 0.0);
+    debug4("request align 0: %d-%d %s-%s (several scores)\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str());
+    GetAlignProb(pt,areq[0],tm_pos, &logP_scores);
+    areq[0].hyp->AddFeature(logP_scores,tm_pos);
+    for (int n=1; n<nreq; n++) {
+      if (AlignReqComp(areq+n-1, areq+n) != 0) {
+          // new alignment pair -> calculate new logP
+        debug5("request align %d: %d-%d %s-%s\n", cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str());
+        GetAlignProb(pt,areq[n],tm_pos, &logP_scores);
+        cnt++;
+      }
+      areq[n].hyp->AddFeature(logP_scores,tm_pos);	// cumulate
+    }
+  }
+  else {
+    debug4("request align 0: %d-%d %s-%s\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str());
+    float logP = GetAlignProb(pt,areq[0],tm_pos);
+    areq[0].hyp->AddFeature(logP,tm_pos);
+    for (int n=1; n<nreq; n++) {
+      if (AlignReqComp(areq+n-1, areq+n) != 0) {
+          // new alignment pair -> calculate new logP
+        debug5("request align %d: %d-%d %s-%s\n", cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str());
+        logP = GetAlignProb(pt,areq[n],tm_pos);
+        cnt++;
+      }
+      areq[n].hyp->AddFeature(logP,tm_pos);	// cumulate
+    }
+  }
+
+  debug1(" %d different alignments\n", cnt);
+  nb_diff_align += cnt;
+    // all requests are done: empty the array, else they would be scored again and areq[] would overflow after max_req
+  for (int n=0; n<nreq; n++) areq[n].tgph.clear();
+  nreq=0;
+}
+
+// Returns the sum of NbPhrases() over all hypotheses of this n-best list.
+int NBest::NbPhrases()
+{
+  int total = 0;
+  const size_t nhyp = nbest.size();
+  for (size_t h = 0; h < nhyp; ++h) total += nbest[h].NbPhrases();
+
+  return total;
+}
+
+//**********************************************************
+//
+// caching algorithm for TM rescoring with CSTM
+//
+//**********************************************************
+
+
+// Delayed rescoring with a CSTM: requests are collected as for the Moses phrase table, we just call a different BlockFinish
+void NBest::RescorePtable(NbestCSTM &cstm, ifstream &srcf, const int tm_pos)
+{
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  debug1("NBest::RescorePtable(): CSTM score at position %d\n", tm_pos);
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the feature that will be modified in BlockFinish()
+      // we already append it here if requested
+    (*hi).SetFeature(0.0, tm_pos);
+    
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    int nw=(int) hi->trgw.size(); // number of target words, used to validate the alignment below
+    debug2("CSTM token target: %s  %d words\n", hi->trg.c_str(), nw);
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      debug5("CSTM process areq %d, src: %d-%d, tgt: %d-%d\n",nreq,(*ali).sb,(*ali).se,(*ali).tb,(*ali).te);
+      if ((*ali).tb<0 || (*ali).tb>=nw || ((*ali).te<0 || (*ali).te>=nw)) {
+        fprintf(stderr,"skipping line with targets out of bound in alignment %d-%d=%d-%d\n",(*ali).sb,(*ali).se,(*ali).tb,(*ali).te);
+        continue; // nreq is not incremented: this slot is reused by the next alignment
+      }
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]);
+      cstm.LookupTarget(areq[nreq].tgph, areq[nreq].tgwid); // TODO: this is inefficient, the same target will appear many times; NOTE(review): tgwid has 16 entries (AlignReq.h), confirm that LookupTarget() respects this
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(cstm,tm_pos); // flush the requests when the array is full
+    }
+  }
+  BlockFinish(cstm,tm_pos);
+}
+
+// inverse rescoring with a CSTM: no rescoring code here, the function only reports an error
+void NBest::RescorePtableInv(NbestCSTM &cstm, ifstream &srcf, const int tm_pos)
+{
+  Error("NBest::RescorePtableInv()");
+}
+
+// Executes the requests cumulated in areq[0..nreq) with the CSTM: after sorting, each different source phrase
+// gets one slot (bs) in the forward bunch; cstm.trainer->ForwAndCollect() is called per bunch of at most bsize slots.
+void NBest::BlockFinish(NbestCSTM &cstm, int tm_pos)
+{
+  debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size());
+  if (nreq==0) return;
+  int bsize=cstm.mach->GetBsize();
+
+  qsort(areq, nreq, sizeof(AlignReq), AlignReqComp);
+
+  if (tm_pos==0) tm_pos=areq[0].hyp->f.size(); // correct position in append mode
+  debug1("cumulating 1 score starting at position %d\n", tm_pos);
+
+  vector<string> srcph;				// one source phrase
+  vector< vector<string> > src_phrases;		// all possible source phrase in this block, size
+    // process first phrase pair
+  areq[0].bs=0;
+  cstm.AddToInput(0,srcw,areq[0].sb,areq[0].se);
+  srcph.clear();
+  for (int w=areq[0].sb; w<=areq[0].se; w++) srcph.push_back(srcw[w]);
+  src_phrases.push_back(srcph);
+
+  int cnt=1;
+  int req_beg=0;	// start of current CSLM block in large request array
+  int bs=0;             // current block index in forward bunch
+
+  for (int n=1; n<nreq; n++) {
+    if (AlignReqCompSrc(areq+n-1, areq+n) != 0) { // new source phrase 
+        // first process bunch if full
+      bs++;
+      debug1("   %d new context\n", bs);
+      if (bs >= bsize) {
+        cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,n-1,bs,tm_pos);
+        bs=0; req_beg=n;
+      }
+          // add new source phrase to bunch for forward pass
+          // REMARK: this is not perfect since some of the examples may be out of slist and we actually wouldn't
+          //         need a forward pass for them. However, all request of an n-best block must be performed before
+          //         we go to the next n-best block, In practice there are often less than 128 difference source phrases.
+          //         Therefore, we only do one forward pass anyway
+      areq[n].bs=bs;
+      cstm.AddToInput(bs,srcw,areq[n].sb,areq[n].se);
+      srcph.clear();
+      for (int w=areq[n].sb; w<=areq[n].se; w++) srcph.push_back(srcw[w]);
+      src_phrases.push_back(srcph);
+      cnt++;
+    }
+    else
+      areq[n].bs=bs;
+  }
+  cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,nreq-1,bs+1,tm_pos);
+    // all requests are done: empty the array, else they would be scored again and areq[] would overflow after max_req
+  for (int n=0; n<nreq; n++) areq[n].tgph.clear();
+  nreq=0;
+  printf(" %d different source phrases\n", cnt);
+  nb_diff_align += cnt;
+}
+
diff --git a/NBest.h b/NBest.h
new file mode 100644
index 0000000..51c4628
--- /dev/null
+++ b/NBest.h
@@ -0,0 +1,73 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+#ifndef _NBEST_H_
+#define _NBEST_H_
+
+using namespace std;
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "Toolsgz.h"
+#include "Hypo.h"
+#include "NbestLM.h"
+#include "NbestCSTM.h"
+#include "PtableMosesPtree.h"
+
+#include "AlignReq.h"
+
+class NBest {
+  int 		   id;		// sentence id of this n-best list (set by ParseLine)
+  vector<string>   srcw;	// source sentence parsed into words (only available for TM rescoring)
+  vector<Hypo> nbest;		// the hypotheses of this n-best list
+  bool ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int, const bool, const int); // parses one line, returns false when the list is complete
+    // Delayed translation model rescoring
+  int max_req;			// max number of request cumulated before we perform them in a block
+  int nreq;			// current number of request cumulated
+  AlignReq *areq;		// array to allocate all requests (max_req entries, allocated in the constructor)
+  int nb_diff_align;		// stats: cumulated by the BlockFinish() functions
+ public:
+  NBest(inputfilestream&, inputfilestream& , const int=0, const bool =false , const int=0); // reads one n-best list with ParseLine()
+  ~NBest();
+  int NbNBest() {return nbest.size(); }		// number of hypotheses
+  int NbPhrases();				// sum of NbPhrases() of all hypotheses
+  int NbDiffPhrases() {return nb_diff_align; }
+  void CalcGlobal(Weights&);
+  void Sort(); // largest values first
+  void Write(outputfilestream&, int=0);		// writes the first n hypotheses, all of them by default
+  void AddID(const int offs);
+  void RescoreLM(NbestLM&, const int); // recalc LM score on hypothesis (uses optional auxiliary data)
+    // Delayed translation model rescoring with on disk phrase table
+  void RescorePtable(PtableMosesPtree&, ifstream&, const int);
+  void RescorePtableInv(PtableMosesPtree&, ifstream&, const int);
+  void BlockFinish(PtableMosesPtree&, int);
+  REAL GetAlignProb(PtableMosesPtree&, AlignReq&, const int, vector<float>* = NULL); // NOTE(review): defined with return type float in NBest.cpp - presumably REAL is float, confirm
+    // Delayed translation model rescoring with CSTM
+  void RescorePtable(NbestCSTM&, ifstream&, const int);
+  void RescorePtableInv(NbestCSTM&, ifstream&, const int);	// not implemented, only reports an error
+  void BlockFinish(NbestCSTM&, int);
+  void ForwAndCollect(int, int, int);	// NOTE(review): no definition of this member appears in NBest.cpp - confirm that it is still needed
+};
+
+
+#endif
diff --git a/NbestCSTM.cpp b/NbestCSTM.cpp
new file mode 100644
index 0000000..d44dc8b
--- /dev/null
+++ b/NbestCSTM.cpp
@@ -0,0 +1,123 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+using namespace std;
+
+#include <string>
+
+#include "Tools.h"
+#include "Hypo.h"
+#include "NbestCSTM.h"
+#include "ErrFctSoftmCrossEntNgramMulti.h"
+
+
+NbestCSTM::~NbestCSTM() {
+  delete mach;		// deleting a NULL pointer is a no-op, so no test is needed
+  delete trainer;	// same order as before: the machine first, then the trainer
+}
+
+
+void NbestCSTM::Read(char *fname, char *wl_src_fname, char *wl_tgt_fname, char *pt_fname, int nscores, char *scores_specif)
+{
+  // load the machine from its binary file and display its description
+  ifstream mach_file(fname, ios::binary);
+  CHECK_FILE(mach_file,fname);
+  mach = Mach::Read(mach_file);
+  mach_file.close();
+  mach->Info();
+
+  // source vocabulary: the order of the words must be exactly the same than in extract2bin !!!
+  cout << " - reading source word list from file " << wl_src_fname << flush;
+  src_wlist.SetSortBehavior(stable_sort);
+  src_wlist.Read(wl_src_fname);
+  cout << ", got " << src_wlist.GetSize() << " words" << endl;
+
+  // target vocabulary: the order of the words must be exactly the same than in extract2bin !!!
+  cout << " - reading target word list from file " << wl_tgt_fname << flush;
+  tgt_wlist.SetSortBehavior(stable_sort);
+  tgt_wlist.Read(wl_tgt_fname);
+  cout << ", got " << tgt_wlist.GetSize() << " words" << endl;
+
+  // the trainer gets the machine, both word lists and the description of the optional external phrase table
+  trainer = new TrainerPhraseSlist(mach, &src_wlist, &tgt_wlist, pt_fname, nscores, scores_specif);
+}
+
+// Copy the source words vsrcw[sb..se] (both bounds inclusive) as word indices into row b of the
+// trainer's input buffer. Unknown words get the index of WordList::WordUnknown, the unused end
+// of the row (up to the input dimension of the machine) is filled with NULL_WORD.
+void NbestCSTM::AddToInput(int b, vector<string> &vsrcw, int sb, int se)
+{
+  int idim=mach->GetIdim();
+  int slen=se-sb+1;	// phrase length; the former test used sb-se+1 which is <= 1 whenever sb<=se, so it never triggered
+  if (slen > idim) { ErrorN("NbestCSTM::AddToInput(): source phrase too long (%d) for machine (%d)\n", slen, idim); }
+  REAL *iptr=trainer->GetBufInput() + b*idim;	// first element of row b
+  int i=0;					// number of elements written so far
+
+  // get index of each source word
+  debug0("NbestCSTM::AddToInput():");
+  REAL unk_wi = (REAL) src_wlist.GetIndex(WordList::WordUnknown);
+  for (int w=sb; w<=se; w++) {
+    WordList::WordIndex wi = src_wlist.GetIndex(vsrcw[w].c_str());
+    if (wi==WordList::BadIndex) {
+      fprintf(stderr, "ERROR: source word not found: %s\n", vsrcw[w].c_str());
+      *iptr++ = unk_wi;
+    }
+    else 
+      *iptr++ = (REAL) wi;
+    debug2(" %s->%f", vsrcw[w].c_str(), iptr[-1]);
+    i++;
+  }
+  debug0("\n");
+  // fill up input phrase to the dimension of the machine
+  for (; i<idim; i++) *iptr++=NULL_WORD;
+}
+ 
+// Map the words of the target phrase vtrgw to indices of the target word list and store them in wid[].
+// wid[] must provide room for trainer->GetTgtNbPhr() entries, positions after the phrase are set to NULL_WORD.
+void NbestCSTM::LookupTarget(vector<string> &vtrgw, WordID *wid)
+{
+  int nph=trainer->GetTgtNbPhr();
+  int vdim=(int) vtrgw.size();
+  if (vdim>nph) {	// the message now names this function (it announced "MapTarget()")
+    ErrorN("NbestCSTM::LookupTarget(): phrase (%d) exceeds length of machine (%d)\n",vdim, nph);
+  }
+
+  int i;
+  debug0("NbestCSTM::LookupTarget():");
+  for (i=0; i<vdim; i++) {
+    WordList::WordIndex wi = tgt_wlist.GetIndex(vtrgw[i].c_str());
+    if (wi==WordList::BadIndex) {
+      //ErrorN("ERROR: target word not found: %s\n", vtrgw[i].c_str());
+      // TODO: count these events
+      // this has as effect that the word won't be processed by the CSTM (out of short list)
+      // maybe the external phrase table knows it?
+      wid[i]=trainer->GetSlistLen();
+    }
+    else
+      wid[i] = (WordID) wi;
+    debug2(" %s->%d", vtrgw[i].c_str(), wid[i]);
+  }
+  debug0("\n");
+
+  // fill up
+  for (; i<nph; i++) wid[i] = NULL_WORD;
+}
diff --git a/NbestCSTM.h b/NbestCSTM.h
new file mode 100644
index 0000000..8ebf495
--- /dev/null
+++ b/NbestCSTM.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+
+#ifndef _NBESTCSTM_H_
+#define _NBESTCSTM_H_
+
+using namespace std;
+
+#include "Mach.h" // from the CSTM toolkit
+#include "TrainerPhraseSlist.h" 
+#include "WordList.h"
+
+class NbestCSTM {	// front-end used for n-best rescoring with a CSTM: holds the machine, its trainer and both word lists
+private:
+  WordList src_wlist;	// source vocabulary, filled by Read()
+  WordList tgt_wlist;	// target vocabulary, filled by Read()
+  Mach *mach;		// machine loaded by Read(), deleted by the destructor
+  TrainerPhraseSlist *trainer;	// created by Read(), deleted by the destructor
+  bool stable_sort;	// use stable sort (default=true), set to false for compatibility with CSLM <= V3.0
+public:
+  NbestCSTM() : src_wlist(true), tgt_wlist(true), mach(NULL), trainer(NULL), stable_sort(true) {}
+  virtual ~NbestCSTM();
+  virtual void SetSortBehavior(bool val) {stable_sort=val;}	// the value is passed to both word lists by Read()
+  virtual void Read (char*, char*, char* , char*, int, char*);	// machine file, source and target word list, phrase table, nb of scores, specification of the scores
+  virtual void AddToInput(int, vector<string> &, int, int);	// row of the input buffer, source words, first and last word of the phrase
+  virtual void LookupTarget(vector<string> &v, WordID *);	// target phrase -> word indices
+  virtual void Stats() {trainer->BlockStats();}			// dereferences trainer, which is NULL until Read() was called
+  friend class NBest;
+};
+
+#endif
diff --git a/Ptable.h b/Ptable.h
new file mode 100644
index 0000000..6b518f7
--- /dev/null
+++ b/Ptable.h
@@ -0,0 +1,49 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _Ptable_h
+#define _Ptable_h
+
+using namespace std;
+
+#include <string>
+#include <vector>
+#include "Tools.h"		// for type REAL
+//#include "DataNgramBin.h"	// for type WordID
+
+// interface class to classical phrase tables
+//
+//
+
+#define NULL_LN_PROB (1.0)   // this value must not be possible as a normal return value of ln Prob
+
+class Ptable {	// all member functions have an empty default implementation
+ private:
+ public:
+  Ptable(const int, const int=2, const bool=false) {};				// initialize (the arguments are ignored here)
+  virtual ~Ptable() {};
+  virtual void Read(const string &) {};						// read from file (does nothing here)
+  virtual REAL GetProb(vector<string>&, vector<string>&) {return 0;}		// presumably the probability of a source/target phrase pair; always 0 here
+  //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} 
+};
+
+#endif
diff --git a/PtableMosesPtree.cpp b/PtableMosesPtree.cpp
new file mode 100644
index 0000000..40efd99
--- /dev/null
+++ b/PtableMosesPtree.cpp
@@ -0,0 +1,194 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#include "PtableMosesPtree.h"
+
+
+PtableMosesPtree::~PtableMosesPtree ()
+{
+    // release the data of each phrase table, then destroy the object itself: Read() allocates it with new (it was never deleted)
+  for (vector<Moses::PhraseDictionaryTree*>::iterator p=ptree.begin(); p!=ptree.end(); ++p) { (*p)->FreeMemory(); delete *p; }
+}
+
+//
+// read a new phrase table
+//
+// Add the Moses binary phrase table "fname", announced to contain p_nscores scores, to the list of phrase tables.
+// scores_specif has the form "N:P": N (1..4) scores will be returned, starting at position P (one digit).
+void PtableMosesPtree::Read(const string &fname, const int p_nscores, const char *scores_specif)
+{
+    // the position digit is read below, so three characters are needed (a 2-character string like "1:" was accepted before)
+  if (strlen(scores_specif)<3 || scores_specif[1]!=':' || scores_specif[2]<'0' || scores_specif[2]>'9')
+    Error("format error in the specification of the TM scores");
+  if (scores_specif[0]<'1' || scores_specif[0]>'4')
+    Error("wrong value for the number of TM scores");
+
+  if (ptree.size()==0)
+    nscores=scores_specif[0]-'0';
+  else if (nscores!=scores_specif[0]-'0')
+    Error("PtableMosesPtree::Read(): inconsistent number of scores to be returned from multiple phrase tables");
+  if (nscores > p_nscores)
+    Error("PtableMosesPtree::Read(): the number of scores to be returned exceeds the number of available ones");
+
+  ptree.push_back(new Moses::PhraseDictionaryTree);
+  pos_scores.push_back(scores_specif[2]-'0');
+  ptree.back()->NeedAlignmentInfo(false);
+  cout << " - loading Moses binary phrase table from file " << fname << " with " << p_nscores << " scores" << endl;
+  ptree.back()->Read(fname);
+  cout << "   using " << nscores << " scores starting at position " << pos_scores.back() << endl;
+  tgtcands.clear();
+}
+
+
+//
+// Get probabilities from the phrase-tables
+//  - scores=NULL:	return either one value as a function result
+//  - scores!=NULL:	return a sequence of values in that vector (as many as the vector has space)
+//
+
+REAL PtableMosesPtree::GetProb(vector<string> &src, vector<string> &tgt, vector<float> *scores)
+{
+  uint k;	// word index, used by the debug output and by the comparison of the phrases
+
+#ifdef DEBUG
+  cout << "Ptable prob:";
+  for (k=0; k<src.size(); k++) cout << " " << src[k];
+  cout << " |||";
+  for (k=0; k<tgt.size(); k++) cout << " " << tgt[k];
+  cout << " ||| " << endl;
+#endif
+
+    // the optional result vector must have between 1 and nscores elements
+  if (scores) {
+    if (scores->size() == 0)
+      Error("PtableMosesPtree::GetProb() parameter scores has zero dimension");
+    if ((int) scores->size() > nscores)
+      Error("PtableMosesPtree::GetProb() requesting too much scores form the phrase table");
+  }
+
+    // the phrase tables are searched in the order they were read, the first one which knows the pair wins
+  for (uint pt=0; pt<ptree.size(); pt++) {
+
+      // all the target phrases (with scores) stored in this table for the source phrase
+    tgtcands.clear();
+    ptree[pt]->GetTargetCandidates(src, tgtcands);
+    debug2(" - phrase table %u has %d candidates:\n", pt, (int) tgtcands.size());
+    size_t first=pos_scores[pt];	// position of the first score to return
+
+      // look for a candidate which is word by word identical to our target phrase
+    for (uint c=0; c<tgtcands.size(); c++) {
+      Moses::StringTgtCand &cand=tgtcands[c];
+      //debug2(" - candidate %d, length %d\n", c, (int) cand.tokens.size());
+      if (tgt.size() != cand.tokens.size()) continue;
+
+        // skip the common prefix, it covers the whole phrase in case of a match
+      k=0;
+      while (k<tgt.size() && tgt[k].compare(*(cand.tokens[k])) == 0) k++;
+      if (k<tgt.size()) continue;
+
+      debug5("     found phrase of length %u/%u at pos %d out of %d, p=%f\n", (uint) src.size(), (uint) tgt.size(), c, (int) tgtcands.size(), cand.scores[first]);
+      if (scores) {
+        for (uint s=0; s<scores->size(); s++) {
+          (*scores)[s]=cand.scores[first+s]; // return sequence of scores
+          debug2(" score[%u]: %f\n",s, (*scores)[s]);
+        }
+      }
+      return cand.scores[first];
+    }
+  }
+
+    // The phrase pair is unknown to all the phrase tables.
+    // Is it an unknown word which was copied from the source to the target ?
+  bool unk_copied = (src.size()==1 && tgt.size()==1 && src[0]==tgt[0]);
+  if (unk_copied) {
+    debug0("     UNK: source copied to target\n");
+    if (scores) scores->assign(scores->size(), PROBA_COPY_UNK);	// same value for the whole sequence of scores
+    return PROBA_COPY_UNK;
+  }
+
+#ifdef DEBUG
+  cout << "ERROR: can't find the following phrase pair in the external phrase tables: SETTING PROBA TO " << PROBA_NOT_IN_PTABLE << endl;
+  for (k=0; k<src.size(); k++) cout << " " << src[k];
+  cout << " |||";
+  for (k=0; k<tgt.size(); k++) cout << " " << tgt[k];
+  cout << " ||| " << endl;
+#endif
+
+    // default value for the whole sequence of scores
+  if (scores) scores->assign(scores->size(), PROBA_NOT_IN_PTABLE);
+  return PROBA_NOT_IN_PTABLE;
+}
+
+/*
+void PtableMosesPtree::BlockEval (Hypo &hyp, vector<string> &srcw, const int pos)
+{
+}
+*/
+
+void PtableMosesPtree::RescoreHyp (Hypo &hyp, vector<string> &srcw, const int pos)
+{
+  debug1("TGT: %s\n", hyp.trg.c_str());
+  vector<string> trgw = Moses::Tokenize<std::string>(hyp.trg);
+
+  int nws=srcw.size(), nwt=trgw.size();
+  debug3("Ptable rescoring with %d source and %d target words, %d phrases\n", nws, nwt, (int) hyp.a.size());
+  vector<string> srcph, trgph;  // words of the phrase pair of the current alignment
+  vector<Align>::iterator al;
+
+  if (nscores<=1) {
+      // one score per phrase pair: sum up the log of the value returned by GetProb()
+    REAL logP=0;
+    for (al=hyp.a.begin(); al!=hyp.a.end(); al++) {
+      if (al->se>=nws) Error("phrase table rescoring: last source word in phrase out of bounds\n");
+      if (al->te>=nwt) Error("phrase table rescoring: last target word in phrase out of bounds\n");
+
+      debug4("ALIGN %d-%d = %d-%d\n", al->sb, al->se, al->tb, al->te);
+      srcph.clear();
+      for (int w=al->sb; w<=al->se; w++) srcph.push_back(srcw[w]);
+      trgph.clear();
+      for (int w=al->tb; w<=al->te; w++) trgph.push_back(trgw[w]);
+
+      logP+=log(GetProb(srcph,trgph)); // TODO: this is very inefficient, we should group together request for the same source phrase
+    }
+    hyp.SetFeature(logP,pos);
+  }
+  else {
+      // several scores per phrase pair: GetProb() stores them in "res", each score has its own sum in "logP"
+    vector<float> res(nscores,0.0);
+    vector<float> logP(nscores,0.0);
+
+    for (al=hyp.a.begin(); al!=hyp.a.end(); al++) {
+      if (al->se>=nws) Error("phrase table rescoring: last source word in phrase out of bounds\n");
+      if (al->te>=nwt) Error("phrase table rescoring: last target word in phrase out of bounds\n");
+
+      debug4("ALIGN %d-%d = %d-%d\n", al->sb, al->se, al->tb, al->te);
+      srcph.clear();
+      for (int w=al->sb; w<=al->se; w++) srcph.push_back(srcw[w]);
+      trgph.clear();
+      for (int w=al->tb; w<=al->te; w++) trgph.push_back(trgw[w]);
+
+      GetProb(srcph,trgph,&res); // TODO: this is very inefficient, we should group together request for the same source phrase
+      for (int i=0; i<nscores; i++) logP[i] += log(res[i]);
+    }
+    hyp.SetFeature(logP,pos);
+  }
+}
diff --git a/PtableMosesPtree.h b/PtableMosesPtree.h
new file mode 100644
index 0000000..53f0632
--- /dev/null
+++ b/PtableMosesPtree.h
@@ -0,0 +1,77 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _PtableMosesPtree_h
+#define _PtableMosesPtree_h
+
+using namespace std;
+
+#include "Ptable.h"
+#include "Hypo.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+// from Moses:
+#include <TranslationModel/PhraseDictionaryTree.h>
+#include <Util.h>
+
+
+// interface class to Moses binary on-disk phrase tables
+// (implementation with a prefix tree)
+
+const REAL PROBA_COPY_UNK (1);	// translation probability when an unknown word is copied from source to target (its log is 0)
+const REAL PROBA_NOT_IN_PTABLE (1e-20);	// translation probability when a phrase pair is not found in any of the Moses phrase tables
+					// this can happen when some words are mapped to <unk> because of limited source or target vocabularies
+
+//
+// helper class to store and compare Phrase requests
+// ugly C-style structure, but this seems to be more efficient
+
+/*
+struct PhraseReq {
+  Align	a;
+  vector<string>  &trgw;
+  int cnt;
+  REAL *res_ptr;
+};
+*/
+
+class PtableMosesPtree {	// access to one or several Moses binary phrase tables, all added with Read()
+ private:
+   vector<Moses::PhraseDictionaryTree*> ptree;	// main and eventually secondary phrase tables
+   vector<int> pos_scores;			// starting position of the scores to be returned from each phrase table
+   int nscores;					// number of scores to be returned (must be same for all phrase-tables)
+   vector<Moses::StringTgtCand> tgtcands;	// work space of GetProb(): target candidates of the current source phrase
+ public:
+  PtableMosesPtree() : nscores(0) {};	// nscores was left uninitialized before, GetNscores() returned an arbitrary value until the first Read()
+  virtual ~PtableMosesPtree();
+  virtual void Read(const string &, const int, const char*);		// read next phrase table from file
+  virtual REAL GetProb(vector<string>&, vector<string>&, vector<float> * =NULL);		// return one proba for a tokenized phrase-pair or vector of scores
+  //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} 
+  virtual void RescoreHyp (Hypo&, vector<string> &, const int);	// hypothesis, source words, position of the feature set in the hypothesis
+  virtual int GetNscores() {return nscores; }	// 0 as long as no phrase table was read
+};
+
+#endif
diff --git a/TrainerPhraseSlist.cpp b/TrainerPhraseSlist.cpp
new file mode 100644
index 0000000..88010f9
--- /dev/null
+++ b/TrainerPhraseSlist.cpp
@@ -0,0 +1,1164 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ */
+
+using namespace std;
+#include <iostream>
+#include <algorithm>
+#include <unistd.h>
+#include <time.h>
+
+#include "Tools.h"
+#include "Mach.h"
+#include "MachTab.h"
+#include "MachPar.h"
+#include "MachSeq.h"
+#include "MachSplit.h"
+#include "TrainerPhraseSlist.h"
+#include "ErrFctSoftmCrossEntNgram.h"
+
+#include "NBest.h" 
+#include "sort.cpp" 
+
+// activate mapping of input
+// not really necessary, may only speed up calculations due to cache locality
+// if you activate this option, you must do so for all your networks
+#undef TRAINER_PHASE_SLIST_MAP_INPUT
+
+void TrainerPhraseSlist::DoConstructorWork()	// set-up shared by the three constructors: buffers, checks of the architecture, word lists
+{
+  idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize();
+    // allocate the work buffers (gpu_input and host_output exist in the CUDA version only)
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer");
+  host_output = new REAL[odim*bsize];
+#endif
+  buf_target_wid = new WordID[odim*bsize];	// TODO: those are actually too big, we need tg_nbphr*bsize ??
+  buf_target_ext = new WordID[odim*bsize];
+  buf_target_in_blocks = new REAL[odim*bsize];
+
+    // set up vector to outputs of the target phrases
+  if (mach->GetMType() != file_header_mtype_mseq)
+    Error("CSTM: sequential machine needed\n");
+  MachSeq *mseq=(MachSeq*) mach;	// the cast is covered by the test of the machine type above
+  if (mseq->MachGetNb()<2)
+    Error("CSTM: the number of machines is suspiciously small");
+
+    // check input layer
+  if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar)
+    Error("CSTM: the input layer has the wrong architecture\n");
+  MachPar *mpar = (MachPar*) mseq->MachGet(0);
+  if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab)
+    Error("CSTM: the input layer has the wrong architecture\n");
+  MachTab *mtab = (MachTab*) mpar->MachGet(0);
+  max_inp_idx = mtab->GetMaxInpVal();	// compared below with the size of the source word list
+
+    // check output layer
+  if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit)
+    Error("CSTM: the output layer has the wrong architecture\n");
+  MachSplit *msp = (MachSplit*) mseq->MachGet(mseq->MachGetNb()-1);
+  tg_nbphr=msp->MachGetNb();	// one sub-machine of the split layer per position in the target phrase
+  if (data_train && (data_train->GetOdim() != tg_nbphr)) {
+    ErrorN("CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim());
+  }
+
+  cout << " - using cross entropy for each output vector" << endl;
+  phrase_mach.clear();
+  mach_errfct.clear();
+  for (int m=0; m<tg_nbphr; m++) {
+    phrase_mach.push_back(msp->MachGet(m));
+    if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim())
+      Error("CSTM: the output layer dimension must be identical for all phrases\n");
+    //ErrFctSoftmCrossEntNgram *tmp=dynamic_cast<ErrFctSoftmCrossEntNgram*>(errfct);
+    //mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*tmp));	// create copy of user specified error function
+    mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*phrase_mach[m]));	// each machine gets its own error function with local mem for grad
+#ifdef BLAS_CUDA
+    Gpu::SetConfig(mach_errfct[m]->GetGpuConfig());
+    gpu_target.push_back(Gpu::Alloc(bsize*sizeof(REAL), "targets in Trainer"));	// NOTE(review): the size is given with sizeof(REAL) here but without it for gpu_input above -- confirm the unit expected by Gpu::Alloc
+#endif
+  }
+  dim_per_phrase = phrase_mach[0]->GetOdim();
+  cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl;
+  tg_slist_len = dim_per_phrase-1;	// presumably one output is reserved for the words which are not in the short list -- TODO confirm
+
+
+    // get source word list
+  if (sr_wlist == NULL) {	// no list provided by the constructor: take the first one of the dev data, else of the training data
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetSrcWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetSrcWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      sr_wlist = &(vect_wlist->front());
+  }
+  if (sr_wlist == NULL)
+    Error("no source word list available");
+  if ((int) sr_wlist->GetSize() > max_inp_idx)
+    ErrorN("the size of the source word list (%d) exceeds the number of input words the machine was trained for (%d)",(int) sr_wlist->GetSize(),max_inp_idx);
+  debug1("* using source word list with %d words\n",(int)sr_wlist->GetSize());
+
+    // get target word list
+  if (tg_wlist == NULL) {	// same selection than for the source word list
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetTgtWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetTgtWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      tg_wlist = &(vect_wlist->front());
+  }
+  if (tg_wlist == NULL)
+    Error("no target word list available");
+  if (!tg_wlist->FrequSort())
+    Error("the target word list doesn't contain word counts");
+  if (tg_wlist->GetSize() <= tg_slist_len)
+    Error("TrainerPhraseSlist: the output layer is larger than the target word list");
+  debug1("* using target word list with %d words\n",(int)tg_wlist->GetSize());
+    // set up the short list and display its coverage
+  ulong sum_sl=0, sum=0;
+  tg_wlist->SetShortListLength(tg_slist_len);
+  tg_wlist->CountWords(sum_sl, sum);
+  printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, 100.0*sum_sl/sum);	// NOTE(review): sum is assumed to be non-zero
+
+#ifdef DEBUG2
+  cout << "Words in slist:" << endl;
+  WordID ci=tg_slist_len;
+  WordList::const_iterator iter, end = tg_wlist->End();
+  for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+  cout << "Words not in slist:" << endl;
+  for (; iter!=end; iter++)	// continues where the previous loop stopped
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+   // just needed for debugging
+  words.reserve(tg_wlist->GetSize());	// NOTE(review): reserve() does not change size(), the indexed writes of the next line would need resize()
+  for (iter=tg_wlist->Begin(); iter!=end; iter++) words[iter->id] = strdup(iter->word);
+#endif
+  
+  debug0(" + done init TrainerPhraseSlist\n");
+}
+
+//
+// constructor for training
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, Lrate *lrate, ErrFct *perrfct,
+	const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores,
+	REAL p_wd, int p_maxep, int p_ep)
+ : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   ptable(NULL),
+   nb_ex_slist(0), nb_ex_short_tgt(0),
+   nb_forw(0)
+{
+  debug2("*** Constructor TrainerPhraseSlist for training idim=%d, odim=%d ***\n",idim,odim);
+  cout << "Setting up CSTM training with short list" << endl;
+    // optional training data: the input dimension must be the one of the machine, an output dimension of 1 to 32 is accepted
+  if (train_fname) {
+    data_train = new Data(train_fname);
+    if (idim != data_train->GetIdim()) {
+      ErrorN("TrainerPhraseSlist: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim);
+    }
+    if (data_train->GetOdim()<1 || data_train->GetOdim()>32) {	// the message announced 1..10 although up to 32 is accepted
+      ErrorN("TrainerPhraseSlist: output dimension of the training data should be 1..32, found %d\n", data_train->GetOdim());
+    }
+    auxdim = data_train->GetAuxdim();
+  }
+  else 
+    data_train=NULL;
+    // optional validation data: same checks, in addition the auxiliary dimension must agree with the training data
+  if (dev_fname) {
+    data_dev = new Data(dev_fname);
+    data_dev_alloc=true;	// allocated here, in contrast to the constructor for testing which gets its data from the caller
+    if (idim != data_dev->GetIdim()) {
+      ErrorN("TrainerPhraseSlist: input dimension of the validation data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+    }
+    if (data_dev->GetOdim()<1 || data_dev->GetOdim()>32) {	// the message announced 1..10 although up to 32 is accepted
+      ErrorN("TrainerPhraseSlist: output dimension of the validation data should be 1..32, found %d\n", data_dev->GetOdim());
+    }
+    int auxdim_dev = data_dev->GetAuxdim();
+    if (0 >= auxdim)	// NOTE(review): auxdim is only assigned above when training data was given -- confirm that the base class initializes it
+      auxdim = auxdim_dev;
+    else if (auxdim != auxdim_dev)
+      ErrorN("TrainerPhraseSlist: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev);
+  }
+  else {
+    data_dev=NULL;
+    data_dev_alloc=false;
+  }
+  iaux = (idim - auxdim);
+
+  DoConstructorWork();
+    // the external phrase table is only loaded when validation data is available
+  if (data_dev) {
+    if (pt_fname) {
+      ptable = new(PtableMosesPtree);
+      ptable->Read(pt_fname,5,"1:2");	// NOTE(review): p_nscores is not used, 5 scores are announced and the one at position 2 is returned
+    }
+    else
+      cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+  }
+}
+
+//
+// constructor for testing
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, ErrFct *perrfct,
+	Data *data, char *pt_fname, int p_nscores)
+ : Trainer(pmach,NULL,perrfct,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), sr_wlist(NULL), tg_wlist(NULL),
+   ptable(NULL), nb_ex_slist(0), nb_ex_short_tgt(0), nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist for testing ***\n");
+  cout << "Setting up testing with short list" << endl;
+
+    // there is no training data, the test data is provided by the caller
+  data_train=NULL;
+  data_dev=data;
+  data_dev_alloc=false; // do not free it by this class !
+
+  if (data_dev->GetIdim() != idim)
+    ErrorN("TrainerPhraseSlist: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+  auxdim = data_dev->GetAuxdim();
+  iaux = (idim - auxdim);
+
+  DoConstructorWork();
+
+  if (!pt_fname) {
+    cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+    return;
+  }
+
+    // 5 scores are announced for the table, the position of the returned one is chosen at compile time
+#ifdef BACKWARD_TM
+  const char *spec="1:0"; // backward TM prob
+#else
+  const char *spec="1:2"; // forward TM prob
+#endif
+  ptable = new(PtableMosesPtree);
+  ptable->Read(pt_fname,5,spec);
+}
+
+//
+// constructor for nbest rescoring
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach,
+    WordList *p_sr_wlist, WordList *p_tg_wlist,
+	char *pt_fname, int nscores, char *scores_specif)
+ : Trainer(pmach,NULL,NULL,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist),
+   ptable(NULL),
+   nb_ex_slist(0), nb_ex_short_tgt(0), nb_forw(0)	// nb_ex_slist is now initialized like in the two other constructors
+{
+  debug0("*** Constructor TrainerPhraseSlist for block operations ***\n");
+  cout << "Setting up CSTM with short list" << endl;
+  // TODO: init with TrainerNgram before
+  data_train=NULL;	// no data files: the word lists are provided by the caller
+  data_dev=NULL;
+  DoConstructorWork();
+    // optional external phrase table, the scores to return are described by scores_specif (see PtableMosesPtree::Read())
+  if (pt_fname) {
+    ptable = new(PtableMosesPtree);
+    ptable->Read(pt_fname, nscores, scores_specif);
+  }
+  else
+    cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+}
+
+//**************************************************************************************
+
+TrainerPhraseSlist::~TrainerPhraseSlist ()
+{ 
+  debug0("*** Destructor TrainerPhraseSlist ***\n");
+
+  if (buf_target_wid) delete [] buf_target_wid;
+  if (buf_target_ext) delete [] buf_target_ext;
+  if (buf_target_in_blocks) delete [] buf_target_in_blocks;
+    // buf_input and buf_target will be deleted by ~Trainer()
+  delete ptable;	// created with new by the constructors when a phrase table was given, NULL otherwise (it was never deleted)
+#ifdef BLAS_CUDA
+    // free local gpu_target buffer on each GPU
+  for (vector<REAL*>::iterator it=gpu_target.begin(); it!=gpu_target.end(); ++it)
+    if (*it) cudaFree(*it);
+  gpu_target.clear();
+#endif
+    // phrase_mach only refers to parts of "mach", but the error functions were created with new in DoConstructorWork()
+  phrase_mach.clear();
+  for (size_t m=0; m<mach_errfct.size(); m++) delete mach_errfct[m];	// they were lost by the clear() below
+  mach_errfct.clear();
+#ifdef DEBUG2
+  vector<char*>::const_iterator iter, end = words.end();
+  for (iter=words.begin(); iter!=end; iter++) delete *iter;	// NOTE(review): the strings are created with strdup(), free() would be the matching call
+  words.clear();
+#endif
+}
+
+
+//**************************************************************************************
+//
+// We have MachSplit() at the output
+// this means that each machine has its own error function with its own gradient
+//   these error functions point to the outputs in the individual machines
+//   and the gradients stored in this Trainer
+
+REAL TrainerPhraseSlist::Train()
+{
+  if (!data_train) return -1;
+#ifdef DEBUG
+  printf("*****************\n");
+  printf("TrainerPhraseSlist::Train():\n");
+  printf(" - idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr);
+  printf(" -          data_in: %p \n", (void*) buf_input);
+  printf(" -           target: %p \n", (void*) buf_target);
+  printf(" - target_in_blocks: %p \n", (void*) buf_target_in_blocks);
+  printf(" -          tgt WID: %p \n", (void*) buf_target_wid);
+#endif
+
+  Timer ttrain;		// total training time
+  //Timer tload;		// total time to select examples
+  //Timer ttransfer;      // total transfer time of data to GPU
+  //Timer tforw;          // total forw time
+  //Timer tgrad;          // total time fr gradient
+  //Timer tbackw;         // total backw time
+
+  ttrain.start();
+  data_train->Rewind();
+
+  REAL log_sum=0;
+  int i;
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+
+    // set input 
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  debug1(" - gpu_input %p\n", gpu_input);
+#else
+  mach->SetDataIn(buf_input);
+  debug1(" - buf_input %p\n", buf_input);
+#endif
+
+    // connect the error functions for each individual machine
+    // buf_target does sequentially contain all the targets for block0, than block1 and so on
+    // buf_target_in_blocks
+    //  targets are arranged by blocks of bsize, i.e. first bsize targets for 1st machine, than 2nd and so on
+    //  by these means we don't need to copy or re-arrange data later in the GPU
+#ifdef BLAS_CUDA
+  REAL *tptr;
+#else
+  REAL *tptr=buf_target_in_blocks;
+#endif
+  debug0("Error functions of the individual machines:\n");
+  for (i=0; i<tg_nbphr; i++) {
+    mach_errfct[i]->SetOutput(phrase_mach[i]->GetDataOut());
+#ifdef BLAS_CUDA
+    tptr=gpu_target[i];	// we copy later from buf_target_in_blocks to gpu_target
+#endif
+    mach_errfct[i]->SetTarget(tptr);
+    phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad());
+    debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad());
+#ifndef BLAS_CUDA
+    tptr += bsize;	// each example provides 1 target for each output machine (the word ID)
+#endif
+  }
+
+  eos_src = eos_tgt = NULL_WORD;
+  if (sr_wlist->HasEOS()) {
+    eos_src=sr_wlist->GetEOSIndex();
+    printf(" - using a special token for short source sequences (%d)\n", eos_src);
+  }
+  if (tg_wlist->HasEOS()) {
+    eos_tgt=tg_wlist->GetEOSIndex();
+    printf(" - using a special token for short target sequences (%d)\n", eos_tgt);
+  }
+
+    // master loop on all training data
+  bool data_available;
+  do {
+    //tload.start();
+
+      // get a bunch of data and map all the words
+    int n=0;
+    data_available = true;
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_train->Next();
+      if (!data_available) break;
+      debug0("TRAIN DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_train->input[i];
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);
+#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else {
+          buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::Train(): input");       // map context words IDs
+          if (inp==eos_src) at_least_one_short=true;
+        }
+#else
+        buf_input[n*idim + i] = inp;
+        if (inp == NULL_WORD || inp==eos_src)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_train->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0("\n - > mapped output: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      at_least_one_short=false;
+      int nbtgsl=0;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_train->target[i];
+        int idx=i+n*tg_nbphr;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::Train(): output");  // TODO: not really needed during training, just the current value
+        if (outp==NULL_WORD
+            || (at_least_one_short && outp==eos_tgt))	// we only predict the FIRST EOS, the other ones are set to NULL_WORD
+        {   // NULL_WORDS are mapped, they will be detected in gradient calculation
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target[idx]);
+        }
+        else {
+	    // map normal word or EOS
+          nb_tg_words++; // also count EOS since we need to predict them at the output
+          if (outp==eos_tgt) at_least_one_short=true;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            nbtgsl++;
+          }
+          else {
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            all_in_slist=false;
+          }
+        }
+      }
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short_tgt++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+    //tload.stop();
+
+#ifdef DEBUG2
+    printf("network data:\n");
+    REAL *iptr=buf_input;
+    for (int nn=0;nn<n;nn++) {
+       for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+       for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+    }
+#endif
+
+      // process the bunch by the neural network
+      // TODO: a lot of this code is identical with testing -> factor
+    if (n>0) {
+        // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine
+      
+      debug0("re-arrange targets\n");
+      for (i=0; i<tg_nbphr; i++) {
+        tptr=buf_target_in_blocks + i*bsize;	// destination start is always at full bsize blocks
+        debug2(" %d starts at %p\n",i,(void*)tptr);
+        REAL *tptr_src=buf_target+i;
+        for (int b=0; b<n; b++) {	// be careful with bsize and current n !
+          *tptr++=*tptr_src;
+          tptr_src+=tg_nbphr;
+        }
+      }
+   
+#ifdef BLAS_CUDA
+      //ttransfer.start();
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      REAL *tptr=buf_target_in_blocks;
+      for (i=0; i<tg_nbphr; i++) {
+        Gpu::MemcpyAsync(gpu_target[i], tptr , n*sizeof(REAL), cudaMemcpyHostToDevice);
+        tptr += n;
+      }
+      Gpu::StreamSynchronize();
+      //ttransfer.stop();
+#endif
+
+      //tforw.start();
+      mach->Forw(n,true);
+      //tforw.stop();
+
+      //tgrad.start();
+      debug0("call Error functions of the individual machines:\n");
+      for (i=0; i<tg_nbphr; i++) {
+        debug2(" %d: %p\n",i,(void*)mach_errfct[i]);
+#ifdef BLAS_CUDA
+        debug2("#### CUDA: calc gradient for output %d on GPU %d\n", i, Gpu::GetCudaDevice(Gpu::GetDevice(mach_errfct[i]->GetGpuConfig())));
+#endif
+          // the returned log_sum is cumulated over a full batch for one specific output word
+        log_sum += mach_errfct[i]->CalcGradNull(n);
+      }
+      //tgrad.stop();
+
+      debug1("  log_sum=%e\n",log_sum);
+#ifdef DEBUG2
+      int t=(int) data_train->target[0];
+# ifdef BLAS_CUDA
+      Gpu::SetConfig(mach->GetGpuConfig());
+      REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2");
+      cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1);
+      printf("OUTPUT:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1);
+      printf("TARGET:");
+      for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n");
+      //TODO check if we need odim or idim!
+      // TODO: cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1);
+      printf("  GRAD:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasFree(tmp);
+# else
+      printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n");
+      printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n");
+      printf("  GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n");
+# endif //BLAS_CUDA
+#endif //DEBUG2
+
+      lrate->UpdateLrateOnForw(mach->GetNbForw());
+      //tbackw.start();
+      mach->Backw(lrate->GetLrate(), wdecay, n);
+      //tbackw.stop();
+    }
+
+    nb_ex += n;
+  } while (data_available);
+#ifdef BLAS_CUDA
+  Gpu::StreamSynchronize();
+#endif
+
+  ttrain.stop();
+  ttrain.disp(" - training time: ");
+  //tload.disp(" including load: ");
+  //ttransfer.disp(" transfer: ");
+  //tforw.disp(" forw: ");
+  //tgrad.disp(" grad: ");
+  //tbackw.disp(" backw: ");
+  printf("\n");
+  
+  printf(" - CSTM log_sum=%.2f%s, target words=%d, in shortlist=%d, nb_tg_words_slist=%d\n",
+	log_sum, tg_wlist->HasEOS() ? " including EOS" : "", nb_tg_words, nb_ex_slist, nb_tg_words_slist);
+  if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words);  // when normalizing consider that all examples lead to a forward pass 
+
+  return -1;
+}
+
+//**************************************************************************************
+// Dump the hypotheses returned by sort_ngrams() for example 'ni' of the current bunch
+// to 'fspt', one per line: "<source words> ||| <target words> ||| exp(score)"
+void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  typedef std::vector<std::pair<float, std::size_t> > ScoreList;	// element type returned by prepare_hypotheses()
+  typedef std::pair<float, std::vector<std::size_t> > Hypothesis;	// element type returned by sort_ngrams()
+  int max_hyps=100;	// N-best size handed to both helpers
+
+    // the source phrase stops at the first NULL_WORD, or after iaux words
+  const REAL *src_words=buf_input + ni*idim;
+  int input_length=0;
+  while (input_length<iaux && src_words[input_length] != NULL_WORD) input_length++;
+
+  std::vector<ScoreList> prepared_scores
+   = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, max_hyps);
+  std::vector<Hypothesis> best
+   = sort_ngrams(prepared_scores, input_length, max_hyps);
+
+    // one output line per hypothesis
+  for (std::size_t h=0; h<best.size(); ++h) {
+    const Hypothesis &hyp=best[h];
+
+      // source words, each followed by a blank
+    for (int w=0; w<iaux && src_words[w] != NULL_WORD; w++)
+      fspt << sr_wlist->GetWordInfo(src_words[w]).word << " ";
+
+      // target words, each preceded by a blank
+    fspt << "|||";
+    for (std::size_t w=0; w<hyp.second.size(); ++w)
+      fspt << " " << tg_wlist->GetWordInfoMapped(hyp.second[w]).word;
+
+      // score
+    fspt << " ||| " << exp(hyp.first) << "\n";
+  }
+
+}
+
+//**************************************************************************************
+// Disabled (#if 0) variant of GetMostLikelyTranslations(): writes the source words and, per output position, only the best-scoring word
+#if 0 // not compiled
+void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int i;
+	  // Find most likely outputs
+        for (i=0;i<iaux;i++) { // source words up to the first NULL_WORD
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        
+        for (i=0; i<tg_nbphr; i++) { // one word per output position, stops at the first NULL_WORD target
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+  tgrad.disp(" including "); // NOTE(review): 'tgrad' is not declared in this function; looks like a stray line
+  tgrad.disp(" including "); // NOTE(review): same stray line, duplicated
+	    // find max of current word
+	  REAL *sptr=optr+i*dim_per_phrase, max=*sptr++; int max_idx=0;
+          for (int s=1; s<dim_per_phrase; s++, sptr++) {
+            if (*sptr>max) { max=*sptr; max_idx=s; }
+          }
+          fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] "; // word followed by its score in brackets
+        }
+  fspt << endl;
+}
+#endif
+ 
+//**************************************************************************************
+// TestDev(): forward all of data_dev through the network and report a perplexity (px);
+// 'fname', when non-NULL, is opened for the phrase probability stream. Returns px, or -1 without data.
+REAL TrainerPhraseSlist::TestDev(char *fname)
+{
+  if (!data_dev) return -1;
+
+  vector<string> src_phrase;	// interface with classical phrase tables
+  vector<string> tgt_phrase;
+  vector<bool> done_by_cstm;	// per example of the current bunch: all target words in short list
+
+  ofstream fs;
+  if (fname) {
+    cout << " - dumping phrase probability stream to file '" << fname << "'" << endl;
+    fs.open(fname,ios::out);
+    CHECK_FILE(fs,fname);
+  }
+
+#undef DUMP_PHRASE_TABLE
+#ifdef DUMP_PHRASE_TABLE
+  char *ptfname = (char*) "alltrans.txt";
+  ofstream fspt;
+  fspt.open(ptfname,ios::out);
+  CHECK_FILE(fspt,ptfname);
+  cout << " - dumping new phrase table to file '" << ptfname << "'" << endl;
+#endif
+
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0;
+  nb_tg_words=nb_tg_words_slist=0;
+  int nb_not_in_ptable=0;	// this counts the number of phrase pairs which were not found in the external phrase table
+  int nb_src_words=0;
+  REAL log_sum=0;
+  REAL log_sum_notunk=0;	// all known phrase pairs, either CSTM or ptable (count=nb_ex - nb_not_in_ptable)
+  REAL log_sum_cstm=0;		// only CSLM, i.e. considering phrases done by CSTM
+  REAL log_sum_cstm_short=0;	// like CSTM, limited to short n-grams, i.e. we do not count the prediction of (multiple) EOS
+
+  uint idx;
+
+    // set input 
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  debug1(" - gpu_input %p\n", gpu_input);
+#else
+  mach->SetDataIn(buf_input);
+  debug1(" - buf_input %p\n", buf_input);
+#endif
+
+    // connect the error functions for each individual machine
+    // buf_target does sequentially contain all the targets for block0, then block1 and so on
+    // buf_target_in_blocks
+    //  targets are arranged by blocks of bsize, i.e. first bsize targets for 1st machine, then 2nd and so on
+    //  by these means we don't need to copy or re-arrange data later in the GPU
+#ifdef BLAS_CUDA
+  REAL *tptr;
+#else
+  REAL *tptr=buf_target_in_blocks;
+#endif
+  debug0("Error functions of the individual machines:\n");
+  for (int i=0; i<tg_nbphr; i++) {
+    mach_errfct[i]->SetOutput(phrase_mach[i]->GetDataOut());
+#ifdef BLAS_CUDA
+    tptr=gpu_target[i];	// we copy later from buf_target_in_blocks to gpu_target
+#endif
+    mach_errfct[i]->SetTarget(tptr);
+    phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad());
+    debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad());
+#ifndef BLAS_CUDA
+    tptr += bsize;	// each example provides 1 target for each output machine (the word ID)
+#endif
+  }
+
+    // how do we handle short sequences ?
+  eos_src = eos_tgt = NULL_WORD;
+  if (sr_wlist->HasEOS()) {
+    eos_src=sr_wlist->GetEOSIndex();
+    printf(" - using a special token for short source sequences (%d)\n", eos_src);
+  }
+  if (tg_wlist->HasEOS()) {
+    eos_tgt=tg_wlist->GetEOSIndex();
+    printf(" - using a special token for short target sequences (%d)\n", eos_tgt);
+  }
+
+  bool data_available;
+  data_dev->Rewind();
+  do {
+      // get a bunch of data
+    int n=0, i;
+    data_available = true;
+    debug0("start bunch\n");
+    done_by_cstm.clear();
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_dev->Next();
+      if (!data_available) break;
+
+      debug0("DEV DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_dev->input[i];
+        idx=n*idim + i;
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);	// input words belong to the source word list
+#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else {
+          buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::TestDev(): input");       // map context words IDs
+          nb_src_words++;
+          if (inp==eos_src) at_least_one_short=true;
+        }
+#else
+        buf_input[idx] = inp;
+        if (inp == NULL_WORD || inp==eos_src)
+          at_least_one_short=true;
+        else {
+          if (inp<0 || inp>=(int)sr_wlist->GetSize())
+            ErrorN("TrainerPhraseSlist::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+          nb_src_words++;
+        }
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_dev->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0("\n - > mapped output: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      int nbtgsl=0;
+      at_least_one_short=false;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_dev->target[i];
+        idx=i+n*tg_nbphr;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::TestDev(): output");
+        buf_target_ext[idx] = outp;		// keep unmapped target word ID for Moses phrase-table
+        if (outp==NULL_WORD
+            || (at_least_one_short && outp==eos_tgt))   // we only predict the FIRST EOS, the other ones are set to NULL_WORD
+        {   // NULL_WORDS are mapped, they will be detected in gradient calculation
+          buf_target_wid[idx] = NULL_WORD;
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target_wid[idx]);
+        }
+        else {
+            // map normal word or EOS
+          nb_tg_words++; // also count EOS since we need to predict them at the output
+          if (outp==eos_tgt) at_least_one_short=true;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+	    nbtgsl++;
+          }
+          else {
+	      // TODO: we actually don't need a forward pass for words in the short lists or short n-grams
+	      //       this could be used to save some time (5-10%)
+            buf_target_wid[idx] = tg_slist_len;
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            all_in_slist=false;
+          }
+        }
+      }
+      done_by_cstm.push_back(all_in_slist);
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short_tgt++;	// count SHORT targets, same rule as in Train()
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+
+      // process the bunch by the neural network
+    if (n>0) {
+        // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine
+      
+      debug0("re-arrange targets\n");
+      for (i=0; i<tg_nbphr; i++) {
+        tptr=buf_target_in_blocks + i*bsize;	// destination start is always at full bsize blocks
+        debug2(" %d starts at %p\n",i,(void*)tptr);
+        REAL *tptr_src=buf_target+i;
+        for (int b=0; b<n; b++) {	// be careful with bsize and current n !
+          *tptr++=*tptr_src;
+          tptr_src+=tg_nbphr;
+        }
+      }
+    
+#ifdef BLAS_CUDA
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      REAL *tptr=buf_target_in_blocks;
+      for (i=0; i<tg_nbphr; i++) {
+        Gpu::MemcpyAsync(gpu_target[i], tptr , n*sizeof(REAL), cudaMemcpyHostToDevice);
+        tptr += bsize;	// block i starts at i*bsize on the host (see re-arrangement above), also when n<bsize
+      }
+      Gpu::StreamSynchronize();
+#endif
+      mach->Forw(n,false); 
+      for (i=0; i<tg_nbphr; i++) {
+          // the returned log_sum is cumulated over a full batch for one specific output word
+        //log_sum += mach_errfct[i]->CalcValueNull(n);
+        log_sum += mach_errfct[i]->CalcGradNull(n);	// TODO: should use CalcValueNull()
+      }
+    }
+
+#if DIRECT_PROBA_CALCULATION
+      // get probas from CSLM or back-off LM
+#ifdef BLAS_CUDA
+      // host output is of dim bsize*odim - bsize*tg_nphr*dim_per_phrase
+      // it contains the whole bunch of the 1st output, then whole bunch of 2nd output, etc
+    for (int i=0; i<tg_nbphr; i++) {
+      Gpu::MemcpyAsync(host_output+i*bsize*dim_per_phrase,phrase_mach[i]->GetDataOut(), n*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost);
+      // TODO: we actually copy too much data, for each output vector we only need one value !
+    }
+    Gpu::StreamSynchronize();
+#endif
+
+    debug1("Collect n=%d\n", n);
+    if (n!=(int) done_by_cstm.size())
+      Error("TrainerPhraseSlist::TestDev(): internal error, number of phrases done by CSTM does not match");
+
+    REAL *ptr_input = buf_input;	// n times idim values
+    for (int ni=0; ni<n; ni++) {
+      REAL logP=0.0, logP_short=0.0;
+      if (done_by_cstm[ni]) {
+          // get proba from CSTM (removed renorm)
+
+        for (i=0; i<tg_nbphr; i++) {
+          WordID cur_tg=buf_target_wid[i+ni*tg_nbphr];
+          if (cur_tg == NULL_WORD) break;
+		// get proba from output i for bunch ni
+#ifdef BLAS_CUDA
+	  REAL *optr=host_output+i*bsize*dim_per_phrase + ni*dim_per_phrase;
+#else
+	  REAL *optr=phrase_mach[i]->GetDataOut() + ni*dim_per_phrase;
+#endif
+          logP += safelog(optr[cur_tg]); // no error check on indices necessary here
+          if (buf_target_ext[i+ni*tg_nbphr] != eos_tgt) { // exclude the (easy) prediction of EOS from stats
+            logP_short += safelog(optr[cur_tg]); // no error check on indices necessary here
+          }
+          debug5("n=%3d, pos=%d, tg_w=%d (unmapped %d), P=%f\n",ni,i,cur_tg,buf_target_ext[i+ni*tg_nbphr],optr[cur_tg]);
+        }
+        debug2(" -      -> logP=%f, logP_short=%f\n",logP,logP_short); 
+
+#ifdef DUMP_PHRASE_TABLE
+          // create output phrase table
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        for (i=0;i<tg_nbphr;i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == eos_tgt) break;
+          fspt << tg_wlist->GetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " ";
+        }
+        fspt << "||| " << logP << endl;
+#endif
+
+#ifdef DUMP_PHRASE_TABLE_NBEST
+	Error("GetMostLikelyTranslations() change to work with multiple output vectors");
+        GetMostLikelyTranslations(fspt,optr,ni);
+#endif
+
+        debug1(" CSLM: logP=%e\n", logP);
+        log_sum_cstm += logP;
+        log_sum_cstm_short += logP_short;
+        log_sum_notunk += logP;
+        log_sum += logP;
+      }
+      else {
+Error("not done by CSTM");
+
+       if (ptable) {
+          // request proba from Moses phrase-table
+         debug0("create textual phrase pair for external phrase table (word + index)\n");
+         src_phrase.clear();
+         debug0("  source:");
+         for (i=0; i<iaux && ptr_input[i]!=NULL_WORD; i++) {
+           src_phrase.push_back(sr_wlist->GetWordInfo((uint) ptr_input[i]).word);	// TODO: char* to string
+           debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]);
+#ifdef DUMP_PHRASE_TABLE
+           fspt << src_phrase.back() << " ";
+#endif
+         }
+
+#ifdef DUMP_PHRASE_TABLE
+         fspt << "|P| ";
+#endif
+         tgt_phrase.clear();
+         debug0("  target:");
+         for (i=0; i<tg_nbphr && buf_target_ext[i+ni*tg_nbphr]!=eos_tgt; i++) {
+           tgt_phrase.push_back(tg_wlist->GetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word);	// TODO: char* to string
+           debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]);
+#ifdef DUMP_PHRASE_TABLE
+           fspt << tgt_phrase.back() << " ";
+#endif
+         }
+# ifdef BACKWARD_TM
+         logP = ptable->GetProb(tgt_phrase, src_phrase);
+# else
+         logP = ptable->GetProb(src_phrase, tgt_phrase);
+# endif
+         if (logP == PROBA_NOT_IN_PTABLE) nb_not_in_ptable++;
+                                     else log_sum_notunk += logP;	// NOTE(review): added before safelog() below, unlike the CSTM branch
+         logP = safelog(logP); // take log now
+         debug1("  => logP=%e\n",logP);
+         log_sum += logP;
+       }
+       else { // no ptable was specified
+         logP=0; // flag output that it wasn't done by CSTM
+       }
+#ifdef DUMP_PHRASE_TABLE
+       fspt << "||| " << logP << endl;
+#endif
+      } // not done by CSTM
+          
+      ptr_input += idim;  // next example in bunch at input
+      if (fname) {
+        fs << logP << endl;
+      }
+    }
+#endif // old proba calculation
+
+    nb_ex += n;
+    debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex));
+  } while (data_available);
+
+  printf(" - %d phrases, %d target words, avr length src=%.1f tgt=%.1f, CSTM: %d phrases (%.2f), %d target words (%.2f)\n",
+	 nb_ex, nb_tg_words, (REAL) nb_src_words/nb_ex, (REAL) nb_tg_words/nb_ex,
+	 nb_ex_slist, 100.0*nb_ex_slist/nb_ex, nb_tg_words_slist, 100.0 * nb_tg_words_slist/nb_tg_words);
+  if (ptable) {
+    printf(" - %d words were looked up in external phrase table, %d (%.2f%% were not found)\n",
+	nb_ex-nb_ex_slist, nb_not_in_ptable, 100.0*nb_not_in_ptable/(nb_ex-nb_ex_slist));
+  }
+
+#if DIRECT_PROBA_CALCULATION
+  REAL px = (nb_ex>0) ? exp(-log_sum / (REAL) nb_ex) : -1;
+  printf("   cstm px=%.2f, ln_sum=%.2f, cstm_short_px=%.2f, ln_sum=%.2f, overall px=%.2f, with unk=%.2f\n",
+        (nb_ex_slist>0) ? exp(-log_sum_cstm / (REAL) nb_ex_slist) : -1, log_sum_cstm,
+        (nb_ex_slist>0) ? exp(-log_sum_cstm_short / (REAL) nb_ex_slist) : -1, log_sum_cstm_short,
+        (nb_ex-nb_not_in_ptable>0) ? exp(-log_sum_notunk / (REAL) (nb_ex-nb_not_in_ptable)) : -1,
+        px);
+#else
+  REAL px = (nb_tg_words_slist>0) ? exp(-log_sum / (REAL) nb_tg_words_slist) : -1;	// guard the actual divisor
+  printf("   px=%.2f, ln_sum=%.2f\n", px, log_sum);
+#endif
+
+  if (fname) fs.close();
+#ifdef DUMP_PHRASE_TABLE
+  fspt.close();
+#endif
+
+  return px;
+}
+
+
+//**************************************************************************************
+// information after finishing an epoch
+
+void TrainerPhraseSlist::InfoPost ()
+{
+    // percentages are relative to nb_ex; if EOS is predicted by the NN, we don't count it as short
+  const double pct_short_src = 100.0*nb_ex_short_inp/nb_ex;
+  const double pct_short_tgt = 100.0*nb_ex_short_tgt/nb_ex;
+  const double pct_cstm = 100.0*nb_ex_slist/nb_ex;
+  printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n",
+	nb_tg_words, nb_ex, pct_short_src, pct_short_tgt);
+  printf("   CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n", nb_tg_words_slist, nb_ex_slist, pct_cstm, err_train);
+}
+
+//**************************************************************************************
+// request one n-gram probability; usually the call will be delayed
+// and processed later
+
+
+//**************************************************************************************
+// collect all delayed probability requests
+//  forwards the 'bs' examples stored in buf_input, then scores the requests areq[req_beg..req_end]
+//  and adds each score as feature 'tm_pos' to the hypothesis of the request
+void TrainerPhraseSlist::ForwAndCollect(vector< vector<string> > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos)
+{
+  if (bs<=0) return;
+  debug3("TrainerPhraseSlist::ForwAndCollect(): collecting outputs %d .. %d from bunch of size %d\n", req_beg, req_end, bs);
+  debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim);
+
+  if (bs != (int) src_phrases.size())
+    ErrorN("TrainerPhraseSlist::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs);
+
+#ifdef DEBUG
+  printf("bunch of %d\n",bs);
+  for (int b=0; b<bs; b++) {
+    printf("%3d:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %.2f", buf_input[b*idim+ii]); printf("\n");
+  }
+#endif
+
+  nb_forw++;
+#ifdef BLAS_CUDA // same guard as all other GPU sections of this file
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);
+  Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+#else
+  mach->SetDataIn(buf_input);
+#endif
+  mach->Forw(bs,false);
+
+#ifdef BLAS_CUDA
+  for (int tw=0; tw<tg_nbphr; tw++)
+    Gpu::MemcpyAsync(host_output + tw*bsize*dim_per_phrase, phrase_mach[tw]->GetDataOut(), bs*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost);
+  Gpu::StreamSynchronize();
+#endif
+
+    // stats
+  int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0;
+
+  for (int n=req_beg; n<=req_end; n++) {	// req_end is inclusive
+    REAL logP=0;
+    int b=areq[n].bs;	// position of this request's source phrase in the bunch
+
+    if ((int) areq[n].tgph.size() > tg_nbphr)
+      ErrorN("TrainerPhraseSlist::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr);
+
+#ifdef DEBUG
+    printf("collect b=%3d \n input:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %f",buf_input[b*idim+ii]); printf("\n");
+#endif
+
+      // map target words
+    debug0(" output:");
+    bool all_in_slist=true;
+    int tw;
+    for (tw=0; all_in_slist && tw<tg_nbphr; tw++) {
+      WordID outp = areq[n].tgwid[tw];	// NOTE(review): tgwid has 16 entries (AlignReq.h), tg_nbphr is not checked against that here
+      debug1(" %d",outp);
+      if (outp==eos_tgt) break;
+      cnt_tg_words++;
+      buf_target_wid[tw] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::ForwAndCollect() output");
+      debug1("->%d",buf_target_wid[tw]);
+      all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]);
+    }
+      // fill up
+    for (; tw<tg_nbphr; tw++) {
+      debug0(" fill");
+      buf_target_wid[tw]=eos_tgt;
+    }
+    debug1("    slist=%d\n",all_in_slist);
+
+    if (!all_in_slist) {
+        // get proba from external phrase table
+      logP=safelog(ptable->GetProb(src_phrases[areq[n].bs], areq[n].tgph));	// NOTE(review): ptable is not checked for NULL here (TestDev() does check)
+      debug1(" ptable: logP=%f\n", logP);
+    }
+    else {
+        // get proba from CSLM
+      debug0(" -  in slist CSLM:");
+      logP=0; int cnt=0;
+      for (int tw=0; tw<tg_nbphr; tw++) {
+        if (buf_target_wid[tw] == eos_tgt) break;	// NOTE(review): eos_tgt (unmapped) is the fill marker, yet is compared with mapped IDs
+#ifdef BLAS_CUDA
+        //old;  REAL *optr=host_output + b*odim;
+        //test: REAL *optr=host_output+i*bsize*dim_per_phrase + ni*dim_per_phrase;
+        REAL *optr=host_output+tw*bsize*dim_per_phrase + b*dim_per_phrase;
+#else
+        //old: REAL *optr=mach->GetDataOut() + b*odim;
+        //test: REAL *optr=phrase_mach[i]->GetDataOut() + ni*dim_per_phrase;
+        //TODO: it would be much more efficient to do all the examples of one machine and then switch to the next one
+        REAL *optr=phrase_mach[tw]->GetDataOut() + b*dim_per_phrase;
+#endif
+        debug1(" %e", optr[buf_target_wid[tw]]);
+        logP += safelog(optr[buf_target_wid[tw]]);
+        cnt++;
+      }
+      if (cnt==0) Error("no target phrases when collecting output");
+      logP /= cnt; // TODO: is this normalization correct ?
+      debug1(" -> log avr=%f\n",logP);
+
+      cnt_ex_slist++;
+      cnt_tg_words_slist += cnt;
+    }
+
+        // store LM proba
+    areq[n].hyp->AddFeature(logP,tm_pos);
+  } // for (ni=...)
+
+  printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n",
+	 req_end-req_beg+1, cnt_tg_words, cnt_ex_slist, (float) 100.0* cnt_ex_slist / (req_end-req_beg+1), (float) cnt_tg_words_slist/cnt_ex_slist);
+  nb_ex += (req_end-req_beg+1);
+  nb_ex_slist += cnt_ex_slist;
+  nb_tg_words_slist += cnt_tg_words_slist;
+  nb_tg_words += cnt_tg_words;
+}
+
+
+void TrainerPhraseSlist::BlockStats()
+{   // Print the number of forward passes and the share of phrases predicted by the CSTM.
+    // (an older, more detailed report based on nb_ngram and nb_ex_short_tgt is no longer printed)
+  const double cstm_percent = 100.0 * nb_ex_slist/nb_ex;
+  printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n", nb_forw, nb_ex_slist, cstm_percent);
+}
diff --git a/TrainerPhraseSlist.h b/TrainerPhraseSlist.h
new file mode 100644
index 0000000..81a83f8
--- /dev/null
+++ b/TrainerPhraseSlist.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _TrainerPhraseSlist_h
+#define _TrainerPhraseSlist_h
+
+#include <ostream>
+#include "Tools.h"
+#include "Mach.h"
+#include "ErrFct.h"
+#include "DataPhraseBin.h"
+#include "Trainer.h"
+#include "WordList.h"
+
+#include "PtableMosesPtree.h"
+#include "AlignReq.h"
+
+//
+// Class to train neural networks which predict phrase translation probabilities
+//  - the NN predicts the probabilities of a short list of target words only
+//  - the probabilities of all other target words are obtained from a classical Moses phrase table
+//  - the NN also predicts the probability mass of ALL the words which are not in the short list;
+//    the last output neuron of the network is used for this
+
+
+class TrainerPhraseSlist : public Trainer
+{
+private:
+  int		max_inp_idx;		// largest index -1 of a word at the input (# of entries in projection table)
+  int		tg_nbphr;		// number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr
+  int		dim_per_phrase;		// output dimension of each phrase prediction layer (must be equal size)
+  WordID	tg_slist_len;		// length of slist (this is set to dim_per_phrase MINUS ONE)
+  WordList	*sr_wlist;		// source word list
+  WordList	*tg_wlist;		// target word list
+  vector<Mach*> phrase_mach;		// pointer to the output machine for each phrase
+  vector<ErrFct*> mach_errfct;		// each individual machine has its own error function with local memory
+					// in this version of the Trainer the error function is identical to all machines
+					// (we use the one in the local variable of the mother class Trainer)
+ 
+  PtableMosesPtree	*ptable;	// classical phrase table
+
+    // handling of short sequences
+    // 			input		output	
+    // NULL_WORD	set proj=0	set grad=0
+    // EOS		as normal word	as normal word
+    //
+  WordID eos_src, eos_tgt;		// defaults to NULL_WORD if no special symbol in word list
+
+    // various stats
+  int		nb_ex_slist;		// total number of examples processed in slist
+  int		nb_ex_short_inp;	// total number of incomplete input phrases
+  int		nb_ex_short_tgt;	// total number of incomplete target phrases
+  int		nb_tg_words;		// total number of target words (there can be several target words for a phrase pair)
+  int		nb_tg_words_slist;	// total number of target words which are in short list
+// TODO: use WordID vector for targets in order to make less casts 
+  WordID	*buf_target_wid;	// used instead of buf_target to avoid casts between REAL and WordID
+					// size is odim x bsize
+  WordID	*buf_target_ext;	// similar to buf_target_wid[], but keeps even the word ids outside of the short list
+					// needed to request probas from external phrase table
+  REAL		*buf_target_in_blocks;	// same data as in buf_target of Trainer class, but re-arranged in blocks for individual machines
+#ifdef BLAS_CUDA
+  vector<REAL*> gpu_target;	// copied from trainer to GPU
+#endif
+#ifdef DEBUG
+  vector<char*>  words;			// give UTF8 word for a given CSLM internal index
+#endif
+  REAL DoTestDev(char*, bool);	// internal helper function
+  void DoConstructorWork();	// internal helper function for the various constructors
+    // data and functions for block processing
+  int	nb_forw;		// stats on total number of forward passes
+  void GetMostLikelyTranslations(ofstream&,REAL*,int);
+protected:
+  virtual void InfoPost();			// dump information after finishing a training epoch
+public:
+  TrainerPhraseSlist(Mach*, Lrate*, ErrFct*,	// mach, lrate, errfct
+	  const char*, const char*, const char*, int,	// train, dev, external phrase table, number of scores
+	  REAL =0, int =10, int =0);			// wdecay, max epochs, current epoch
+  TrainerPhraseSlist(Mach*, ErrFct*, Data*,	// for testing only: mach, errfct, binary data
+	  char*, int);				// external phrase table, number of scores
+  TrainerPhraseSlist(Mach*, WordList*, WordList*,	// for general proba calculation: mach, src word list, tgt word list
+	  char*, int , char*);			// external phrase table, number of scores, score specif
+  virtual ~TrainerPhraseSlist();
+  virtual REAL Train();				// train for one epoch
+  virtual REAL TestDev(char* =NULL);		// test current network on dev data and save outputs into file
+    // fast block evaluation functions
+  virtual void StoreInput(int b, int d, REAL val) {buf_input[b*idim+d]=val;}	// example b, input position d: consecutive examples are idim values apart (was b*bsize)
+  virtual void ForwAndCollect(vector< vector<string> > &, AlignReq*, int,int,int,int);	// for nbest rescoring
+  virtual void BlockStats();				// display some stats on Block mode
+    // interface functions
+  virtual int GetTgtNbPhr() {return tg_nbphr; }		// number of phrases at the output
+  virtual int GetSlistLen() {return tg_slist_len; }	// length of the target short list
+  virtual REAL *GetBufInput() {return buf_input; }	// direct access to the input buffer
+};
+
+#endif
diff --git a/TrainerPhraseSlist1.cpp b/TrainerPhraseSlist1.cpp
new file mode 100644
index 0000000..5f369d4
--- /dev/null
+++ b/TrainerPhraseSlist1.cpp
@@ -0,0 +1,951 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ */
+
+using namespace std;
+#include <iostream>
+#include <algorithm>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+
+#include "Tools.h"
+#include "Mach.h"
+#include "MachTab.h"
+#include "MachPar.h"
+#include "MachSeq.h"
+#include "MachSplit1.h"
+#include "TrainerPhraseSlist1.h"
+#include "NBest.h" 
+#include "sort.cpp" 
+
+void TrainerPhraseSlist1::DoConstructorWork()	// common set-up, called by all three constructors
+{
+  char	msg[1024];
+    // cache the machine dimensions, allocate the target buffers, check the network layout and set up both word lists
+  idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize();
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer");
+  gpu_target = Gpu::Alloc(odim*bsize, "targets in Trainer");
+  host_output = new REAL[odim*bsize];
+#endif
+  buf_target_wid = new WordID[odim*bsize];	// released in the destructor
+  buf_target_ext = new WordID[odim*bsize];	// released in the destructor
+
+    // set up vector to outputs of the target phrases
+  if (mach->GetMType() != file_header_mtype_mseq)
+    Error("CSTM: sequential machine needed\n");
+  MachSeq *mseq=(MachSeq*) mach;
+  if (mseq->MachGetNb()<2)
+    Error("CSTM: the number of machines is suspeciously small");
+
+    // check input layer: the first machine must be a parallel machine which starts with a table
+  if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar)
+    Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n");
+  MachPar *mpar = (MachPar*) mseq->MachGet(0);
+  if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab)
+    Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n");
+  MachTab *mtab = (MachTab*) mpar->MachGet(0);
+  max_inp_idx = mtab->GetMaxInpVal();
+
+    // check output layer: the last machine must be a split machine with one sub-machine per target phrase position
+  if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit1)
+    Error("CSTM: the output layer has the wrong architecture\n");
+  MachSplit1 *msp = (MachSplit1*) mseq->MachGet(mseq->MachGetNb()-1);
+  tg_nbphr=msp->MachGetNb();
+  if (data_train && (data_train->GetOdim() != tg_nbphr)) {
+    snprintf(msg,sizeof(msg),"CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim());
+    Error(msg);
+  }
+  if (tg_nbphr<1) Error("CSTM: the output layer contains no phrase machine\n");	// phrase_mach[0] is read below
+  phrase_mach.clear();
+  for (int m=0; m<tg_nbphr; m++) {
+    phrase_mach.push_back(msp->MachGet(m));
+    if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim())
+      Error("CSTM: the output layer dimension must be identical for all phrases\n");
+  }
+  dim_per_phrase = phrase_mach[0]->GetOdim();
+  cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl;
+  tg_slist_len = dim_per_phrase-1;	// the last output neuron stands for all the words outside of the short list
+
+
+    // get source word list (dev data is preferred over training data when none was provided)
+  if (sr_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetSrcWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetSrcWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      sr_wlist = &(vect_wlist->front());
+  }
+  if (sr_wlist == NULL)
+    Error("no source word list available");
+  if ((int) sr_wlist->GetSize() > max_inp_idx)
+    Error("the size of the source word list exceeds the number of input words the machine was trained for");
+
+    // get target word list (same selection rule as for the source side)
+  if (tg_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetTgtWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetTgtWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      tg_wlist = &(vect_wlist->front());
+  }
+  if (tg_wlist == NULL)
+    Error("no target word list available");
+  if (!tg_wlist->FrequSort())
+    Error("the target word list don't contain word count");
+  if (tg_wlist->GetSize() <= tg_slist_len)
+    Error("TrainerPhraseSlist1: the output layer is larger than the target word list");
+
+  ulong sum_sl=0, sum=0;
+  tg_wlist->SetShortListLength(tg_slist_len);
+  tg_wlist->CountWords(sum_sl, sum);
+  printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, (sum>0) ? 100.0*sum_sl/sum : 0.0);	// no division by zero for empty counts
+
+#ifdef DEBUG2
+  cout << "Words in slist:" << endl;
+  WordID ci=tg_slist_len;
+  WordList::const_iterator iter, end = tg_wlist->End();
+  for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+  cout << "Words not in slist:" << endl;
+  for (; iter!=end; iter++)	// continues where the previous loop stopped
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+   // just needed for debugging
+  words.resize(tg_wlist->GetSize(), (char*) NULL);	// resize, not reserve: the elements are assigned by index below
+  for (iter=tg_wlist->Begin(); iter!=end; iter++) { if ((size_t) iter->id >= words.size()) words.resize(iter->id+1, (char*) NULL); words[iter->id] = strdup(iter->word); }
+#endif
+  
+  debug0(" + done init TrainerPhraseSlist1\n");
+}
+
+//
+// constructor for training
+//
+
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, Lrate *lrate, ErrFct *perrfct,
+	const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores,	// each file name may be NULL
+	REAL p_wd, int p_maxep, int p_ep)
+ : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   nb_ex_slist(0), nb_ex_short(0),
+   nb_forw(0)
+{
+  debug2("*** Constructor TrainerPhraseSlist1 for training idim=%d, odim=%d ***\n",idim,odim);
+  cout << "Setting up CSTM training with short list" << endl;
+  char msg[1024];
+  nb_ex_short_inp=0; nb_tg_words=nb_tg_words_slist=0;	// counters which are not covered by the initialiser list
+  if (train_fname) {	// load the training data and check its dimensions against the machine
+    data_train = new Data(train_fname);
+    if (idim != data_train->GetIdim()) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim);
+      Error(msg);
+    }
+    if (data_train->GetOdim()<1 || data_train->GetOdim()>10) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: output dimension of the training data should be 1..10, found %d\n", data_train->GetOdim());
+      Error(msg);
+    }
+    auxdim = data_train->GetAuxdim();
+  }
+  else 
+    data_train=NULL;
+
+  if (dev_fname) {	// same checks for the validation data, which is owned by this object
+    data_dev = new Data(dev_fname);
+    data_dev_alloc=true;
+    if (idim != data_dev->GetIdim()) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the validation data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+      Error(msg);
+    }
+    if (data_dev->GetOdim()<1 || data_dev->GetOdim()>10) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: output dimension of the validation data should be 1..10, found %d\n", data_dev->GetOdim());
+      Error(msg);
+    }
+    int auxdim_dev = data_dev->GetAuxdim();
+    if (0 >= auxdim)	// no auxiliary dimension known so far: take the one of the validation data
+      auxdim = auxdim_dev;
+    else if (auxdim != auxdim_dev)
+      ErrorN("TrainerPhraseSlist1: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev);
+  }
+  else {
+    data_dev=NULL;
+    data_dev_alloc=false;
+  }
+  iaux = (idim - auxdim);	// the first iaux input values are word indices, the remaining ones auxiliary data
+
+  DoConstructorWork();
+
+  if (data_dev) {	// the external phrase table is only loaded when validation data is available
+    if (pt_fname) {
+      cout << " - loading external phrase table from " << pt_fname << endl;
+      ptable.Read(pt_fname,5,"1:2");	// NOTE(review): the number of scores is fixed to 5, p_nscores is not used - confirm that this is intended
+    }
+    else
+      cout << " - no external phrase table provided" << endl;
+  }
+}
+
+//
+// constructor for testing
+//
+
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, ErrFct *perrfct,
+	Data *data, char *pt_fname, int p_nscores)	// data is used as validation data and stays owned by the caller
+ : Trainer(pmach,NULL,perrfct,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   nb_ex_slist(0), nb_ex_short(0),
+   nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist1 for testing ***\n");
+  cout << "Setting up testing with short list" << endl;
+  char	msg[1024];
+  nb_ex_short_inp=0; nb_tg_words=nb_tg_words_slist=0;	// TestDev() increments nb_ex_short_inp without resetting it
+  data_train=NULL;
+  data_dev=data;
+  data_dev_alloc=false; // do not free it by this class !
+
+  if (idim != data_dev->GetIdim()) {
+    snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+    Error(msg);
+  }
+  auxdim = data_dev->GetAuxdim();
+  iaux = (idim - auxdim);	// the first iaux input values are word indices, the remaining ones auxiliary data
+
+  DoConstructorWork();
+    // NOTE(review): pt_fname is not checked for NULL and p_nscores is not used (the number of scores is fixed to 5) - confirm
+  cout << " - loading external phrase table from " << pt_fname << endl;
+#ifdef BACKWRAD_TM
+  ptable.Read(pt_fname,5,"1:0"); // backward TM prob
+#else
+  ptable.Read(pt_fname,5,"1:2"); // forward TM prob
+#endif
+}
+
+//
+// constructor for nbest rescoring
+//
+
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach,
+    WordList *p_sr_wlist, WordList *p_tg_wlist,		// word lists are provided by the caller and only referenced here
+	char *pt_fname, int nscores, char *scores_specif)	// all three are passed on to ptable.Read()
+ : Trainer(pmach,NULL,NULL,NULL,NULL), // TODO; should I call:  TrainerNgram(pmach,NULL,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist),
+   nb_ex_short(0), nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist1 for block operations ***\n");
+  cout << "Setting up CSTM with short list" << endl;
+  // TODO: init with TrainerNgram before
+  DoConstructorWork();
+  nb_ex_slist=nb_ex_short_inp=0; nb_tg_words=nb_tg_words_slist=0;	// statistics which are missing in the initialiser list
+  cout << " - loading external phrase table from " << pt_fname << endl;
+  ptable.Read(pt_fname, nscores, scores_specif);
+}
+
+//**************************************************************************************
+
+TrainerPhraseSlist1::~TrainerPhraseSlist1 ()	// releases the buffers which were allocated in DoConstructorWork()
+{ 
+  debug0("*** Destructor TrainerPhraseSlist1 ***\n");
+
+  if (buf_target_wid) delete [] buf_target_wid;
+  if (buf_target_ext) delete [] buf_target_ext;
+    // buf_input and buf_target will be deleted by ~Trainer()
+
+  phrase_mach.clear();	// the machines themselves are not deleted here, only the pointers are dropped
+
+#ifdef DEBUG2
+  vector<char*>::const_iterator iter, end = words.end();
+  for (iter=words.begin(); iter!=end; iter++) free(*iter);	// the strings come from strdup(), so free() and not delete
+  words.clear();
+#endif
+}
+
+
+//**************************************************************************************
+
+REAL TrainerPhraseSlist1::Train()	// one pass over data_train; returns exp(-log_sum/nb_tg_words), or -1 without training data or target words
+{
+  if (!data_train) return -1;
+#ifdef DEBUG
+  printf("*****************\n");
+  printf("TrainerPhraseSlist1::Train():\n");
+  printf(" -    idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr);
+  printf(" -  data_in: %p \n", (void*) buf_input);
+  printf(" -   target: %p \n", (void*) buf_target);
+  printf(" -  tgt WID: %p \n", (void*) buf_target_wid);
+  printf(" - grad_out: %p \n", (void*) errfct->GetGrad());
+#endif
+
+  Timer ttrain;		// total training time
+  Timer tload;		// time to read and map the bunches of examples
+  Timer ttransfer;      // total transfer time of data to GPU
+  Timer tforw;          // total forw time
+  Timer tgrad;          // total gradient time
+  Timer tbackw;         // total backw time
+  ttrain.start();
+
+  data_train->Rewind();
+
+  REAL log_sum=0;	// sum of the values returned by errfct->CalcGrad()
+  int i;
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  errfct->SetTarget(gpu_target);	// we copy from buf_target to gpu_target
+  debug1(" - gpu_input %p\n", gpu_input);
+  debug1(" - gpu_target %p\n", gpu_target);
+#else
+  mach->SetDataIn(buf_input);
+  errfct->SetTarget(buf_target);
+  debug1(" - buf_input %p\n", buf_input);
+  debug1(" - buf_target %p\n", buf_target);
+#endif
+  errfct->SetOutput(mach->GetDataOut());
+  mach->SetGradOut(errfct->GetGrad());
+  bool data_available;	// becomes false when data_train->Next() delivers no further example
+  do {
+    tload.start();
+      // get a bunch of data and map all the words
+    int n=0, nbtgsl=0;	// n: examples in this bunch, nbtgsl: short list target words of the current example
+    data_available = true;
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_train->Next();
+      if (!data_available) break;
+      debug0("TRAIN DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_train->input[i];
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);
+#if TODO // should we map input data ?
+        buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::Train(): input");       // map context words IDs
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+#else
+        buf_input[n*idim + i] = inp;	// the index is stored without mapping
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist1::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_train->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0(" - > mapped: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      at_least_one_short=false;	// re-used for the target side
+      nbtgsl=0;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_train->target[i];
+        int idx=i+n*tg_nbphr;	// tg_nbphr targets per example
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::Train(): output");     // TODO: not really needed during training, just the current value
+        if (outp==NULL_WORD) {
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target[idx]);
+        }
+        else {
+          nb_tg_words++;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            nbtgsl++;
+          }
+          else {
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            all_in_slist=false;
+          }
+        }
+      }
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+    tload.stop();
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+    if (n>0) {	// nothing to do when no example could be read for this bunch
+#ifdef BLAS_CUDA
+      ttransfer.start();
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::StreamSynchronize();
+      ttransfer.stop();
+#endif
+      tforw.start();
+      mach->Forw(n,true);
+      tforw.stop();
+
+      tgrad.start();
+      log_sum += errfct->CalcGrad(n);
+      tgrad.stop();
+
+      debug1("  log_sum=%e\n",log_sum);
+#ifdef DEBUG2
+      int t=(int) data_train->target[0];
+#ifdef BLAS_CUDA
+      Gpu::SetConfig(mach->GetGpuConfig());
+      REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2");
+      cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1);
+      printf("OUTPUT:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1);
+      printf("TARGET:");
+      for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n");
+      //TODO check if we need odim or idim!
+      cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1);
+      printf("  GRAD:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasFree(tmp);
+#else
+printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n");
+printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n");
+printf("  GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n");
+#endif //BLAS_CUDA
+#endif //DEBUG2
+      lrate->UpdateLrateOnForw(mach->GetNbForw());
+      tbackw.start();
+      mach->Backw(lrate->GetLrate(), wdecay, n);
+      tbackw.stop();
+    }
+
+    nb_ex += n;
+  } while (data_available);
+#ifdef BLAS_CUDA
+  Gpu::StreamSynchronize();
+#endif
+
+  ttrain.stop();
+  ttrain.disp(" - training time: ");
+  tload.disp(" including load: ");
+#ifdef BLAS_CUDA
+  ttransfer.disp(" transfer: ");
+#endif
+  tforw.disp(" forw: ");
+  tgrad.disp(" grad: ");
+  tbackw.disp(" backw: ");
+  printf("\n");
+  
+  printf(" = log_sum=%.2f, nb_tg_words=%d, nb_ex_slist=%d, nb_tg_words_slist=%d\n", log_sum, nb_tg_words, nb_ex_slist, nb_tg_words_slist);
+  if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words);  // when normalizing consider that all examples lead to a forward pass 
+
+  return -1;	// no target word was seen
+}
+
+//**************************************************************************************
+// 
+
+void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+    // Dump the hypotheses which sort_ngrams() returns for example ni of the current bunch,
+    // one line "source words ||| target words ||| exp(score)" per hypothesis.
+  typedef std::vector<std::pair<float, std::size_t> > ScoreList;
+  typedef std::pair<float, std::vector<std::size_t> > Hypothesis;
+
+  int max_hyps=100;			// passed on to both helper functions
+  REAL *src=buf_input+ni*idim;		// input values of this example
+
+    // number of source words in front of the first NULL_WORD (at most iaux)
+  int src_len=0;
+  while (src_len<iaux && !(src[src_len] == NULL_WORD)) src_len++;
+
+  std::vector<ScoreList> candidates = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, max_hyps);
+  std::vector<Hypothesis> ranked = sort_ngrams(candidates, src_len, max_hyps);
+
+  std::vector<Hypothesis>::const_iterator hyp;
+  for (hyp=ranked.begin(); hyp!=ranked.end(); ++hyp) {
+      // source side, every word is followed by a blank
+    for (int w=0; w<src_len; w++)
+      fspt << sr_wlist->GetWordInfo(src[w]).word << " ";
+
+      // target side, every word is preceded by a blank
+    fspt << "|||";
+    const std::vector<std::size_t> &tgt_ids = hyp->second;
+    for (std::size_t w=0; w<tgt_ids.size(); ++w)
+      fspt << " " << tg_wlist->GetWordInfoMapped(tgt_ids[w]).word;
+
+      // score and end of line
+    fspt << " ||| " << exp(hyp->first) << "\n";
+  }
+
+}
+
+//**************************************************************************************
+// 
+#if 0	// disabled variant of GetMostLikelyTranslations(): it writes only the best scoring word of each output position
+void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int i;
+	  // Find most likely outputs; the source words in front of the first NULL_WORD are written first
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        
+        for (i=0; i<tg_nbphr; i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;	// stop at the end of the reference target phrase
+  tgrad.disp(" including ");	// NOTE(review): tgrad is not declared in this function (it is a local Timer of Train()) - presumably paste residue
+  tgrad.disp(" including ");
+	    // find max of current word, i.e. in the dim_per_phrase outputs of position i
+	  REAL *sptr=optr+i*dim_per_phrase, max=*sptr++; int max_idx=0;
+          for (int s=1; s<dim_per_phrase; s++, sptr++) {
+            if (*sptr>max) { max=*sptr; max_idx=s; }
+          }
+          fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] ";
+        }
+  fspt << endl;
+}
+#endif
+ 
+//**************************************************************************************
+// 
+
+REAL TrainerPhraseSlist1::TestDev(char *fname)
+{
+  if (!data_dev) return -1;
+
+  vector<string> src_phrase;	// interface with classical phrase tables
+  vector<string> tgt_phrase;
+  vector<bool> done_by_cstm;
+
+  ofstream fs;
+  if (fname) {
+    cout << " - dumping phrase probability stream to file '" << fname << "'" << endl;
+    fs.open(fname,ios::out);
+    CHECK_FILE(fs,fname);
+  }
+
+  char *ptfname = (char*) "alltrans.txt";
+  ofstream fspt;
+  cout << " - dumping new phrase table to file '" << ptfname << "'" << endl;
+  fspt.open(ptfname,ios::out);
+  CHECK_FILE(fspt,ptfname);
+
+  nb_ex=nb_ex_slist=nb_ex_short=0;
+  nb_tg_words=nb_tg_words_slist=0;
+  int nb_probs=0;	// this counts the number of cumulated log probs.
+			// This increments by only one for external phrase tables, independently of the target phrase length
+  REAL logP, log_sum=0;
+  REAL log_sum_cstm=0;	// only CSLM, i.e. considering phrases done by CSTM
+
+  uint idx;
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  errfct->SetTarget(gpu_target);	// we copy from buf_target to gpu_target
+  debug1(" - gpu_input %p\n", gpu_input);
+  debug1(" - gpu_target %p\n", gpu_target);
+#else
+  mach->SetDataIn(buf_input);
+  errfct->SetTarget(buf_target);
+#endif
+  errfct->SetOutput(mach->GetDataOut());
+
+  bool data_available;
+  data_dev->Rewind();
+  do {
+      // get a bunch of data
+    int n=0, i;
+    data_available = true;
+    debug0("start bunch\n");
+    done_by_cstm.clear();
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_dev->Next();
+      if (!data_available) break;
+
+      debug0("DEV DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_dev->input[i];
+        idx=n*idim + i;
+        debug1(" %d", inp);
+#if TODO // should we map input data ?
+        buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::TestDev(): input");       // map context words IDs
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+#else
+        buf_input[idx] = inp;
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist1::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_dev->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0(" - > mapped: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      int nb_words_not_null=0;
+      at_least_one_short=false;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_dev->target[i];
+        idx=n*tg_nbphr + i;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::TestDev(): output");
+        buf_target_ext[idx] = buf_target_wid[idx];		// keep target word ID for Moses phrase-table
+        if (outp==NULL_WORD) {
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;			// TODO: optimize: we should be able to stop the loop on "i"
+          debug1(" %d[NULL]",(int) buf_target_wid[idx]);
+        }
+        else {
+          nb_tg_words++;
+          nb_words_not_null++;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word, (int) buf_target_wid[idx], outp);
+	    //nbtgsl++;
+          }
+          else {
+	      // TODO: we actually don't need a forward pass for words in the short lists or short n-grams
+	      //       this could be used to save some time (5-10%)
+            buf_target_wid[idx] = tg_slist_len;
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            all_in_slist=false;
+          }
+        }
+      }
+      done_by_cstm.push_back(all_in_slist);
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nb_words_not_null;
+        //nb_tg_words_slist += nbtgsl;
+      }
+      if (!at_least_one_short) nb_ex_short++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch ef examples
+    debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+
+      // process the bunch by the neural network
+    if (n>0) {
+#ifdef BLAS_CUDA
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice);
+#endif
+      mach->Forw(n,false); 
+      log_sum_cstm += errfct->CalcValue(n);
+    }
+
+      // get probas from CSLM or back-off LM
+#ifdef BLAS_CUDA
+    cudaMemcpy(host_output, mach->GetDataOut(), n*odim*sizeof(REAL), cudaMemcpyDeviceToHost);
+    REAL *optr=host_output;
+    Error("TrainerPhraseSlist1::TestDev TODO CUDA");
+#else
+    REAL *optr=mach->GetDataOut();	// n times (tg_nbphr*tg_slen) = odim values
+#endif
+
+    debug1("Collect n=%d\n", n);
+    if (n!=(int) done_by_cstm.size())
+      Error("TrainerPhraseSlist1::TestDev(): internal error, number of phrases done by CSTM does not match");
+
+    REAL *ptr_input = buf_input;	// n times idim values
+    for (int ni=0; ni<n; ni++) {
+      int nb_tg=0; // for normalization
+      if (done_by_cstm[ni]) {
+          // get proba from CSTM (removed renorm)
+          
+#define DUMP_PHRASE_TABLE
+#ifdef DUMP_PHRASE_TABLE
+          // create output phrase table
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        for (i=0;i<tg_nbphr;i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+          fspt << tg_wlist->GetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " ";
+        }
+        fspt << "||| ";
+#endif
+
+        logP=0;
+        REAL *optr2=optr;
+        for (i=0; i<tg_nbphr; i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+          logP += safelog(optr2[buf_target_wid[i+ni*tg_nbphr]]); // no error check on indices necessary here
+          nb_tg++;
+#ifdef DUMP_PHRASE_TABLE2
+	  fspt << optr2[buf_target_wid[i+ni*tg_nbphr]] << " ";  
+#endif
+          optr2+=dim_per_phrase;
+        }
+#ifdef DUMP_PHRASE_TABLE
+        fspt << logP/nb_tg << endl;
+#endif
+
+#ifdef DUMP_PHRASE_TABLE_NBEST
+        GetMostLikelyTranslations(fspt,optr,ni);
+#endif
+
+        nb_probs+=i;
+        debug1(" CSLM: logP=%e\n", logP);
+      }
+      else {
+          // request proba from Moses phrase-table
+#if 1
+        debug0("create textual phrase pair for external phrase table (word + index)\n");
+        src_phrase.clear();
+        debug0("  source:");
+        for (i=0; i<iaux && ptr_input[i]!=NULL_WORD; i++) {
+          src_phrase.push_back(sr_wlist->GetWordInfo((uint) ptr_input[i]).word);	// TODO: char* to string
+          debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]);
+        }
+        tgt_phrase.clear();
+        debug0("  target:");
+        for (i=0; i<tg_nbphr && buf_target_ext[i+ni*tg_nbphr]!=NULL_WORD; i++) {
+          tgt_phrase.push_back(tg_wlist->GetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word);	// TODO: char* to string
+          debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]);
+        }
+#ifdef BACKWRAD_TM
+        logP = safelog(ptable.GetProb(tgt_phrase, src_phrase));
+#else
+        logP = safelog(ptable.GetProb(src_phrase, tgt_phrase));
+#endif
+        nb_probs++;
+        debug1("  => logP=%e\n",logP);
+#else
+        logP=1;
+#endif
+      }
+
+      log_sum += logP;
+      ptr_input += idim;  // next example in bunch at input
+      optr += odim;  // next example in bunch at output
+      if (fname) {
+        fs << ((nb_tg>0) ? logP/nb_tg : -1) << endl;
+      }
+    }
+
+    nb_ex += n;
+    debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex));
+  } while (data_available);
+
+  printf(" %d target words in %d phrases (%d=%.2f%% uncomplete), CSTM: %d target words in %d phrases (%.2f%%)\n",
+         nb_tg_words, nb_ex, 
+         nb_ex_short, 100.0*nb_ex_short/nb_ex,
+         nb_tg_words_slist, nb_ex_slist, 100.0*nb_ex_slist/nb_ex);
+
+ 
+  REAL px = (nb_probs>0) ? exp(-log_sum / (REAL) nb_probs) : -1;
+  printf("   cstm px=%.2f, ln_sum=%.2f, overall px=%.2f (%d values)\n",
+        (nb_tg_words_slist>0) ? exp(-log_sum_cstm / (REAL) nb_tg_words_slist) : -1, log_sum_cstm, px, nb_probs);
+
+  if (fname) fs.close();
+  fspt.close();
+
+  return px;
+}
+
+
+//**************************************************************************************
+// information after finishing an epoch
+
+//
+// Print a two-line summary of the counters accumulated during the epoch:
+//  - all phrases: number of target words and phrases, and the percentage of
+//    short source / target phrases (both relative to nb_ex_slist)
+//  - phrases handled by the CSTM: target words, phrases, their share of all
+//    phrases and the value of err_train
+//
+void TrainerPhraseSlist1::InfoPost ()
+{
+    // A percentage whose denominator is zero is reported as 0 instead of
+    // dividing by zero (which would print inf or nan).
+  const double pct_short_inp = (nb_ex_slist != 0) ? 100.0*nb_ex_short_inp/nb_ex_slist : 0.0;
+  const double pct_short_tgt = (nb_ex_slist != 0) ? 100.0*nb_ex_short/nb_ex_slist : 0.0;
+  const double pct_slist = (nb_ex != 0) ? 100.0*nb_ex_slist/nb_ex : 0.0;
+
+  printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n",
+	nb_tg_words, nb_ex, pct_short_inp, pct_short_tgt);
+  printf("   CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n",
+	nb_tg_words_slist, nb_ex_slist, pct_slist, err_train);
+}
+
+//**************************************************************************************
+// request one n-gram probability, usually the call will be delayed
+// and processed later
+
+
+//**************************************************************************************
+// collect all delayed probability requests
+
+
+//
+// Block-mode evaluation (used for n-best rescoring): forward the bunch stored
+// in buf_input through the network and attach one score to every request.
+//
+//  src_phrases  source phrase (list of words) of each example of the bunch;
+//               must contain exactly bs entries, indexed by AlignReq::bs
+//  areq         array of requests, only the entries req_beg .. req_end
+//               (both inclusive) are processed
+//  req_beg      index of the first request to collect
+//  req_end      index of the last request to collect
+//  bs           number of examples in the bunch; nothing is done if bs <= 0
+//  tm_pos       passed unchanged to Hypo::AddFeature() together with the score
+//
+// For each request the target word ids are mapped with tg_wlist->MapIndex().
+// When all of them are in the short list, the score is the average over the
+// target words of safelog() of the corresponding network output; otherwise
+// the value returned by the external phrase table is used.
+// The counters nb_forw, nb_ex, nb_ex_slist, nb_tg_words and nb_tg_words_slist
+// are updated and a one-line summary is printed.
+//
+void TrainerPhraseSlist1::ForwAndCollect(vector< vector<string> > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos)
+{
+  if (bs<=0) return;
+  debug3("TrainerPhraseSlist1::ForwAndCollect(): collecting outputs %d .. %d from bunch of size %d\n", req_beg, req_end, bs);
+  debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim);
+
+  if (bs != (int) src_phrases.size())
+    ErrorN("TrainerPhraseSlist1::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs);
+
+#ifdef DEBUG
+  printf("bunch of %d\n",bs);
+  for (int b=0; b<bs; b++) {
+    printf("%3d:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %.2f", buf_input[b*idim+ii]);
+    printf("\n");
+  }
+#endif
+
+  nb_forw++;
+    // All other GPU sections of this function are guarded by BLAS_CUDA; test
+    // it here as well so that the bunch is really uploaded in GPU builds
+    // (a build that only defines CUDA is still accepted).
+#if defined(BLAS_CUDA) || defined(CUDA)
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);
+  Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+#else
+  mach->SetDataIn(buf_input);
+#endif
+  mach->Forw(bs,false);
+
+#ifdef BLAS_CUDA
+    // bring the outputs back to the host and wait until the copy is finished
+  Gpu::MemcpyAsync(host_output, mach->GetDataOut(), bs*odim*sizeof(REAL), cudaMemcpyDeviceToHost);
+  Gpu::StreamSynchronize();
+#endif
+
+    // stats of this call
+  const int nb_req = req_end-req_beg+1;
+  int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0;
+
+  for (int n=req_beg; n<=req_end; n++) {
+    REAL logP=0;
+    int b=areq[n].bs;
+
+      // b selects an entry of src_phrases and a row of the output buffer
+    if (b<0 || b>=bs)
+      ErrorN("TrainerPhraseSlist1::ForwAndCollect(): bunch index %d of request %d is outside of the bunch of size %d", b, n, bs);
+
+    if ((int) areq[n].tgph.size() > tg_nbphr)
+      ErrorN("TrainerPhraseSlist1::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr);
+
+#ifdef DEBUG
+    printf("collect b=%3d \n input:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %f",buf_input[b*idim+ii]);
+    printf("\n");
+#endif
+
+      // map target words; the first tg_nbphr entries of buf_target_wid are
+      // used as scratch space for the current request.
+      // The loop stops at the end of the phrase (NULL_WORD) or right after
+      // the first word which is not in the short list.
+    debug0(" output:");
+    bool all_in_slist=true;
+    int tw;
+    for (tw=0; all_in_slist && tw<tg_nbphr; tw++) {
+      WordID outp = areq[n].tgwid[tw];
+      debug1(" %d",outp);
+      if (outp==NULL_WORD) break;
+      cnt_tg_words++;
+      buf_target_wid[tw] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::ForwAndCollect() output");
+      debug1("->%d",buf_target_wid[tw]);
+      all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]);
+    }
+      // fill up the remaining positions
+    for (; tw<tg_nbphr; tw++) {
+      debug0(" fill");
+      buf_target_wid[tw]=NULL_WORD;
+    }
+    debug1("    slist=%d\n",all_in_slist);
+
+      // the odim outputs of example b
+#ifdef BLAS_CUDA
+    REAL *optr=host_output + b*odim;
+#else
+    REAL *optr=mach->GetDataOut() + b*odim;
+#endif
+
+    if (!all_in_slist) {
+        // get proba from external phrase table
+        // NOTE(review): the value is stored as returned by GetProb(), while the
+        // dev-set evaluation in this file wraps GetProb() into safelog() --
+        // confirm which scale is expected here
+      logP=ptable.GetProb(src_phrases[b], areq[n].tgph);
+      debug1(" ptable: logP=%f\n", logP);
+    }
+    else {
+        // get proba from CSLM: each target position owns dim_per_phrase
+        // consecutive outputs, indexed by the mapped word id
+      debug0(" -  in slist CSLM:");
+      logP=0; int cnt=0;
+      for (int pos=0; pos<tg_nbphr; pos++) {
+        if (buf_target_wid[pos] == NULL_WORD) break;
+        debug1(" %e", optr[buf_target_wid[pos]]);
+        logP += safelog(optr[buf_target_wid[pos]]);
+        optr+=dim_per_phrase;
+        cnt++;
+      }
+      if (cnt==0) Error("no target phrases when collecting output");
+      logP /= cnt; // TODO: is this normalization correct ?
+      debug1(" -> log avr=%f\n",logP);
+
+      cnt_ex_slist++;
+      cnt_tg_words_slist += cnt;
+    }
+
+        // store the score in the hypothesis
+    areq[n].hyp->AddFeature(logP,tm_pos);
+  } // for (n=...)
+
+    // ratios with a zero denominator are reported as 0 instead of dividing by zero
+  printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n",
+	 nb_req, cnt_tg_words, cnt_ex_slist,
+	 (nb_req>0) ? (float) 100.0* cnt_ex_slist / nb_req : (float) 0,
+	 (cnt_ex_slist>0) ? (float) cnt_tg_words_slist/cnt_ex_slist : (float) 0);
+  nb_ex += nb_req;
+  nb_ex_slist += cnt_ex_slist;
+  nb_tg_words_slist += cnt_tg_words_slist;
+  nb_tg_words += cnt_tg_words;
+}
+
+
+//
+// Print statistics on block mode: number of forward passes and the number
+// and percentage of phrases which were scored by the CSTM.
+//
+void TrainerPhraseSlist1::BlockStats() {
+   //printf(" - %d phrase probability requests, %d=%5.2f short phrase %d forward passes (avrg of %d probas), %d=%5.2f%% predicted by CSTM\n",
+	//nb_ngram, nb_ex_short, 100.0*nb_ex_short/nb_ngram, nb_forw, nb_ngram/nb_forw, nb_ex_slist, 100.0*nb_ex_slist/nb_ngram);
+
+     // report 0% when no phrase was processed yet instead of dividing by zero
+   const double pct_slist = (nb_ex != 0) ? 100.0 * nb_ex_slist/nb_ex : 0.0;
+   printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n",
+	nb_forw, nb_ex_slist, pct_slist);
+}
diff --git a/TrainerPhraseSlist1.h b/TrainerPhraseSlist1.h
new file mode 100644
index 0000000..dad0a95
--- /dev/null
+++ b/TrainerPhraseSlist1.h
@@ -0,0 +1,105 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _TrainerPhraseSlist1_h
+#define _TrainerPhraseSlist1_h
+
+#include <ostream>
+#include "Tools.h"
+#include "Mach.h"
+#include "ErrFct.h"
+#include "DataPhraseBin.h"
+#include "Trainer.h"
+#include "WordList.h"
+
+#include "PtableMosesPtree.h"
+#include "AlignReq.h"
+
+//
+// Class to train neural networks to predict phrase probabilities
+//  - we use a short list of target words for which the NN predicts the proba
+//  - the probas of the other target words are obtained from a classical Moses phrase table
+//  - the NN also predicts the proba mass of ALL the words not in the short list;
+//    for this we use the last output neuron of the network
+
+
+//
+// helper class to store and compare one ngram LM request
+// ugly C-style structure, but this seems to be more efficient
+
+class TrainerPhraseSlist1 : public Trainer
+{
+private:
+  int		max_inp_idx;		// largest index -1 of a word at the input (# of entries in projection table)
+  int		tg_nbphr;		// number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr
+  int		dim_per_phrase;		// output dimension of each phrase prediction layer (must be equal size)
+  WordID	tg_slist_len;		// length of short list (this is set to dim_per_phrase MINUS ONE)
+  WordList	*sr_wlist;		// source word list
+  WordList	*tg_wlist;		// target word list, also used to map word ids and to test short-list membership
+  vector<Mach*> phrase_mach;	// pointer to the output machine for each phrase
+
+    // classical phrase table
+  PtableMosesPtree	ptable;
+
+    // various stats
+  int		nb_ex_slist;		// total number of examples processed in short list
+  int		nb_ex_short_inp;	// total number of incomplete input phrases
+  int		nb_ex_short;		// total number of incomplete target phrases
+  int		nb_tg_words;		// total number of target words (there can be several target words for a phrase pair)
+  int		nb_tg_words_slist;	// total number of target words which are in short list
+// TODO: use WordID vector for targets in order to make less casts
+  WordID	*buf_target_wid;	// used instead of buf_target to avoid casts between REAL and WordID
+					// size is odim x bsize
+  WordID	*buf_target_ext;	// similar to buf_target_wid[], but keeps even word ids outside of the short list
+					// needed to request probas from external phrase table
+#ifdef DEBUG
+  vector<char*>  words;		// give UTF8 word for a given CSLM internal index
+#endif
+  REAL DoTestDev(char*, bool);	// internal helper function
+  void DoConstructorWork();	// internal helper function for the various constructors
+    // data and functions for block processing
+  int	nb_forw;		// stats on total number of forward passes
+  void GetMostLikelyTranslations(ofstream&,REAL*,int);
+protected:
+  virtual void InfoPost();			// dump information after finishing a training epoch
+public:
+  TrainerPhraseSlist1(Mach*, Lrate*, ErrFct*,	// mach, lrate, errfct
+	  const char*, const char*, const char*, int,	// train, dev, external phrase table, number of scores
+	  REAL =0, int =10, int =0);			// wdecay, max epochs, current epoch
+  TrainerPhraseSlist1(Mach*, ErrFct*, Data*,	// for testing only: mach, errfct, binary data
+	  char*, int);				// external phrase table, number of scores
+  TrainerPhraseSlist1(Mach*, WordList*, WordList*,	// for general proba calculation: mach, src word list, tgt word list
+	  char*, int , char*);			// external phrase table, number of scores, score specif
+  virtual ~TrainerPhraseSlist1();
+  virtual REAL Train();				// train for one epoch
+  virtual REAL TestDev(char* =NULL);		// test current network on dev data and save outputs into file
+    // fast block evaluation functions
+    // StoreInput(): store value val as input d of example b of the bunch.
+    // The examples are stored one after the other with a stride of idim values,
+    // this is the layout read by ForwAndCollect() (buf_input[b*idim+d]);
+    // a stride of bsize would misplace the values whenever bsize != idim
+  virtual void StoreInput(int b, int d, REAL val) {buf_input[b*idim+d]=val;}
+    // ForwAndCollect(): source phrases, requests, first and last request, bunch size, feature position
+  virtual void ForwAndCollect(vector< vector<string> > &, AlignReq*, int,int,int,int);	// for nbest rescoring
+  virtual void BlockStats();				// display some stats on Block mode
+    // interface functions
+  virtual int GetTgtNbPhr() {return tg_nbphr; }		// number of phrases at output
+  virtual int GetSlistLen() {return tg_slist_len; }	// length of the short list
+  virtual REAL *GetBufInput() {return buf_input; }	// direct access to the input buffer
+};
+
+#endif
diff --git a/docs/Descritpion-of-features.txt b/docs/Description-of-features.txt
similarity index 100%
rename from docs/Descritpion-of-features.txt
rename to docs/Description-of-features.txt
diff --git a/sort.cpp b/sort.cpp
new file mode 100644
index 0000000..2965593
--- /dev/null
+++ b/sort.cpp
@@ -0,0 +1,82 @@
+#include<vector>
+#include<math.h>
+#include <iostream>
+#include <algorithm>
+
+// Length penalty in the log domain (simple exponential decay): log(0.8) for
+// every word by which the output length differs from the input length.
+// Equal lengths give no penalty.
+// The absolute difference is built by subtracting the smaller from the larger
+// value: both lengths are unsigned, so a direct subtraction would wrap around
+// instead of becoming negative, and abs() has no overload for unsigned types.
+REAL weight_lengths(std::size_t input_length, std::size_t output_length) {
+    const std::size_t length_diff = (input_length > output_length)
+                                  ? input_length - output_length
+                                  : output_length - input_length;
+    return log(0.8) * length_diff;
+}
+
+
+// Change the data structure of the scores and prune them.
+//
+//  scores      maxLength blocks of vocab_size values each, block i holds the
+//              scores of all the words for position i
+//  maxLength   number of positions (blocks) in scores
+//  vocab_size  number of values per position
+//  Nbest       maximal number of candidates to keep per position
+//
+// Returns one vector per position which contains the (score, word index)
+// pairs of the min(Nbest, vocab_size) highest scores. The kept pairs are NOT
+// sorted among themselves (std::nth_element only partitions).
+std::vector<std::vector<std::pair<REAL, std::size_t> > > prepare_hypotheses(REAL* scores, std::size_t maxLength, std::size_t vocab_size, std::size_t Nbest) {
+
+    // outermost vector: one item per length
+    std::vector<std::vector<std::pair<REAL, std::size_t> > > ret;
+    ret.reserve(maxLength);
+
+    // for each length
+    for(std::size_t i = 0; i < maxLength; ++i){
+        const REAL* row = scores + i*vocab_size;  // scores of position i
+        std::vector<std::pair<REAL, std::size_t> > vec (vocab_size);
+
+        // for each word in the vocabulary: store probability and index
+        for(std::size_t idx = 0; idx < vocab_size; ++idx){
+            vec[idx] = std::make_pair(row[idx],idx);
+        }
+
+        // prune to N most probable members; the count is limited BEFORE the
+        // iterator is built, since begin()+Nbest may not point behind end()
+        const std::size_t keep = std::min(Nbest,vec.size());
+        std::nth_element(vec.begin(), vec.begin()+keep, vec.end(), std::greater<std::pair<REAL, std::size_t> >());
+        vec.resize(keep);
+
+        ret.push_back(vec);
+    }
+    return ret;
+}
+
+// Build the Nbest most likely word sequences from per-position candidates.
+//
+//  scores        one vector per position with (probability, word index)
+//                pairs, e.g. the result of prepare_hypotheses()
+//  input_length  length of the input, only used for the length penalty
+//  Nbest         number of hypotheses kept per length and in the result
+//
+// Hypotheses of length i+1 are obtained by appending each candidate of
+// position i to each hypothesis kept for length i; their score is the sum of
+// the log of the probabilities. For the comparison between different lengths
+// the score is normalized: (score + weight_lengths(input_length, length)) / length.
+// Returns at most Nbest (normalized score, word indices) pairs over all
+// lengths, sorted by decreasing score.
+std::vector<std::pair<REAL, std::vector<std::size_t> > > sort_ngrams(std::vector<std::vector<std::pair<REAL, std::size_t> > > scores, std::size_t input_length, std::size_t Nbest) {
+
+    typedef std::pair<REAL, std::vector<std::size_t> > Hypothesis;
+
+    //stack of hypotheses for building next greater length,
+    //starts with the empty hypothesis of score 0
+    std::vector<Hypothesis> seed;
+    seed.push_back(Hypothesis(0, std::vector<std::size_t>()));
+
+    std::vector<Hypothesis> ret;
+
+    // for each n-gram length
+    for(std::size_t i = 0; i < scores.size(); ++i){
+
+        std::vector<Hypothesis> scores_current;
+        scores_current.reserve(scores[i].size() * seed.size());
+
+        //for each word in vocab (already pruned in prepare_hypotheses)
+        for(std::size_t j = 0; j < scores[i].size(); ++j){
+
+            //for each hypothesis we kept from (n-gram-length-1)
+            for(std::size_t k = 0; k < seed.size(); ++k){
+
+                std::vector<std::size_t> words (seed[k].second);
+                words.push_back(scores[i][j].second);
+
+                scores_current.push_back(std::make_pair(seed[k].first + log(scores[i][j].first), words));
+            }
+        }
+
+        //we only need Nbest hypotheses; the count is limited BEFORE the
+        //iterator is built, since begin()+Nbest may not point behind end()
+        const std::size_t keep = std::min(Nbest,scores_current.size());
+        std::nth_element(scores_current.begin(), scores_current.begin()+keep, scores_current.end(), std::greater<Hypothesis>());
+        seed.resize(keep);
+
+        REAL length_penalty = weight_lengths(input_length,i+1);
+        for(std::size_t j = 0; j < keep; ++j) {
+            ret.push_back(std::make_pair((scores_current[j].first+length_penalty)/(i+1), scores_current[j].second)); // normalized by length
+            seed[j] = scores_current[j]; // unnormalized; used to generate longer hypotheses
+        }
+
+    }
+
+    // compare n-grams of different lengths and return Nbest
+    std::sort(ret.begin(), ret.end(), std::greater<Hypothesis>());
+    ret.resize(std::min(ret.size(),Nbest));
+
+    return ret;
+}
+