From 0aff529002b615348bdf699515a41eae9c91a43b Mon Sep 17 00:00:00 2001
From: Holger Schwenk <Holger.Schwenk@lium.univ-lemans.fr>
Date: Thu, 9 Jul 2015 04:13:42 +0200
Subject: [PATCH] added missing files

---
 AlignReq.h                                    |   40 +
 Gpu.cu                                        | 1799 +++++++++++++++++
 KENLM                                         |    0
 NBest.cpp                                     |  585 ++++++
 NBest.h                                       |   73 +
 NbestCSTM.cpp                                 |  123 ++
 NbestCSTM.h                                   |   51 +
 Ptable.h                                      |   49 +
 PtableMosesPtree.cpp                          |  194 ++
 PtableMosesPtree.h                            |   77 +
 TrainerPhraseSlist.cpp                        | 1164 +++++++++++
 TrainerPhraseSlist.h                          |  114 ++
 TrainerPhraseSlist1.cpp                       |  951 +++++++++
 TrainerPhraseSlist1.h                         |  105 +
 ...atures.txt => Description-of-features.txt} |    0
 sort.cpp                                      |   82 +
 16 files changed, 5407 insertions(+)
 create mode 100644 AlignReq.h
 create mode 100644 Gpu.cu
 delete mode 100644 KENLM
 create mode 100644 NBest.cpp
 create mode 100644 NBest.h
 create mode 100644 NbestCSTM.cpp
 create mode 100644 NbestCSTM.h
 create mode 100644 Ptable.h
 create mode 100644 PtableMosesPtree.cpp
 create mode 100644 PtableMosesPtree.h
 create mode 100644 TrainerPhraseSlist.cpp
 create mode 100644 TrainerPhraseSlist.h
 create mode 100644 TrainerPhraseSlist1.cpp
 create mode 100644 TrainerPhraseSlist1.h
 rename docs/{Descritpion-of-features.txt => Description-of-features.txt} (100%)
 create mode 100644 sort.cpp
diff --git a/AlignReq.h b/AlignReq.h
new file mode 100644
index 0000000..ea50686
--- /dev/null
+++ b/AlignReq.h
@@ -0,0 +1,40 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _ALIGNREQ_H_
+#define _ALIGNREQ_H_
+
+using namespace std;
+
+#include <vector>
+#include "Hypo.h"
+
+struct AlignReq {	// one alignment request attached to a hypothesis
+  int sb, se;		// requested alignment; word indices are sufficient since the source is constant for all hyps
+  vector<string> tgph;	// target phrase
+  WordID tgwid[16];	// mapped target word IDs; TODO: this is a hack, the same target phrase is mapped many times
+  Hypo *hyp;		// corresponding hypothesis
+  int bs;		// index into the bunch that will be processed by the NN
+  float *logP;	 	// log probability (may be several scores)
+};
+
+#endif
diff --git a/Gpu.cu b/Gpu.cu
new file mode 100644
index 0000000..673197b
--- /dev/null
+++ b/Gpu.cu
@@ -0,0 +1,1799 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+using namespace std;
+#include <algorithm>
+#include <map>
+#include <sstream>
+#include <signal.h>
+#define RAISE raise(SIGINT);
+
+typedef float REAL;
+#define NULL_WORD (-1)		// from WordList.h
+#define LOG_PROBA_NONE 999	// from ErrFact.h
+#define LOCK_FNAME "/tmp/gpu_lock.pid%d.gpu%d"
+#define LOCK_FNAME_LEN 256	// Hack ;-)
+
+#include <npps.h>
+#include <cublas.h>
+#include <cuda_runtime_api.h>
+#include <nppcore.h>
+#include "nvml.h"
+#include "Gpu.cuh"
+#include "Tools.h" //For Error()
+
+
+// global variables and definitions of the static members of Gpu
+curandGenerator_t cuda_gen;	// random generator, created in Gpu::Init()
+string cuda_user_list;	// user specified list of GPUs: ":n:m" names devices, otherwise it is read as a device count (see Gpu::Init())
+static REAL *gpu_result;  // one REAL on the GPU receiving scalar kernel results, allocated on first use
+#define GPU_BUF_DIM 65536 // dimension used to allocate and to check gpu_buf
+static REAL *gpu_buf;  // internal buffer on the GPU, allocated in Gpu::Init()
+
+size_t Gpu::curDevIndex  = (size_t)-1;   ///< current device index
+size_t Gpu::curConfIndex = (size_t)-1;   ///< current configuration index
+cudaStream_t Gpu::curStream = NULL;      ///< current stream
+bool Gpu::useConcurrentStreams = false;  ///< status of concurrent streams
+#ifdef GPU_CUBLAS_V2
+cublasHandle_t Gpu::curCbHandle = NULL;  ///< current Cublas handle
+#endif
+cudaDeviceProp* Gpu::curDevProps = NULL; ///< device properties
+vector<Gpu::Device> Gpu::vDevices; ///< vector of Gpu devices to be used
+vector<Gpu::Config> Gpu::vConfigs; ///< vector of Gpu configurations
+
+void HandlerSigTERM(int /*s*/)  // signal handler installed by Gpu::Init(): removes the lock-files and terminates
+{  // NOTE(review): printf(), Gpu::Unlock() and exit() are not async-signal-safe; kept as best-effort cleanup
+  printf("Caught signal: removing lock-files\n");
+  Gpu::Unlock();
+  exit(1);
+}
+
+/**
+ * initializes Cuda (device selection, cublas, random generator, internal buffer) and creates lock files
+ * @note selects first device and default stream; does nothing when a configuration already exists
+ * @returns configuration index 0
+ */
+size_t Gpu::Init()
+{
+  size_t stId = 0;
+  if (0 >= Gpu::vConfigs.size()) {
+    Gpu::vConfigs.resize(1);
+
+    cout << "Initializing Nvidia GPU card" << endl;
+    int dev_max = 0;
+    cudaGetDeviceCount(&dev_max);
+    bool bSelAuto = (':' != cuda_user_list[0]); // ":n:m" names the devices, anything else is read as a device count
+    Gpu::Device dev;
+    if (0 < dev_max) {
+      if (1 == dev_max)
+        cout << " - found 1 card:" << endl;
+      else
+        cout << " - found " << dev_max << " cards:" << endl;
+      if (bSelAuto)
+        nvmlInit();
+      nvmlDevice_t nd;
+      nvmlUtilization_t nu;
+      multimap<uint,Gpu::Device> mSelDev; // devices ordered by increasing utilization
+      for (dev.number = 0 ; dev.number < dev_max ; dev.number++) {
+        cudaGetDeviceProperties(&dev.props, dev.number);
+        int nb_cores_per_multiprocessor = -1; // stays -1 for compute capabilities not listed below
+        if(dev.props.major == 1 && (dev.props.minor == 0||dev.props.minor == 1||dev.props.minor == 2||dev.props.minor == 3))
+            nb_cores_per_multiprocessor = 8;
+        else if(dev.props.major == 2 && dev.props.minor == 0)
+            nb_cores_per_multiprocessor = 32;
+        else if(dev.props.major == 2 && dev.props.minor == 1)
+            nb_cores_per_multiprocessor = 48;
+        else if(dev.props.major == 3 && (dev.props.minor == 0||dev.props.minor == 5))
+            nb_cores_per_multiprocessor = 192;
+
+        printf("    %d: %s with %d CPUs x %d threads running at %4.2f Ghz, %d MBytes of memory, use -arch=sm_%d%d",
+          dev.number, dev.props.name, dev.props.multiProcessorCount, nb_cores_per_multiprocessor,
+          dev.props.clockRate/1000000.0, (int) (dev.props.totalGlobalMem/1024/1024),
+          dev.props.major, dev.props.minor);
+        if (bSelAuto) {
+          if (   (nvmlDeviceGetHandleByIndex(dev.number, &nd) == NVML_SUCCESS)
+              && (nvmlDeviceGetUtilizationRates( nd    , &nu) == NVML_SUCCESS) )
+            printf(", utilization %u%%", nu.gpu);
+          else nu.gpu = ~0u; // query failed: sort the device last instead of using an uninitialized value
+          mSelDev.insert(make_pair(nu.gpu, dev));
+        }
+        printf("\n");
+      }
+      if (bSelAuto) { // select devices automatically: the requested number of least used ones
+        nvmlShutdown();
+        int iMaxDev = std::min(std::max(atoi(cuda_user_list.c_str()), 0), dev_max);
+        for (multimap<uint,Gpu::Device>::const_iterator mmci = mSelDev.begin() ; 0 < iMaxDev-- ; ++mmci)
+          Gpu::vDevices.push_back(mmci->second);
+      }
+    }
+
+    if (!bSelAuto) { // read devices specified by user
+      char c;
+      istringstream iss;
+      iss.str(cuda_user_list);
+      while (iss.good()) {
+        if (!(iss >> c >> dev.number)) break; // malformed entry: reported below, do not register a stale device
+        Gpu::vDevices.push_back(dev);
+        cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number);
+      }
+      if (iss.fail())
+        ErrorN("format error in the selection of CUDA devices \"%s\"", cuda_user_list.c_str() + 1);
+    }
+    size_t dev_sel = Gpu::vDevices.size();
+    switch (dev_sel) {
+      case 0: printf(" - no GPU device selected\n");
+              dev.number = 0;
+              Gpu::vDevices.push_back(dev);
+              dev_sel = 1;
+              cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number); // fall through: go on with device 0
+      case 1: printf(" - using device %d\n", Gpu::vDevices[0].number);
+              cudaSetDevice(Gpu::vDevices[0].number);
+              break;
+      default:
+        if (dev_sel > (size_t)dev_max) {
+          printf(" - requested more GPU devices than available, using %d first ones\n", dev_max);
+          dev_sel = dev_max;
+          Gpu::vDevices.resize(dev_sel);
+        }
+        printf(" - using %zu devices in parallel:", dev_sel);
+        for (size_t d = 0 ; d < dev_sel ; d++) {
+          int n = Gpu::vDevices[d].number;
+          printf(" %d", n);
+          if ((n < 0) || (n >= dev_max))
+            Error("illegal device identifier");
+        }
+        printf("\n");
+        cudaSetDevice(Gpu::vDevices[0].number);
+    }
+
+    // initialize cublas and random generator
+    cublasInit();
+    Gpu::CheckError("initialization of card\n");
+    curandCreateGenerator(&cuda_gen, CURAND_RNG_PSEUDO_DEFAULT);
+    // curandSetPseudoRandomGeneratorSeed(cuda_gen, CUDA_SEED);
+    Gpu::CheckError("initialization of random generator\n");
+
+    // allocate buffers; NOTE(review): Alloc() counts elements, so this reserves sizeof(REAL) times GPU_BUF_DIM values
+    gpu_buf = Gpu::Alloc(GPU_BUF_DIM*sizeof(REAL),"internal buffer on GPU");
+
+    // locking devices: one lock file per selected device, named after our pid and the device number
+    ofstream lfs;
+    char lfname[LOCK_FNAME_LEN] = LOCK_FNAME;
+    for (size_t d = 0 ; d < dev_sel ; d++) {
+      snprintf(lfname, sizeof(lfname), LOCK_FNAME, getpid(), Gpu::vDevices[d].number);
+      lfs.open(lfname,ios::out);
+      CHECK_FILE(lfs, lfname);
+      lfs << "Runing job " << getpid() << " on GPU " << Gpu::vDevices[d].number << endl;
+      lfs.close();
+    }
+
+    // catch signals to clean up lock-files
+    signal(SIGINT , HandlerSigTERM);
+    signal(SIGHUP , HandlerSigTERM);
+    signal(SIGFPE , HandlerSigTERM);
+    signal(SIGSEGV, HandlerSigTERM);
+    signal(SIGTERM, HandlerSigTERM);
+
+    // create default configuration: first device, default stream
+    Gpu::Config& newConfig = Gpu::vConfigs.back();
+    Gpu::curDevIndex = newConfig.devId = 0;
+    Gpu::curConfIndex = stId;
+    newConfig.stream = NULL;
+#ifdef GPU_CUBLAS_V2
+    cublasCreate(&newConfig.cbHandle);
+    Gpu::curCbHandle = newConfig.cbHandle;
+#endif
+    Gpu::curDevProps = &Gpu::vDevices[0].props;
+  }
+  return stId;
+}
+
+/**
+ * releases what Init() has set up: lock-files first, then streams, handles and all bookkeeping
+ */
+void Gpu::Unlock()
+{
+  // one lock-file per selected device, named after our pid and the device number
+  char lockPath[LOCK_FNAME_LEN] = LOCK_FNAME;
+  const size_t nDev = vDevices.size();
+  for (size_t d = 0 ; d < nDev ; ++d) {
+    sprintf(lockPath, LOCK_FNAME, getpid(), vDevices[d].number);
+    if (0 != unlink(lockPath))
+      cerr << " - ERROR: removing lock file " << lockPath << endl;
+  }
+
+  // forget the current device, configuration, stream and handle
+  curDevIndex = (size_t)-1;
+  curConfIndex = (size_t)-1;
+  curStream = NULL;
+  useConcurrentStreams = false;
+#ifdef GPU_CUBLAS_V2
+  curCbHandle = NULL;
+#endif
+  curDevProps = NULL;
+  vDevices.clear();
+  for (size_t c = 0 ; c < vConfigs.size() ; ++c) {  // destroy what each configuration owns
+    if (vConfigs[c].stream != NULL) cudaStreamDestroy(vConfigs[c].stream);
+#ifdef GPU_CUBLAS_V2
+    if (vConfigs[c].cbHandle != NULL)
+      cublasDestroy(vConfigs[c].cbHandle);
+#endif
+  }
+  vConfigs.clear();
+}
+
+
+/**
+ * appends a configuration bound to the device following the current one and makes it current
+ * @note delegates to Init() when no configuration exists yet
+ * @returns index of the new configuration
+ */
+size_t Gpu::NewConfig()
+{
+  const size_t newIdx = vConfigs.size();
+  if (0 == newIdx)
+    return Init();  // nothing has been set up so far
+
+  if (vDevices.size() <= (0.8 * (newIdx + 1)))
+    useConcurrentStreams = true;  // never switched off again here
+  vConfigs.resize(newIdx + 1);
+  Gpu::Config& cfg = vConfigs[newIdx];
+  cfg.devId = ((curDevIndex + 1) % vDevices.size());
+  cfg.stream = NULL;
+#ifdef GPU_CUBLAS_V2
+  cfg.cbHandle = NULL;
+#endif
+  ChangeConfig(newIdx);
+  return newIdx;
+}
+
+/**
+ * changes current configuration (device, and with GPU_CUBLAS_V2 also Cublas handle and stream)
+ * @param stCfg index of configuration to use; it is not checked against the number of configurations
+ */
+void Gpu::ChangeConfig(size_t stCfg)
+{
+  Gpu::curConfIndex = stCfg;
+  Gpu::Config& config = Gpu::vConfigs[Gpu::curConfIndex];
+  if (Gpu::curDevIndex != config.devId) { // switch the device only when it differs from the current one
+    Gpu::curDevIndex = config.devId;
+    cudaSetDevice(Gpu::vDevices[Gpu::curDevIndex].number);
+    Gpu::curDevProps = &Gpu::vDevices[Gpu::curDevIndex].props;
+  }
+#ifdef GPU_CUBLAS_V2
+  if (NULL == config.cbHandle) // the handle is created the first time the configuration is selected
+    cublasCreate(&config.cbHandle);
+  if (Gpu::useConcurrentStreams && (NULL == config.stream)) { // same for the stream, once concurrent streams are enabled
+    cudaStreamSynchronize(NULL); // wait for the default stream before creating the new one
+    cudaStreamCreate(&config.stream);
+    cublasSetStream(config.cbHandle, config.stream);
+  }
+  if (Gpu::curStream != config.stream) { // keep NPP on the same stream as the kernels
+    Gpu::curStream = config.stream;
+    nppSetStream(Gpu::curStream);
+  }
+  Gpu::curCbHandle = config.cbHandle;
+  debug4("Gpu::ChangeConfig cfg=%zu dev=%d str=%x cbh=%x\n", Gpu::curConfIndex, Gpu::vDevices[Gpu::curDevIndex].number, Gpu::curStream, Gpu::curCbHandle);
+#endif
+}
+
+/**
+ * makes a device current without any configuration, i.e. with the default stream
+ * @param stDevId device index (taken modulo the number of devices)
+ */
+void Gpu::SetDevice(size_t stDevId)
+{
+  curConfIndex = (size_t)-1;  // no configuration is selected afterwards
+  if (stDevId != curDevIndex) {
+    curDevIndex = (stDevId % vDevices.size());
+    curDevProps = &vDevices[curDevIndex].props;
+    cudaSetDevice(vDevices[curDevIndex].number);
+  }
+#ifdef GPU_CUBLAS_V2
+  if (curStream != NULL) {  // fall back to the default stream
+    curStream = NULL;
+    nppSetStream(curStream);
+  }
+  curCbHandle = NULL;
+#endif
+}
+
+/**
+ * allocates memory on Gpu and checks error
+ * @param dim number of elements (of CUDA_SIZE bytes each) to allocate
+ * @param msg description of the buffer, used in the error message
+ * @returns pointer to the memory on the Gpu, or NULL when dim is not positive
+ */
+REAL* Gpu::Alloc(int dim, const char* msg) {
+  if (dim <= 0)
+    return NULL;
+  void* gpu_mem = NULL; // stays NULL when cublasAlloc() fails without setting it
+  char err_msg[1024];
+  // bounded formatting since msg comes from the caller; multiply before dividing so the MB figure is not truncated early
+  snprintf(err_msg, sizeof(err_msg), "CUDA: can't allocate memory (%dMB) for %s", (int)((size_t)dim * sizeof(REAL) / 1024 / 1024), msg);
+  cublasAlloc(dim, CUDA_SIZE, &gpu_mem);
+#ifdef DEBUG
+  int dev = -1;
+  cudaGetDevice(&dev);
+  debug3("allocated %ld at %p on device %d\n",  dim * CUDA_SIZE, gpu_mem, dev);
+#endif
+  Gpu::CheckError(err_msg);
+  if (NULL == gpu_mem)
+    Error(err_msg);
+  return (CUDA*)gpu_mem;
+}
+
+/**
+ * reports the last error of the Cuda runtime, if there is one
+ * @param msg message to print in case of error
+ */
+void Gpu::CheckError(const char* msg) {
+  const cudaError_t lastErr = cudaGetLastError();
+  if (cudaSuccess == lastErr) return;  // nothing to report
+  ErrorN("CUDA: ERROR %d in %s: %s\n", cublasGetError(), msg, cudaGetErrorString(lastErr));
+}
+
+
+// Lower clamp of gpu_safelog(), corresponds to 2.0*numeric_limits<float>::min()
+__device__ REAL GPU_LOG_LOWER_BOUND = 2.35099e-38;
+__device__ REAL gpu_safelog(REAL x) { return log((x<GPU_LOG_LOWER_BOUND) ? GPU_LOG_LOWER_BOUND : x); }
+
+
+//-----------------------------------------------
+// forward pass for MachTab
+//-----------------------------------------------
+
+__global__
+void KernelMachTabForw(const int bsize, const int odim, REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_data_out)
+{
+  for (int ex=blockIdx.x ; ex<bsize ; ex+=gridDim.x) {  // blocks walk over the examples, threads over the odim values
+    const int word = (int) gpu_data_in[ex];  // the word index is stored as REAL
+    const int outBase = ex*odim;
+    const int tabBase = word*odim;  // not used when word is NULL_WORD
+    for (int k=threadIdx.x ; k<odim ; k+=blockDim.x) {
+      if (word!=NULL_WORD) gpu_data_out[outBase+k] = gpu_t[tabBase+k];
+      else                 gpu_data_out[outBase+k] = 0.0;
+    }
+  }
+}
+
+void Gpu::MachTabForw(const int bsize, const int odim,
+		    REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_data_out)
+{
+  const int blocks  = std::min(Gpu::curDevProps->maxGridSize[0], bsize);   // one block per example, up to the device limit
+  const int threads = std::min(Gpu::curDevProps->maxThreadsDim[0], odim);  // one thread per output value, up to the device limit
+  KernelMachTabForw<<<blocks, threads, 0, Gpu::curStream>>>(bsize, odim, gpu_data_in, gpu_t, gpu_data_out);
+}
+
+
+//-----------------------------------------------
+// backward pass for MachTab
+//-----------------------------------------------
+
+__global__
+void KernelMachTabBackw(const REAL lrate, const int bsize, const int odim,
+                        REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_grad_out)
+{
+  for (int ex=blockIdx.x; ex<bsize; ex+=gridDim.x) {
+    const int word = (int) gpu_data_in[ex];  // the word index is stored as REAL
+    if (word == NULL_WORD) continue;  // nothing to update for the NULL_WORD
+    const int tabBase = word*odim;
+    const int gradBase = ex*odim;
+    for (int k=threadIdx.x; k<odim; k+=blockDim.x)  // atomicAdd instead of += to avoid race conditions between threads
+      atomicAdd(gpu_t+k+tabBase, lrate * gpu_grad_out[k+gradBase]);
+  }
+}
+
+void Gpu::MachTabBackw(const REAL lrate, const int bsize, const int odim,
+                     REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_grad_out)
+{
+  const int blocks  = std::min(Gpu::curDevProps->maxGridSize[0], bsize);   // one block per example, up to the device limit
+  const int threads = std::min(Gpu::curDevProps->maxThreadsDim[0], odim);  // one thread per output value, up to the device limit
+  KernelMachTabBackw<<<blocks, threads, 0, Gpu::curStream>>>(lrate, bsize, odim, gpu_data_in, gpu_t, gpu_grad_out);
+}
+
+
+//-----------------------------------------------
+// Softmax normalization
+//-----------------------------------------------
+
+__global__ void KernelSoftmax(int M, int N,	// x and sm hold M rows of N values, addressed through the strides below
+			      const REAL * x, const int sx0, const int sx1,
+ 			      REAL * sm, const int sm_s0, const int sm_s1)
+{
+  extern __shared__ REAL buf[];	// one partial sum per thread, its size is given at launch
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {	// each block processes whole rows
+    REAL sum = 0;
+#pragma unroll 16
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sum += exp(x[blockIDX * sx0 + i * sx1]);	// the row maximum is not subtracted in this kernel
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // Reduction of the partial sums: trashes buf[1..warpsize], leaving the row sum in buf[0].
+    if (threadIdx.x < warpSize){
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize){
+                buf[threadIdx.x] += buf[i];	// fold everything beyond the first warp into buf[0..warpSize-1]
+      }
+      if (threadIdx.x < 16){
+                //reduce so that thread 0 has the sum of everything; NOTE(review): no synchronization between the steps, assumes a warp of 32 threads running in lock-step - confirm
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+#pragma unroll 16
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sm[blockIDX * sm_s0 + i * sm_s1] = exp(x[blockIDX * sx0 + i * sx1]) / row_sum;	// normalize the row
+    }
+    __syncthreads();	// buf is reused for the next row
+  }
+}
+void Gpu::MachSoftmaxForw(const int bsize, const int odim, REAL *gpu_data_out)	// softmax in place over the bsize rows of odim values
+{
+  if(0){
+    //This is the original code, which is known to work correctly in all cases,
+    //but it is slower (disabled, kept for reference).
+    nppsExp_32f_I(gpu_data_out, bsize*odim);
+
+    REAL sum, *optr=gpu_data_out;
+
+    for (int b=0; b<bsize; b++,optr+=odim) {
+      sum=Gpu::CublasSasum(odim,optr,1);  // exp(x) is always positive -> we can use the sum_i (ABS(x_i))
+      nppsMulC_32f_I(1.0/sum,optr,odim);
+    }
+    return;
+  }
+
+  //int warpSize = 32;
+//The following check needs access to the GPU properties.
+//In order not to do this access each time, the check is done in MachSoftmax.cpp
+//  if(warpSize != 32){
+//    Error("Gpu::MachSoftmaxForw suppose the warpSize is 32. If run with a GPU with other warpSize"
+//	  " like the current GPU, it will return wrong Results. You must update the reduction in KernelSoftmax");
+//  }
+  int n_blocks = std::min(bsize, 32 * 1024);
+  int n_threads = std::min(odim, 512);
+  int n_shared_bytes = n_threads * sizeof(REAL);	// one REAL per thread for the reduction
+  if (bsize > 0){
+    KernelSoftmax<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(
+                            bsize,
+                            odim,
+                            gpu_data_out,
+                            odim, //x.stride[0]
+                            1, //x.stride[1]
+                            gpu_data_out,
+                            odim, //sm.stride[0]
+                            1//sm.stride[1]
+                    );
+    cudaError_t err = cudaGetLastError();	// reports errors of the launch
+    if(cudaSuccess != err){
+      printf("KernelSoftmax: n_blockn=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+             n_blocks, n_threads, n_shared_bytes, odim);
+      Error(cudaGetErrorString(err));
+    }
+  }
+}
+
+//-----------------------------------------------
+// Softmax stable normalization
+//-----------------------------------------------
+
+__global__ void KernelSoftmaxStable(int M, int N,	// x and sm hold M rows of N values, addressed through the strides below
+                                     const REAL * x, const int sx0, const int sx1,
+                                     REAL * sm, const int sm_s0, const int sm_s1)
+{
+  extern __shared__ REAL buf[];	// one value per thread, used for the max and then for the sum
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {	// each block processes whole rows
+    REAL max_ = x[blockIDX * sx0 + threadIdx.x * sx1];	// reads element threadIdx.x without comparing it to N
+    for (int i = threadIdx.x + blockDim.x; i< N; i += blockDim.x) {
+      max_ = max(max_, x[blockIDX * sx0 + i * sx1]);
+    };
+    buf[threadIdx.x] = max_;
+    __syncthreads();
+
+    // First reduction: trashes buf[1..n_threads], leaving the row maximum in buf[0].
+    // The maximum is subtracted before exp() to stabilize the softmax
+    if (threadIdx.x < warpSize)
+    {
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize) {
+                buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
+      }
+      if (threadIdx.x < 16) {
+                //reduce so that thread 0 has the max of everything; NOTE(review): no synchronization between the steps, assumes a warp of 32 threads running in lock-step - confirm
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+16]);
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+8]);
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+4]);
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+2]);
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+1]);
+            }
+    }
+
+    __syncthreads();
+    REAL row_max = buf[0];
+    __syncthreads();	// everybody must have read the maximum before buf is overwritten
+    REAL sum = 0;
+    for(int i=threadIdx.x; i<N; i+=blockDim.x){
+      sum += exp(x[blockIDX * sx0 + i * sx1] - row_max);
+    };
+    buf[threadIdx.x] = sum; 
+    __syncthreads();
+
+    // Second reduction: trashes buf[1..N], leaving the sum of the exponentials in buf[0].
+    if (threadIdx.x < warpSize){
+      for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize){
+                buf[threadIdx.x] += buf[i];
+      }
+      if (threadIdx.x < 16){
+                //reduce so that thread 0 has the sum of everything (same scheme and same assumption as for the max)
+                if(threadIdx.x + 16 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16];
+                if(threadIdx.x + 8 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8];
+                if(threadIdx.x + 4 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4];
+                if(threadIdx.x + 2 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2];
+                if(threadIdx.x + 1 < N)
+                    buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1];
+      }
+    }
+    __syncthreads();
+    REAL row_sum = buf[0];
+    for (int i = threadIdx.x; i< N; i += blockDim.x){
+      sm[blockIDX * sm_s0 + i * sm_s1] = exp(x[blockIDX * sx0 + i * sx1] - row_max) / row_sum;	// normalize the row
+    }
+    __syncthreads();	// buf is reused for the next row
+  }
+}
+
+void Gpu::MachSoftmaxStableForw(const int bsize, const int odim, REAL *gpu_data_out)	// stable softmax in place over the bsize rows of odim values
+{
+  if(0){
+    Error("Not implemented!");
+    //This is the original code of the non-stable version, which is known to work correctly in all cases,
+    //but it is slower (disabled, kept for reference).
+    nppsExp_32f_I(gpu_data_out, bsize*odim);
+
+    REAL sum, *optr=gpu_data_out;
+
+    for (int b=0; b<bsize; b++,optr+=odim) {
+      sum=Gpu::CublasSasum(odim,optr,1);  // exp(x) is always positive -> we can use the sum_i (ABS(x_i))
+      nppsMulC_32f_I(1.0/sum,optr,odim);
+    }
+    return;
+  }
+  //int warpSize = 32;
+//The following check needs access to the GPU properties.
+//In order not to do this access each time, the check is done in MachSoftmaxStable.cpp
+//  if(warpSize != 32){
+//    Error("Gpu::MachSoftmaxStableForw suppose the warpSize is 32. If run with a GPU with other warpSize"
+//        " like the current GPU, it will return wrong Results. You must update the reduction in KernelSoftmaxStable");
+//  }
+  int n_blocks = std::min(bsize, 32 * 1024);
+  int n_threads = std::min(odim, 512);
+  int n_shared_bytes = n_threads * sizeof(REAL);	// one REAL per thread for the reductions
+  if (bsize > 0){
+    KernelSoftmaxStable<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(
+                            bsize,
+                            odim,
+                            gpu_data_out,
+                            odim, //x.stride[0]
+                            1, //x.stride[1]
+                            gpu_data_out,
+                            odim, //sm.stride[0]
+                            1//sm.stride[1]
+                    );
+    cudaError_t err = cudaGetLastError();	// reports errors of the launch
+    if(cudaSuccess != err){
+      printf("n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+             n_blocks, n_threads, n_shared_bytes, odim);
+      Error(cudaGetErrorString(err));
+    }
+  }
+}
+
+//-----------------------------------------------
+// Linear Rectifier units
+//-----------------------------------------------
+
+__global__
+void KernelLinRectifForw(const int n, REAL *gpu_data_out)
+{
+  // Clamps negative values to 0 in place. Each thread starts at its global index and advances by the total
+  // number of threads; the former start "tx * blockDim.x + bx * gridDim.x" left most elements untouched.
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for(int i = id; i < n; i += n_threads){
+    if (gpu_data_out[i]<0) gpu_data_out[i]=0;
+  }
+}
+
+void Gpu::LinRectifForw(const int n, REAL *gpu_data_out)
+{
+  if (n <= 0) return;  // nothing to do, and a launch without blocks is invalid
+  const int nb_block = std::min((n - 1) / 256 + 1, Gpu::curDevProps->maxGridSize[0]);  // rounded up (n / 256 was 0 for n < 256), capped by the device
+  KernelLinRectifForw<<<nb_block, std::min(n, 256), 0, Gpu::curStream>>>(n, gpu_data_out);
+}
+
+__global__
+void KernelLinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out)
+{
+  // Writes 0 where the output is negative and 1 elsewhere. Each thread starts at its global index and advances by
+  // the total number of threads; the former start "tx * blockDim.x + bx * gridDim.x" left most elements untouched.
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for(int i = id; i < n; i += n_threads){
+    if (gpu_data_out[i]<0) gpu_grad_out[i]=0; else gpu_grad_out[i]=1;
+  }
+}
+
+void Gpu::LinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out)
+{
+  if (n <= 0) return;  // nothing to do, and a launch without blocks is invalid
+  const int nb_block = std::min((n - 1) / 256 + 1, Gpu::curDevProps->maxGridSize[0]);  // rounded up (n / 256 was 0 for n < 256), capped by the device
+  KernelLinRectifBackw<<<nb_block, std::min(n, 256), 0, Gpu::curStream>>>(n, gpu_data_out, gpu_grad_out);
+}
+
+//-----------------------------------------------
+// Helper functions for drop-out
+//-----------------------------------------------
+
+__global__
+void KernelDropOut(const int n, REAL *gpu_vect, REAL *rand, REAL thresh)
+{
+  // Zeroes gpu_vect[i] wherever rand[i] is below thresh. Each thread starts at its global index and advances by
+  // the total number of threads; the former start "tx * blockDim.x + bx * gridDim.x" left most elements untouched.
+  const int n_threads = blockDim.x * gridDim.x;
+  const int id = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = id; i < n; i += n_threads) {
+    if (rand[i]<thresh) gpu_vect[i]=0.0;
+  }
+}
+
+void Gpu::DropOut(const int n, REAL *gpu_vect, REAL *rand, REAL thresh)
+{
+  if (n <= 0) return;  // nothing to do, and a launch without blocks is invalid
+  const int nb_block = std::min((n - 1) / 256 + 1, Gpu::curDevProps->maxGridSize[0]);  // rounded up (n / 256 was 0 for n < 256), capped by the device
+  KernelDropOut<<<nb_block, std::min(n, 256), 0, Gpu::curStream>>>(n, gpu_vect, rand, thresh);
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValue
+//-----------------------------------------------
+
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValue(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target,
+					     REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+  // every thread sums the safe log of the output at the target word over its share of the examples
+  REAL partial=0.0;
+  for (int ex=threadIdx.x ; ex<bsize ; ex+=blockDim.x)
+     partial += gpu_safelog(gpu_data_out[ex*odim + (uint) gpu_target[ex]]);
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+  if(threadIdx.x != 0) return;  // thread 0 alone adds up the partial sums of the others
+  for(int t=1 ; t<blockDim.x ; t++)
+    partial += buf[t];
+  atomicAdd(gpu_res, partial);
+}
+
+
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValue(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  if (NULL==gpu_result) cudaMalloc(&gpu_result,sizeof(REAL));  // accumulator on the GPU, created on first use
+  cudaMemsetAsync(gpu_result, 0.0, sizeof(REAL), Gpu::curStream); // the kernel atomicAdds its sum into it
+  const int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);  // a single block, one REAL of shared memory per thread
+  KernelErrFctSoftmCrossEntNgramCalcValue<<<1, n_threads, n_threads*sizeof(REAL), Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_result);
+  REAL host_sum;
+  cudaMemcpyAsync(&host_sum, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);  // the copy must be finished before the value is returned
+  return host_sum;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValueNull
+//-----------------------------------------------
+
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueNull(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target,
+					     REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+  REAL partial=0.0;  // sum over the examples handled by this thread
+  for (int ex=threadIdx.x ; ex<bsize ; ex+=blockDim.x) {
+     const int tgt = gpu_target[ex]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+     if (tgt == NULL_WORD) continue;  // examples with a NULL_WORD target do not contribute
+     partial += gpu_safelog(gpu_data_out[ex*odim + tgt]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+  if(threadIdx.x != 0) return;  // thread 0 alone adds up the partial sums of the others
+  for(int t=1 ; t<blockDim.x ; t++)
+    partial += buf[t];
+  atomicAdd(gpu_res, partial);
+}
+
+
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValueNull(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  if (NULL==gpu_result) cudaMalloc(&gpu_result,sizeof(REAL));  // accumulator on the GPU, created on first use
+  cudaMemsetAsync(gpu_result, 0.0, sizeof(REAL), Gpu::curStream); // the kernel atomicAdds its sum into it
+  const int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);  // a single block, one REAL of shared memory per thread
+  KernelErrFctSoftmCrossEntNgramCalcValueNull<<<1, n_threads, n_threads*sizeof(REAL), Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_result);
+  REAL host_sum;
+  cudaMemcpyAsync(&host_sum, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);  // the copy must be finished before the value is returned
+  return host_sum;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcValueBatch
+//-----------------------------------------------
+
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueBatch(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *tmp_buf)
+{
+  // writes one value per example into tmp_buf:
+  // the safe log of the output at the target word, or LOG_PROBA_NONE when the target is the NULL_WORD
+  for (int ex=threadIdx.x ; ex<bsize ; ex+=blockDim.x) {
+     const int tgt = gpu_target[ex]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+     tmp_buf[ex] = (tgt == NULL_WORD)
+                 ? LOG_PROBA_NONE
+                 : gpu_safelog(gpu_data_out[ex*odim + tgt]);
+  }
+}
+
+
+void Gpu::ErrFctSoftmCrossEntNgramCalcValueBatch(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *res_vect)
+{
+  if (bsize > GPU_BUF_DIM)  // the kernel stores one value per example in gpu_buf, so bsize (and not odim) has to fit
+    ErrorN("Gpu::ErrFctSoftmCrossEntNgramCalcValueBatch(): bsize (%d) is larger than internal buffer (%d)", bsize, GPU_BUF_DIM);
+  int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  KernelErrFctSoftmCrossEntNgramCalcValueBatch<<<1, n_threads, 0, Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_buf);
+  cudaMemcpyAsync(res_vect, gpu_buf, bsize*sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);  // res_vect receives bsize values
+  cudaStreamSynchronize(Gpu::curStream);  // the copy must be finished when we return
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcMax
+//-----------------------------------------------
+
+void Gpu::ErrFctSoftmCrossEntNgramCalcMax(const int eff_bsize, const int dim, REAL *output, REAL *target, REAL *res, int *pos)
+{ // placeholder: no GPU implementation yet, none of the arguments is read
+  Error("TODO: Gpu::ErrFctSoftmCrossEntNgramCalcMax()"); // the only action is to report the missing implementation
+}
+
+#if 0 // not used anymore, use CalcvalueBatch() instead
+// Disabled code, kept for reference only: value of a single example (index idx) of the batch.
+// The kernel stores -1 in *gpu_res for a negative target (NULL_WORD), otherwise the safe-log
+// of the output at the target index.
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *gpu_res)
+{
+  int tidx = (int) gpu_target[idx]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0!
+  if (tidx<0) // NULL_WORD 
+    *gpu_res=-1;
+  else
+    *gpu_res = gpu_safelog(gpu_data_out[idx*odim + tidx]);
+}
+
+
+// Host wrapper: launches the kernel with a single thread and waits for the scalar result.
+REAL Gpu::ErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target)
+{
+  REAL res;
+  if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL));
+  KernelErrFctSoftmCrossEntNgramCalcValueNth<<<1, 1, 1*sizeof(REAL), Gpu::curStream>>>(idx, odim, gpu_data_out, gpu_target, gpu_result);
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return res;
+}
+#endif
+
+
+//-----------------------------------------------
+// ErrFctSoftmClassCrossEntNgram::CalcWordClassError
+//-----------------------------------------------
+
+/**
+ * Sequential count of classification errors: for every example, the arg-max over its
+ * n_classes scores is compared with the target class; the number of mismatches is
+ * stored (as REAL) in *gpu_res.
+ * The loops do not depend on thread or block indices, so every launched thread walks the
+ * whole batch and writes the same value.
+ */
+__global__
+void KernelErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target,
+                                 REAL *gpu_res)
+{
+  int n_wrong = 0;
+  REAL *scores = gpu_class_out;		// scores of the current example
+  REAL *wanted = gpu_class_target;	// target class of the current example
+  for (int ex = 0; ex < bsize; ex++, scores += n_classes, wanted++) {
+    // arg-max; the strict comparison keeps the first maximum on ties
+    int best = 0;
+    REAL best_score = scores[0];
+    for (int c = 1; c < n_classes; c++) {
+      const REAL score_c = scores[c];
+      if (score_c > best_score) {
+        best_score = score_c;
+        best = c;
+      }
+    }
+    if ((int) *wanted != best)
+      n_wrong++;
+  }
+  *gpu_res = (REAL) n_wrong;
+}
+
+/**
+ * Parallel count of classification errors: for every example the arg-max over its n_classes
+ * scores is compared with the target class; the number of mismatches is stored in *gpu_res.
+ *
+ * Thread t handles the examples t, t+blockDim.x, ...; indexing does not use blockIdx, so each
+ * launched block covers the whole batch and writes the same result.
+ * Needs blockDim.x REALs of dynamic shared memory.
+ *
+ * The reduction is bounded by the number of partial sums (blockDim.x), which is unrelated
+ * to n_classes, and every step is separated by a block barrier instead of relying on
+ * implicit lock-step execution inside a warp.
+ */
+__global__ void KernelErrFctSoftmClassError2(const int bsize, const int n_classes,
+    REAL *gpu_class_out, REAL *gpu_class_target, REAL *gpu_res)
+{
+  extern __shared__ REAL buf[];
+
+  // per-thread error count
+  REAL n_err = 0;
+  for (int i = threadIdx.x; i < bsize; i += blockDim.x) {
+    int argmax = 0;
+    REAL max_oclass = gpu_class_out[i*n_classes];
+    for (int j = 1; j < n_classes; j++) {
+      REAL oclass_j = gpu_class_out[i*n_classes + j];
+      if (oclass_j > max_oclass) {
+        argmax = j;
+        max_oclass = oclass_j;
+      }
+    }
+    if ((int) gpu_class_target[i] != argmax)
+      n_err += 1;
+  }
+  buf[threadIdx.x] = n_err;
+  __syncthreads();
+
+  // Step 1: fold all partial sums above the first warp into buf[0..warpSize-1]
+  if (threadIdx.x < warpSize) {
+    for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize) {
+      buf[threadIdx.x] += buf[i];
+    }
+  }
+  __syncthreads();
+
+  // Step 2: pairwise reduction of the remaining entries into buf[0]; works for any count.
+  // The loop bounds only depend on blockDim.x, so all threads reach every barrier.
+  int n = ((int) blockDim.x < warpSize) ? (int) blockDim.x : warpSize;
+  while (n > 1) {
+    const int half = (n + 1) / 2;
+    if ((int) threadIdx.x + half < n)
+      buf[threadIdx.x] += buf[threadIdx.x + half];
+    __syncthreads();
+    n = half;
+  }
+
+  if (threadIdx.x == 0)
+    *gpu_res = buf[0];
+}
+
+/**
+ * Host wrapper around KernelErrFctSoftmClassError2: returns the number of examples of the
+ * batch whose arg-max class differs from the target class. Blocks until the result is
+ * available on the host.
+ *
+ * @param bsize            number of examples in the batch
+ * @param n_classes        number of class scores per example
+ * @param gpu_class_out    device pointer, bsize x n_classes scores
+ * @param gpu_class_target device pointer, one target class per example
+ * @return number of misclassified examples (0 for an empty batch)
+ */
+REAL Gpu::ErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target)
+{
+  // an empty batch has no errors; it would also lead to a launch with zero threads
+  if (bsize <= 0) return 0;
+
+  REAL res;
+  if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL));
+  int n_threads = std::min(bsize, 512);
+  int n_shared_bytes = n_threads * sizeof(REAL);
+  // A single block is launched: the kernel strides over the whole batch with blockDim.x and
+  // writes one scalar, so additional blocks would only repeat the same work and race on
+  // the result.
+  KernelErrFctSoftmClassError2<<<1, n_threads, n_shared_bytes, Gpu::curStream>>>(
+      bsize, n_classes, gpu_class_out, gpu_class_target, gpu_result);
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return res;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGrad
+//-----------------------------------------------
+/**
+ * @note Gradient of the softmax cross-entropy for n-gram targets; gpu_grad is modified in place.
+ * The work is split over the blocks of the grid:
+ *  - block 0 sets grad = 1 - grad at the target position of every example and reduces the
+ *    safe-log of the outputs at the targets into *gpu_res
+ *    (needs blockDim.x REALs of dynamic shared memory);
+ *  - every other block negates the non-target entries of the rows
+ *    blockIdx.x-1, blockIdx.x-1 + (gridDim.x-1), ...
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGrad(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+                                            REAL *gpu_res)
+{
+  if (blockIdx.x != 0) {
+    // worker blocks: flip the sign of everything but the target entry
+    for (int row=blockIdx.x-1; row<bsize; row+=gridDim.x-1) {
+      const unsigned int target=(uint) gpu_target[row];
+      for (int col=threadIdx.x; col<odim; col+=blockDim.x) {
+        if (target == (uint)col)
+          continue;	// the target entry belongs to block 0
+        gpu_grad[row*odim + col] *= -1.0f;
+      }
+    }
+    return;
+  }
+
+  // block 0: target entries and error value
+  extern __shared__ REAL buf[];
+  REAL partial=0.0;
+  for (int row=threadIdx.x; row<bsize; row+=blockDim.x) {
+    const unsigned int target=(uint) gpu_target[row];
+    const unsigned int pos=row*odim + target;
+    gpu_grad[pos] = (1.0f - gpu_grad[pos]);
+    partial += gpu_safelog(gpu_data_out[pos]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all other threads to its own
+  if (threadIdx.x == 0) {
+    for (int t=1; t<blockDim.x; t++)
+      partial += buf[t];
+    *gpu_res=partial;
+  }
+}
+
+/**
+ * Host wrapper around KernelErrFctSoftmCrossEntNgramCalcGrad.
+ * Copies the outputs into gpu_grad, then lets the kernel turn that copy into the gradient
+ * and write the error value to *gpu_res. Everything is queued on Gpu::curStream; the
+ * function does not wait for completion, it only checks for a launch error.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         number of outputs per example
+ * @param gpu_data_out device pointer, bsize x odim outputs
+ * @param gpu_grad     device pointer, receives bsize x odim gradient values
+ * @param gpu_target   device pointer, one target index per example
+ * @param gpu_res      pointer written by the kernel (scalar error value)
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcGrad(const int bsize, const int odim, REAL *gpu_data_out,
+                                         REAL *gpu_grad, REAL *gpu_target, REAL * gpu_res)
+{
+  // the kernel works in place on gpu_grad, which therefore starts as a copy of the outputs
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  // one block for the reduction plus, device limits permitting, one block per example
+  const int grid_size = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1);
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  const int shared_bytes = block_size * sizeof(REAL);
+  KernelErrFctSoftmCrossEntNgramCalcGrad<<<grid_size, block_size, shared_bytes, Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res);
+
+  const cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGrad: %s", cudaGetErrorString(status));
+  }
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGradNull
+//-----------------------------------------------
+/**
+ * @note Gradient of the softmax cross-entropy for n-gram targets that may be NULL_WORD;
+ * gpu_grad is modified in place. Examples with a NULL_WORD target contribute nothing to
+ * the error and get an all-zero gradient row.
+ * The work is split over the blocks of the grid:
+ *  - block 0 sets grad = 1 - grad at the target position of every non-NULL example and
+ *    reduces the error into *gpu_res (needs blockDim.x REALs of dynamic shared memory);
+ *  - every other block processes the remaining entries of the rows
+ *    blockIdx.x-1, blockIdx.x-1 + (gridDim.x-1), ...
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGradNull(const int bsize, const int odim,
+     REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+                                            REAL *gpu_res)
+{
+  if (blockIdx.x != 0) {
+    // worker blocks: zero the rows of NULL targets, negate the non-target entries otherwise
+    for (int row=blockIdx.x-1; row<bsize; row+=gridDim.x-1) {
+      const int target = gpu_target[row];
+      for (int col=threadIdx.x; col<odim; col+=blockDim.x) {
+        if (target == NULL_WORD)
+          gpu_grad[row*odim + col] = 0;
+        else if (col != target)
+          gpu_grad[row*odim + col] *= -1.0f;
+      }
+    }
+    return;
+  }
+
+  // block 0: target entries and error value
+  extern __shared__ REAL buf[];
+  REAL partial=0.0;
+  for (int row=threadIdx.x; row<bsize; row+=blockDim.x) {
+    // The target index must stay a signed int: according to the original author, nvcc
+    // turns -1 into 0 when the value is cast to (or stored in) an unsigned type.
+    int target = gpu_target[row];
+    debug5(" -batch=%d target=%d -> output at %p is %f, update grad at %p\n", row, target, &(gpu_data_out[row*odim + target]), gpu_data_out[row*odim + target], &(gpu_grad[row*odim+target]));
+    if (target == NULL_WORD)
+      continue;
+    gpu_grad[row*odim + target] = (1.0f - gpu_grad[row*odim + target]);
+    partial += gpu_safelog(gpu_data_out[row*odim + target]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all other threads to its own
+  if (threadIdx.x == 0) {
+    for (int t=1; t<blockDim.x; t++)
+      partial += buf[t];
+    *gpu_res=partial;
+  }
+}
+
+/**
+ * Host wrapper around KernelErrFctSoftmCrossEntNgramCalcGradNull (targets may be NULL_WORD).
+ * Copies the outputs into gpu_grad, then lets the kernel turn that copy into the gradient
+ * and write the error value to *gpu_res. Everything is queued on Gpu::curStream; the
+ * function does not wait for completion, it only checks for a launch error.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         number of outputs per example
+ * @param gpu_data_out device pointer, bsize x odim outputs
+ * @param gpu_grad     device pointer, receives bsize x odim gradient values
+ * @param gpu_target   device pointer, one target index per example
+ * @param gpu_res      pointer written by the kernel (scalar error value)
+ */
+void Gpu::ErrFctSoftmCrossEntNgramCalcGradNull(const int bsize, const int odim, REAL *gpu_data_out,
+                                         REAL *gpu_grad, REAL *gpu_target, REAL * gpu_res)
+{
+  // the kernel works in place on gpu_grad, which therefore starts as a copy of the outputs
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  // one block for the reduction plus, device limits permitting, one block per example
+  const int grid_size = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1);
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  const int shared_bytes = block_size * sizeof(REAL);
+  KernelErrFctSoftmCrossEntNgramCalcGradNull<<<grid_size, block_size, shared_bytes, Gpu::curStream>>>(
+      bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res);
+
+  const cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGradNull: %s", cudaGetErrorString(status));
+  }
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgram::CalcGradCumul
+//-----------------------------------------------
+/**
+ * @note Gradient of the softmax cross-entropy where the error value is accumulated:
+ * it is added to *gpu_res with atomicAdd() instead of being stored. gpu_grad is
+ * modified in place.
+ * The work is split over the blocks of the grid:
+ *  - block 0 sets grad = 1 - grad at the target position of every example and adds the
+ *    reduced error to *gpu_res (needs blockDim.x REALs of dynamic shared memory);
+ *  - every other block negates the non-target entries of the rows
+ *    blockIdx.x-1, blockIdx.x-1 + (gridDim.x-1), ...
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramCalcGradCumul(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+                                            REAL *gpu_res)
+{
+  if (blockIdx.x != 0) {
+    // worker blocks: flip the sign of everything but the target entry
+    for (int row=blockIdx.x-1; row<bsize; row+=gridDim.x-1) {
+      const unsigned int target = gpu_target[row];
+      for (int col=threadIdx.x; col<odim; col+=blockDim.x) {
+        if (target == (uint)col)
+          continue;	// the target entry belongs to block 0
+        gpu_grad[row*odim + col] *= -1.0f;
+      }
+    }
+    return;
+  }
+
+  // block 0: target entries and error value
+  extern __shared__ REAL buf[];
+  REAL partial=0.0;
+  for (int row=threadIdx.x ; row<bsize ; row+=blockDim.x) {
+    // flat position of the target entry of this row
+    const unsigned int pos=(row*odim + (uint) gpu_target[row]);
+    gpu_grad[pos] = (1.0f - gpu_grad[pos]);
+    partial += gpu_safelog(gpu_data_out[pos]);
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all other threads to its own
+  if (threadIdx.x == 0) {
+    for (int t=1 ; t<blockDim.x ; t++)
+      partial += buf[t];
+    atomicAdd(gpu_res, partial);
+  }
+}
+
+
+void Gpu::ErrFctSoftmCrossEntNgramCalcGradCumul(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target)
+{ // unfinished host wrapper: queues the kernel, then reports through Error() that it is not finished
+  if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL)); // device scalar, allocated on first use
+
+  cudaMemsetAsync(gpu_result, 0.0, sizeof(REAL), Gpu::curStream); //Each thread will atomicAdd into it.
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream); // the kernel updates gpu_grad in place
+  int nb_blocks = std::min(Gpu::curDevProps->maxGridSize[0], bsize + 1); // block 0 reduces, the others stride over the examples
+  int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  KernelErrFctSoftmCrossEntNgramCalcGradCumul<<<nb_blocks, n_threads, n_threads*sizeof(REAL), Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_result); // one REAL of shared memory per thread
+  Error("Gpu::ErrFctSoftmCrossEntNgramCalcGradCumul not finished!"); // NOTE(review): raised after the launch was queued; the result is never read back
+
+  //REAL res;
+  //cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  //cudaStreamSynchronize(Gpu::curStream);
+  //return res;
+}
+
+//-----------------------------------------------
+// ErrFctSoftmCrossEntNgramMulti::CalcGrad
+//-----------------------------------------------
+/**
+ * @note Gradient of the softmax cross-entropy for nb n-gram outputs per example; row
+ * (b*nb+n) of width dim belongs to output n of example b. gpu_grad is modified in place.
+ * The grid is two-dimensional, blockIdx.x strides over the examples:
+ *  - blocks with blockIdx.y == 0 set grad = 1 - grad at the target position of every
+ *    non-NULL target and atomically add their share of the error to *gpu_res;
+ *    only the first nb threads take part, each using one REAL of dynamic shared memory;
+ *  - blocks with blockIdx.y > 0 zero the rows of NULL_WORD targets and negate the
+ *    non-target entries of all other rows.
+ */
+__global__
+void KernelErrFctSoftmCrossEntNgramMultiCalcGrad(const int bsize, const int dim, const int nb,
+                                                 REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target,
+                                                 REAL *gpu_res)
+{
+  if (blockIdx.y == 0) {
+    // the first part of blocks computes the error and grad for non NULL words
+    extern __shared__ REAL buf[];
+    const bool active = (threadIdx.x < nb);
+    REAL err=0.0;
+    if (active) {
+      for (int b=blockIdx.x; b<bsize; b+=gridDim.x)
+        for (int n=threadIdx.x; n<nb; n+=blockDim.x) {
+          int tidx=(int) gpu_target[b*nb + n];
+          if (tidx != NULL_WORD) {
+            gpu_grad[(b*nb+n)*dim + tidx] = (1.0 - gpu_grad[(b*nb+n)*dim + tidx]);
+            err += gpu_safelog(gpu_data_out[(b*nb+n)*dim + tidx]);
+            debug6("grad ngram-multi:  b=%d, n=%d, tidx=%u, out=%f -> err=%e, grad@target=%e\n", b, n, tidx, gpu_data_out[(b*nb+n)*dim + tidx], err, gpu_grad[(b*nb+n)*dim + tidx]);
+          }
+          else {
+            // NOTE(review): this debug output indexes gpu_data_out with tidx == NULL_WORD
+            debug4("grad ngram-multi:  b=%d, n=%d, tidx=NULL, out=%f -> err=%e\n", b, n, gpu_data_out[(b*nb+n)*dim + tidx], err);
+          }
+        }
+      buf[threadIdx.x] = err;
+    }
+    // The barrier must be reached by ALL threads of the block: blockDim.x is usually
+    // larger than nb, so it may not sit inside the (threadIdx.x < nb) branch.
+    __syncthreads();
+    if (active && threadIdx.x == 0) {
+      for (int i=1; (i<nb) && (i<blockDim.x); i++)
+        err += buf[i];
+      atomicAdd(gpu_res, err);
+    }
+  }
+  else if (threadIdx.x < dim)
+    // the next blocks computes the grad for all other words
+    for (int b=blockIdx.x; b<bsize; b+=gridDim.x)
+      for (int n=(blockIdx.y-1); n<nb; n+=(gridDim.y-1)) {
+        int tidx=(int) gpu_target[b*nb + n];
+        for (int i=threadIdx.x; i<dim; i+=blockDim.x) {
+          if (tidx == NULL_WORD)
+            gpu_grad[(b*nb+n)*dim + i] = 0;
+          else if (tidx != i)
+            gpu_grad[(b*nb+n)*dim + i] *= -1.0;
+        }
+      }
+}
+
+/**
+ * Host wrapper around KernelErrFctSoftmCrossEntNgramMultiCalcGrad.
+ * Copies the outputs into gpu_grad, lets the kernel turn that copy into the gradient and
+ * returns the error value accumulated by the kernel. Blocks until the result is on the host.
+ *
+ * @param bsize        number of examples in the batch
+ * @param dim          number of outputs per n-gram position
+ * @param nb           number of n-gram positions per example
+ * @param gpu_data_out device pointer, bsize*nb*dim outputs
+ * @param gpu_grad     device pointer, receives bsize*nb*dim gradient values
+ * @param gpu_target   device pointer, bsize*nb target indices
+ * @return error value summed over all non-NULL targets
+ */
+REAL Gpu::ErrFctSoftmCrossEntNgramMultiCalcGrad(const int bsize, const int dim, const int nb,
+                                              REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target)
+{
+  if (gpu_result==NULL) cudaMalloc(&gpu_result, sizeof(REAL));
+
+  // the kernel works in place on gpu_grad, which therefore starts as a copy of the outputs
+  int n=bsize*nb*dim;
+  cudaMemcpyAsync(gpu_grad, gpu_data_out, n*sizeof(REAL),
+             cudaMemcpyDeviceToDevice, Gpu::curStream);
+
+  cudaMemsetAsync(gpu_result, 0.0, sizeof(REAL), Gpu::curStream);//Each block will atomicAdd into it.
+
+  cudaError_t sts = cudaGetLastError();
+  if (cudaSuccess != sts) {
+    ErrorN("Error before KernelErrFctSoftmCrossEntNgramMultiCalcGrad: %s", cudaGetErrorString(sts));
+  }
+  int nb_threads = std::min(std::max(nb, dim), Gpu::curDevProps->maxThreadsDim[0]);
+  // only the first min(nb, nb_threads) threads of a block store a partial sum
+  int n_shared_bytes = std::min(nb, nb_threads) * sizeof(REAL);
+  dim3 nb_blocks(std::min( bsize, Gpu::curDevProps->maxGridSize[0]),
+                 std::min(nb + 1, Gpu::curDevProps->maxGridSize[1]));
+  KernelErrFctSoftmCrossEntNgramMultiCalcGrad<<<nb_blocks, nb_threads, n_shared_bytes, Gpu::curStream>>>(
+    bsize, dim, nb, gpu_data_out, gpu_grad, gpu_target, gpu_result);
+  sts = cudaGetLastError();
+  if (cudaSuccess != sts) {
+    // the CUDA message is passed as an argument, never as the format string itself
+    ErrorN("KernelErrFctSoftmCrossEntNgramMultiCalcGrad cuda error: %s", cudaGetErrorString(sts));
+  }
+  REAL res;
+  cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+
+  return res;
+}
+
+
+//-----------------------------------------------
+// MachSoftmaxClass
+//-----------------------------------------------
+// Forw
+/**
+ * Performs the equivalent of several Gemv calls, with a different size and offset
+ * for each example of a minibatch:
+ *   output[i, offset_i + j] = bias[offset_i + j] + sum_k input[i, k] * weights[k, offset_i + j]
+ * for j in [0, size_i), where (offset_i, size_i) = (class_info[2*i], class_info[2*i+1]).
+ *
+ * Needs idim REALs of dynamic shared memory (copy of the current input vector).
+ */
+__global__ void KernelLinForwOffset(const int bsize, const int idim, const int odim,
+                                    REAL* input, REAL* weights, REAL* bias, REAL* output,
+                                    int* class_info)
+{
+  // Each block corresponds to one (or more) sub-vector of the output. Each thread
+  // corresponds to one of its elements.
+  // Axis x of the grid corresponds to the output rows: if sizes takes large values,
+  // j will need to go beyond gridDim.x * blockDim.x
+  // Axis y of the grid corresponds to the batch size.
+
+  extern __shared__ REAL buf[];
+
+  for (int i = blockIdx.y; i < bsize; i += gridDim.y) {
+    int offset = class_info[2*i];
+    int size = class_info[2*i+1];
+    REAL* in_vec = input + i*idim;
+
+    // Copy in_vec into shared memory, so all threads in this block can access it faster
+    for (int k = threadIdx.x; k < idim; k += blockDim.x) {
+      buf[k] = in_vec[k];
+    }
+    __syncthreads();
+
+    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < size; j += blockDim.x * gridDim.x) {
+      // Compute one (vector-vector) dot product
+      REAL dot = bias[offset + j];
+      REAL* w_vec = weights + offset + j;
+      for (int k = 0; k < idim; k++) {
+        dot += buf[k] * w_vec[k*odim];
+      }
+      output[i*odim + offset + j] = dot;
+    }
+
+    // When the block handles another example (bsize > gridDim.y), buf is overwritten at the
+    // top of the loop: wait until every thread has finished reading the current vector.
+    // The loop over i is uniform within a block, so all threads reach this barrier.
+    __syncthreads();
+  }
+}
+
+/**
+ * Host wrapper around KernelLinForwOffset: class-dependent linear forward pass.
+ * The launch is queued on Gpu::curStream; the function only checks for a launch error.
+ *
+ * @param bsize      number of examples in the batch
+ * @param idim       input dimension
+ * @param odim       output dimension (row stride of weights and output)
+ * @param input      device pointer, bsize x idim
+ * @param weights    device pointer, indexed as weights[k*odim + column] by the kernel
+ * @param bias       device pointer, one value per output
+ * @param output     device pointer, bsize x odim
+ * @param class_info device pointer, (offset, size) pair for each example
+ * @param max_size   value used to size the thread block and grid along x
+ */
+void Gpu::MachSoftmaxClassLinForw(const int bsize, const int idim, const int odim,
+                                REAL* input, REAL* weights, REAL* bias, REAL* output,
+                                int* class_info, const int max_size)
+{
+  debug4("bsize: %d, idim: %d, odim: %d, max_size: %d\n", bsize, idim, odim, max_size);
+
+  // x axis: elements of the output sub-vector, y axis: examples of the batch
+  const int block_x = std::min(Gpu::curDevProps->maxThreadsDim[0], max_size);
+  const int wanted_x = max_size/block_x + (max_size%block_x==0?0:1);	// ceil(max_size / block_x)
+  const int grid_x = std::min(Gpu::curDevProps->maxGridSize[0], wanted_x);
+  const int grid_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  const int shared_bytes = idim*sizeof(REAL);	// the kernel caches one input vector
+  const dim3 grid(grid_x, grid_y);
+
+  debug3("n_threads: %d, n_blocks: (%d, %d)\n", block_x, grid_x, grid_y);
+  KernelLinForwOffset<<<grid, block_x, shared_bytes, Gpu::curStream>>>(
+      bsize, idim, odim, input, weights, bias, output, class_info);
+
+  const cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    printf("KernelLinForwOffset: n_blocks=(%d, %d), n_threads=%d, shared=%d bytes\n",
+           grid_x, grid_y, block_x, shared_bytes);
+    Error(cudaGetErrorString(status));
+  }
+}
+
+/**
+ * Softmax over a different slice of each row: for row r, the elements
+ * offsets[r*offsets_s] ... + sizes[r*sizes_s] - 1 of x are normalised and written to sm
+ * (x and sm may be the same buffer). No maximum is subtracted before exp().
+ *
+ * Each block handles the rows blockIdx.x, blockIdx.x + gridDim.x, ...
+ * Needs blockDim.x REALs of dynamic shared memory.
+ *
+ * Every reduction step is separated by a block barrier; the code does not rely on the
+ * threads of a warp executing in lock-step.
+ */
+__global__ void KernelBatchedSoftmaxOffset(int M,
+    const REAL * x, const int sx0, const int sx1,
+    REAL * sm, const int sm_s0, const int sm_s1,
+    int * offsets, const int offsets_s,
+    int * sizes, const int sizes_s)
+{
+  extern __shared__ REAL buf[];
+  // number of partial results left after folding everything into the first warp
+  const int n_first = ((int) blockDim.x < warpSize) ? (int) blockDim.x : warpSize;
+
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    REAL sum = 0;
+    int offset = offsets[blockIDX * offsets_s];
+    int size = sizes[blockIDX * sizes_s];
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      sum += exp(x[blockIDX * sx0 + (offset + i) * sx1]);
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // Step 1: fold the partial sums above the first warp into buf[0..warpSize-1]
+    // (threads at or beyond 'size' hold a zero, so they can be skipped)
+    if (threadIdx.x < warpSize){
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){
+        buf[threadIdx.x] += buf[i];
+      }
+    }
+    __syncthreads();
+
+    // Step 2: pairwise reduction into buf[0]; the loop bounds only depend on blockDim.x,
+    // so all threads reach every barrier
+    for (int n = n_first; n > 1; ) {
+      const int half = (n + 1) / 2;
+      if ((int) threadIdx.x + half < n)
+        buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x + half];
+      __syncthreads();
+      n = half;
+    }
+
+    REAL row_sum = buf[0];
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x){
+      sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1]) / row_sum;
+    }
+    // buf is reused for the next row
+    __syncthreads();
+  }
+}
+
+/**
+ * Numerically stabilised softmax over a different slice of each row: for row r, the
+ * elements offsets[r*offsets_s] ... + sizes[r*sizes_s] - 1 of x are normalised and written
+ * to sm (x and sm may be the same buffer). The maximum of the slice is subtracted before
+ * exp().
+ *
+ * Each block handles the rows blockIdx.x, blockIdx.x + gridDim.x, ...
+ * Needs blockDim.x REALs of dynamic shared memory.
+ *
+ * Every reduction step is separated by a block barrier; the code does not rely on the
+ * threads of a warp executing in lock-step.
+ */
+__global__ void KernelBatchedSoftmaxStableOffset(int M,
+    const REAL * x, const int sx0, const int sx1,
+    REAL * sm, const int sm_s0, const int sm_s1,
+    int * offsets, const int offsets_s,
+    int * sizes, const int sizes_s)
+{
+  extern __shared__ REAL buf[];
+  // number of partial results left after folding everything into the first warp
+  const int n_first = ((int) blockDim.x < warpSize) ? (int) blockDim.x : warpSize;
+
+  for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) {
+    int offset = offsets[blockIDX * offsets_s];
+    int size = sizes[blockIDX * sizes_s];
+
+    // Threads at or beyond 'size' have no element of their own: they start from the first
+    // element of the slice instead of reading outside of it (neutral for a maximum).
+    const int first = ((int) threadIdx.x < size) ? (int) threadIdx.x : 0;
+    REAL max_ = x[blockIDX * sx0 + (offset + first) * sx1];
+    for (int i = threadIdx.x + blockDim.x; i < size; i += blockDim.x) {
+      max_ = max(max_, x[blockIDX * sx0 + (offset + i) * sx1]);
+    };
+    buf[threadIdx.x] = max_;
+    __syncthreads();
+
+    // Find the max to stabilize the softmax.
+    // Step 1: fold the partial maxima above the first warp into buf[0..warpSize-1]
+    if (threadIdx.x < warpSize)
+    {
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize) {
+        buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]);
+      }
+    }
+    __syncthreads();
+    // Step 2: pairwise reduction into buf[0]; the loop bounds only depend on blockDim.x,
+    // so all threads reach every barrier
+    for (int n = n_first; n > 1; ) {
+      const int half = (n + 1) / 2;
+      if ((int) threadIdx.x + half < n)
+        buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x + half]);
+      __syncthreads();
+      n = half;
+    }
+    REAL row_max = buf[0];
+    // everybody must have read the maximum before buf is reused for the sum
+    __syncthreads();
+
+    REAL sum = 0;
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x) {
+      sum += exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max);
+    }
+    buf[threadIdx.x] = sum;
+    __syncthreads();
+
+    // Same two-step reduction for the sum (threads at or beyond 'size' hold a zero)
+    if (threadIdx.x < warpSize){
+#pragma unroll 8
+      for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){
+        buf[threadIdx.x] += buf[i];
+      }
+    }
+    __syncthreads();
+    for (int n = n_first; n > 1; ) {
+      const int half = (n + 1) / 2;
+      if ((int) threadIdx.x + half < n)
+        buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x + half];
+      __syncthreads();
+      n = half;
+    }
+
+    REAL row_sum = buf[0];
+#pragma unroll 16
+    for (int i = threadIdx.x; i < size; i += blockDim.x){
+      sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max) / row_sum;
+    }
+    // buf is reused for the next row
+    __syncthreads();
+  }
+}
+
+/**
+ * In-place softmax over the class-dependent slice of each row of gpu_data_out.
+ * The slice of row i is described by (class_info[2*i], class_info[2*i+1]) = (offset, size).
+ * The launch is queued on Gpu::curStream; nothing is done for an empty batch.
+ * A launch error is reported for both kernel variants.
+ *
+ * @param bsize        number of rows
+ * @param odim         row stride of gpu_data_out
+ * @param gpu_data_out device pointer, input and output of the softmax
+ * @param class_info   device pointer, (offset, size) pair for each row
+ * @param max_size     value used to size the thread block (capped at 512)
+ * @param stable       non-zero selects the variant that subtracts the row maximum first
+ */
+void Gpu::MachSoftmaxClassSoftmForw(const int bsize, const int odim, REAL* gpu_data_out,
+                                  int* class_info, const int max_size, const int stable)
+{
+  if (bsize <= 0)
+    return;
+
+  int n_blocks = std::min(bsize, 32 * 1024);
+  int n_threads = std::min(max_size, 512);
+  int n_shared_bytes = n_threads * sizeof(REAL);
+  const char *kernel_name;
+  if (stable) {
+    kernel_name = "KernelBatchedSoftmaxStableOffset";
+    KernelBatchedSoftmaxStableOffset<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(bsize,
+        gpu_data_out, odim, 1,
+        gpu_data_out, odim, 1,
+        class_info, 2,
+        class_info + 1, 2);
+  }
+  else {
+    kernel_name = "KernelBatchedSoftmaxOffset";
+    KernelBatchedSoftmaxOffset<<<n_blocks, n_threads, n_shared_bytes, Gpu::curStream>>>(bsize,
+        gpu_data_out, odim, 1,
+        gpu_data_out, odim, 1,
+        class_info, 2,
+        class_info + 1, 2);
+  }
+
+  // same check for both variants
+  cudaError_t err = cudaGetLastError();
+  if(cudaSuccess != err){
+    printf("%s: n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n",
+           kernel_name, n_blocks, n_threads, n_shared_bytes, odim);
+    Error(cudaGetErrorString(err));
+  }
+}
+
+/**
+ * Cross-entropy gradient restricted to a different slice of each row.
+ * For row r with (offset, size) = (offsets[r*so], sizes[r*ss]):
+ *   grad[r, offset + j] = -x[r, offset + j]  for j in [0, size), plus 1 at the target entry;
+ * the safe-log of x at the target entries is summed over all rows and stored in *res.
+ * The target stored in 'target' is an absolute index; the slice offset is subtracted.
+ *
+ * Thread t handles the rows t, t + blockDim.x, ...; blockIdx is not used.
+ * Needs blockDim.x REALs of dynamic shared memory.
+ */
+__global__ void KernelBatchedSoftmCrossEntGradOffset(int M,
+    const REAL* x, const int sx0, const int sx1,
+    REAL* grad, const int sg0, const int sg1,
+    REAL* target, const int st,
+    int* offsets, const int so,
+    int* sizes, const int ss,
+    REAL* res)
+{
+  extern __shared__ REAL buf[];
+
+  REAL partial = 0.0f;
+  int row = threadIdx.x;
+  while (row < M) {
+    const int first = offsets[row * so];
+    const int count = sizes[row * ss];
+
+    // negated outputs for the whole slice ...
+    for (int j = 0; j < count; j++) {
+      grad[row * sg0 + (first + j) * sg1] = - x[row * sx0 + (first + j) * sx1];
+    }
+    // ... and +1 at the target, given relative to the start of the slice
+    const unsigned int rel_target = (uint) target[row * st] - first;
+    grad[row * sg0 + (first + rel_target) * sg1] += 1.0f;
+    partial += gpu_safelog(x[row * sx0 + (first + rel_target) * sx1]);
+
+    row += blockDim.x;
+  }
+  buf[threadIdx.x] = partial;
+  __syncthreads();
+
+  // thread 0 adds the partial sums of all other threads to its own
+  if (threadIdx.x == 0) {
+    for (int t = 1; t < blockDim.x; t++) {
+      partial += buf[t];
+    }
+    *res = partial;
+  }
+}
+
+/**
+ * Host wrapper around KernelBatchedSoftmCrossEntGradOffset: gradient and error value of the
+ * class-dependent softmax. A single block is launched on Gpu::curStream; the function
+ * does not wait for completion, it only checks for a launch error.
+ *
+ * @param bsize        number of examples in the batch
+ * @param odim         row stride of gpu_data_out and gpu_grad
+ * @param gpu_data_out device pointer, outputs
+ * @param gpu_grad     device pointer, receives the gradient
+ * @param gpu_target   device pointer, one target index per example
+ * @param class_info   device pointer, (offset, size) pair for each example
+ * @param gpu_res      pointer written by the kernel (scalar error value)
+ */
+void Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad(const int bsize, const int odim,
+    REAL* gpu_data_out, REAL* gpu_grad, REAL* gpu_target, int* class_info, REAL* gpu_res)
+{
+  const int block_size = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
+  const int shared_bytes = block_size * sizeof(REAL);	// one partial sum per thread
+
+  // offsets are the even, sizes the odd entries of class_info (stride 2)
+  int* const class_offsets = class_info;
+  int* const class_sizes = class_info + 1;
+  KernelBatchedSoftmCrossEntGradOffset<<<1, block_size, shared_bytes, Gpu::curStream>>>(bsize,
+      gpu_data_out, odim, 1,
+      gpu_grad, odim, 1,
+      gpu_target, 1,
+      class_offsets, 2,
+      class_sizes, 2,
+      gpu_res);
+
+  const cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    ErrorN("Error in Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad: %s", cudaGetErrorString(status));
+  }
+}
+
+/**
+ * Accumulates into grad_in the gradient w.r.t. the input of the class-dependent linear
+ * layer:  grad_in[i, j] += sum_k grad_out[i, offset_i + k] * weights[j, offset_i + k],
+ * k in [0, size_i), with (offset_i, size_i) = (offsets[i*so], sizes[i*ss]).
+ *
+ * Needs blockDim.x * blockDim.y REALs of dynamic shared memory.
+ */
+__global__ void KernelLinGradInOffset(const int bsize, const int idim,
+                                      REAL* grad_out, const int sgo0, const int sgo1,
+                                      REAL* weights, const int sw0, const int sw1,
+                                      REAL* grad_in, const int sgi0, const int sgi1,
+                                      int* offsets, const int so,
+                                      int* sizes, const int ss)
+{
+  /*
+     Computes the a dot product (equivalent of gemv) on each row of grad_in,
+     using a different part of grad_out and weights each time (determined
+     from offsets and sizes).
+     Each row of grad_in (index i) corresponds to one blockIdx.y.
+     Columns of grad_in (lines of weights, index j) are split in groups
+     indexed by blockIdx.x. Each group has blockDim.y indices, each index
+     corresponds to a value of threadIdx.y.
+     For each (i, j), a scalar (vector-vector) dot product is computed, over
+     two vectors of length sizes[i], this sum is indexed by k. blockDim.x partial
+     sums are computed in parallel and stored in buf[threadIdx.y][threadIdx.x],
+     then a reduction steps computes the final dot product.
+     We use threadIdx.x as the fast-moving index to maximize coalesced memory
+     reads and writes.
+
+     The loop over the column groups runs on j_base, which is the same for all
+     threads of a block, so that every thread reaches every __syncthreads();
+     threads whose column j is outside of the matrix simply contribute zeros.
+  */
+  extern __shared__ REAL buf[];
+  REAL* buf_y = buf + blockDim.x * threadIdx.y;
+
+  for (int i = blockIdx.y; i < bsize; i += gridDim.y) {
+    int offset = offsets[i * so];
+    int size = sizes[i * ss];
+
+    REAL* ograd_vec = grad_out + i * sgo0;
+    for (int j_base = blockDim.y * blockIdx.x; j_base < idim; j_base += gridDim.x * blockDim.y) {
+      const int j = j_base + threadIdx.y;
+      const bool active = (j < idim);
+
+      // Perform partially-summed dot product, stored in buf[]
+      REAL dot = 0;
+      if (active) {
+        REAL* w_vec = weights + j * sw0 + offset * sw1;
+        for (int k = threadIdx.x; k < size; k += blockDim.x) {
+          dot += ograd_vec[(offset + k) * sgo1] * w_vec[k * sw1];
+        }
+      }
+      buf_y[threadIdx.x] = dot;
+      __syncthreads();
+
+      // Pairwise reduction of the blockDim.x partial sums into buf_y[0]; works for any
+      // blockDim.x and does not rely on lock-step execution within a warp
+      for (int n = blockDim.x; n > 1; ) {
+        const int half = (n + 1) / 2;
+        if ((int) threadIdx.x + half < n)
+          buf_y[threadIdx.x] += buf_y[threadIdx.x + half];
+        __syncthreads();
+        n = half;
+      }
+
+      // accumulate the final result in grad_in; buf_y[0] is only written by this thread
+      if (active && threadIdx.x == 0)
+        grad_in[i * sgi0 + j * sgi1] += buf_y[0];
+    }
+  }
+}
+
+/**
+ * Host wrapper around KernelLinGradInOffset: accumulates the gradient w.r.t. the input of
+ * the class-dependent linear layer into grad_in. The launch is queued on Gpu::curStream;
+ * the function only checks for a launch error.
+ *
+ * @param bsize      number of examples in the batch
+ * @param idim       input dimension (row stride of grad_in)
+ * @param odim       output dimension (row stride of grad_out and weights)
+ * @param grad_out   device pointer, gradient w.r.t. the output
+ * @param weights    device pointer, weight matrix
+ * @param grad_in    device pointer, gradient w.r.t. the input (accumulated)
+ * @param class_info device pointer, (offset, size) pair for each example
+ * @param max_size   not used by this function
+ */
+void Gpu::MachSoftmaxClassLinGradIn(const int bsize, const int idim, const int odim,
+                                  REAL* grad_out, REAL* weights, REAL* grad_in,
+                                  int* class_info, const int max_size)
+{
+  // block: one warp along x, as many rows along y as the device allows
+  const int block_x = Gpu::curDevProps->warpSize;
+  const int block_y = std::min(Gpu::curDevProps->maxThreadsPerBlock / block_x, Gpu::curDevProps->maxThreadsDim[1]);
+  const dim3 block(block_x, block_y);
+
+  // grid: x covers the idim columns in groups of block_y, y covers the examples
+  const int wanted_x = idim / block_y + (idim%block_y==0?0:1);	// ceil(idim / block_y)
+  const int grid_x = std::min(Gpu::curDevProps->maxGridSize[0], wanted_x);
+  const int grid_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  const dim3 grid(grid_x, grid_y);
+
+  // one partial sum per thread
+  const int shared_bytes = block_x * block_y * sizeof(REAL);
+
+  KernelLinGradInOffset<<<grid, block, shared_bytes, Gpu::curStream>>>(
+      bsize, idim,
+      grad_out, odim, 1,
+      weights, odim, 1,
+      grad_in, idim, 1,
+      class_info, 2,
+      class_info + 1, 2);
+
+  const cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) {
+    ErrorN("Error in Gpu::MachSoftmaxClassLinGrad: %s", cudaGetErrorString(status));
+  }
+}
+
+/**
+ * Weight and bias update of the class-dependent linear layer. For every example k with
+ * (offset, size) = (offsets[k*so], sizes[k*ss]) and every j in [0, size):
+ *   weights[i, offset + j] += lrate * (input[k, i] * grad_out[k, offset + j]
+ *                                      + wdecay * weights[i, offset + j])   for all i < idim
+ *   bias[offset + j]       += lrate * grad_out[k, offset + j]
+ * All updates use atomicAdd() since several examples may touch the same weights.
+ */
+__global__ void KernelLinGradUpdate(const int bsize, const int idim,
+                                    REAL* input, const int si0, const int si1,
+                                    REAL* grad_out, const int sg0, const int sg1,
+                                    REAL* weights, const int sw0, const int sw1,
+                                    REAL* bias, const int sb,
+                                    int* offsets, const int so,
+                                    int* sizes, const int ss,
+                                    const REAL lrate, const REAL wdecay)
+{
+  /*
+     Computes a series of rank-1 updates (equivalent of ger) on sub-matrices
+     of weights. Also performs updates on bias directly proportional to
+     the relevant sub-vectors of grad_out.
+     Each row of grad_out and of input (index k) corresponds to one blockIdx.y.
+     Rows of weights (columns of inputs, index i) split in groups indexed by
+     blockIdx.x. Each group has blockDim.y indices, each index corresponds to a
+     value of threadIdx.y.
+     Columns of weights and grad_out (index j) are iterated over with blockDim.x
+     parallel threads, indexed by threadIdx.x.
+
+     Using blockDim.x == 1 warp seems to maximize speed.
+
+     NOTE: Applying weight decay on the whole weight matrix would be too slow
+     (in the order of +50% execution time), so we apply it in this kernel,
+     only on the weights that were used for this minibatch.
+     Since there is no atomic multiplication primitive, the value of weights we
+     read before the update may have already been updated (by another example in
+     the same minibatch), or not. It should not make a large difference.
+  */
+
+
+  for (int k = blockIdx.y; k < bsize; k += gridDim.y) {
+    int offset = offsets[k * so];
+    int size = sizes[k * ss];
+    REAL* in_vec = input + k * si0;
+    REAL* grad_vec = grad_out + k * sg0 + offset * sg1;
+
+    for (int i = blockIdx.x * blockDim.y + threadIdx.y; i < idim; i += gridDim.x * blockDim.y) {
+      REAL* w_vec = weights + i * sw0 + offset * sw1;
+      for (int j = threadIdx.x; j < size; j += blockDim.x)
+      {
+         // the weight is read through the same column stride (sw1) as it is written below
+         REAL update = lrate * (in_vec[i * si1] * grad_vec[j * sg1]
+         // TODO: if wdecay > 0, this "+" sign should probably be a "-",
+         // but this is the convention used in MachLin.cpp.
+                                + wdecay * w_vec[j * sw1]);
+         atomicAdd(w_vec + j * sw1, update);
+      }
+
+      // Block with i == 0 also updates the bias
+      if (i == 0)
+      {
+        for (int j = threadIdx.x; j < size; j += blockDim.x)
+          atomicAdd(bias + (offset + j) * sb, lrate * grad_vec[j * sg1]);
+      }
+    }
+  }
+}
+
+// Launches KernelLinGradUpdate on Gpu::curStream; class_info holds interleaved (offset, size) int pairs.
+void Gpu::MachSoftmaxClassLinGradUpdate(const int bsize, const int idim, const int odim,
+                                      REAL* input, REAL* grad_out,
+                                      REAL* weights, REAL* bias,
+                                      int* class_info, const int max_size,
+                                      const REAL lrate, const REAL wdecay)
+{
+  if (bsize <= 0 || idim <= 0) return; // nothing to update; a zero-sized grid is an invalid launch
+  int n_threads_x = Gpu::curDevProps->warpSize; // one warp
+  int n_threads_y = std::min(Gpu::curDevProps->maxThreadsPerBlock / n_threads_x, Gpu::curDevProps->maxThreadsDim[1]); // Maximum possible
+  int n_blocks_x = std::min(Gpu::curDevProps->maxGridSize[0], idim / n_threads_y + (idim%n_threads_y==0?0:1));
+  int n_blocks_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize);
+  dim3 n_threads(n_threads_x, n_threads_y);
+  dim3 n_blocks(n_blocks_x, n_blocks_y);
+  // All matrices are passed as contiguous row-major (column stride 1); max_size is not used here.
+  KernelLinGradUpdate<<<n_blocks, n_threads, 0, Gpu::curStream>>>(
+      bsize, idim, input, idim, 1, grad_out, odim, 1, weights, odim, 1, bias, 1,
+      class_info, 2, class_info + 1, 2, lrate, wdecay);
+
+  // Report launch failures, as Gpu::MachSoftmaxClassLinGrad does.
+  cudaError_t err = cudaGetLastError();
+  if(cudaSuccess != err){
+    ErrorN("Error in Gpu::MachSoftmaxClassLinGradUpdate: %s", cudaGetErrorString(err));
+  }
+}
+
+//-----------------------------------------------
+// Copy
+//-----------------------------------------------
+// Copies vec (N elements) into each of the M rows of the contiguous row-major matrix mat.
+__global__ void KernelCopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N)
+{
+  for(int row = blockIdx.x; row < M; row += gridDim.x)        // rows are spread over the blocks
+    for(int col = threadIdx.x; col < N; col += blockDim.x)    // columns over the threads of a block
+      (mat + row * N)[col] = vec[col];
+}
+
+/*
+ * Copies vec (N elements) onto each of the M rows of mat; does nothing when M or N is not positive.
+ */
+void Gpu::CopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N)
+{
+  debug4("Gpu::CopyVectorToMatrix(%p, %p %d %d)\n", mat, vec, M, N);
+  if (M <= 0 || N <= 0) return; // zero blocks or threads would be an invalid kernel launch
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyVectorToMatrix<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(mat, vec, M, N);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess) {
+    ErrorN("CUDA: ERROR %d in Gpu::CopyVectorToMatrix(%p, %p %d %d): %s\n", cuda_stat, mat, vec, M, N, cudaGetErrorString(cuda_stat));
+  }
+}
+
+// Copies the contiguous M x N matrix src into dst, whose rows are row_stride elements apart.
+__global__ void KernelCopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  for(int row = blockIdx.x; row < M; row += gridDim.x)
+    for(int col = threadIdx.x; col < N; col += blockDim.x)
+      (dst + row * row_stride)[col] = (src + row * N)[col];
+}
+
+// Copies the M x N matrix src, whose rows are row_stride_src elements apart, into contiguous dst.
+__global__ void KernelCopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N,
+                                                const int row_stride_src)
+{
+  for(int row = blockIdx.x; row < M; row += gridDim.x)
+    for(int col = threadIdx.x; col < N; col += blockDim.x)
+      (dst + row * N)[col] = (src + row * row_stride_src)[col];
+}
+
+/*
+ * Copies each row of the contiguous M x N matrix src to dst (rows row_stride apart); no-op if M or N <= 0.
+ */
+void Gpu::CopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  if (M <= 0 || N <= 0) return; // zero blocks or threads would be an invalid kernel launch
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixToMatrixStrided<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess){
+    ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixToMatrixStrided: %s\n", cuda_stat, cudaGetErrorString(cuda_stat));
+  }
+}
+
+/*
+ * Copies each row of the strided matrix src (rows row_stride apart) to contiguous dst; no-op if M or N <= 0.
+ */
+void Gpu::CopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  if (M <= 0 || N <= 0) return; // zero blocks or threads would be an invalid kernel launch
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixStridedToMatrix<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess){
+    ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixStridedToMatrix: %s\n", cuda_stat, cudaGetErrorString(cuda_stat));
+  }
+}
+
+//-----------------------------------------------
+// Multiple AXPY input row on one output row
+//-----------------------------------------------
+
+// Each block computes a fixed number of columns for the whole batch.
+// This allows coalesced reads and avoids the need for atomic operations.
+__global__ void KernelBatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                                  REAL * y, const int incy, const int nb_batch){
+  const int step = blockDim.x*gridDim.x;
+  for(int col = blockIdx.x * blockDim.x + threadIdx.x; col < n; col += step){
+    REAL * out = y + col * incy;  // the only output element this thread touches for col
+    for(int row=0; row<nb_batch; row++){
+      *out += a * x[row * n * incx + col * incx];
+    }
+  }
+}
+
+// For each i < n and each b < nb_batch: y[i*incy] += a * x[(b*n + i)*incx]; runs on Gpu::curStream.
+void Gpu::BatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                    REAL * y, const int incy, const int nb_batch){
+  if (n <= 0) return; // nothing to do; nb_threads would be 0 and is used as a divisor below
+  int nb_threads = std::min(128, n);
+  int nb_blocks = std::min(Gpu::curDevProps->maxGridSize[0], n/nb_threads+(n%nb_threads==0?0:1));
+  KernelBatchedAXPY<<<nb_blocks,nb_threads, 0, Gpu::curStream>>>(n, a, x, incx, y, incy, nb_batch);
+}
+
+
+//-----------------------------------------------
+// Element-wise exponential
+//-----------------------------------------------
+__global__ void KernelElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  const int stride = blockDim.x * gridDim.x;  // number of threads in the whole grid
+  int pos = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; pos < size; pos += stride) gpu_data_out[pos] = exp(gpu_data_in[pos]);
+}
+
+// Performs gpu_data_out[i] = exp(gpu_data_in[i]) for 0 <= i < size, on Gpu::curStream.
+// A size <= 0 is a no-op (it would otherwise make nb_threads 0 and divide by it).
+void Gpu::ElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return;
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseExp<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+//-----------------------------------------------
+// Tanh and its gradient
+//-----------------------------------------------
+__global__ void KernelElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  const int stride = blockDim.x * gridDim.x;  // number of threads in the whole grid
+  int pos = blockIdx.x * blockDim.x + threadIdx.x;
+  for (; pos < size; pos += stride) gpu_data_out[pos] = tanh(gpu_data_in[pos]);
+}
+
+__global__ void KernelElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL *gpu_grad_out, REAL *gpu_grad_in) {
+  for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
+    const REAL y = gpu_data_out[pos];                        // value stored in gpu_data_out for this element
+    gpu_grad_in[pos] = (1.0f - y * y) * gpu_grad_out[pos];   // scale the incoming gradient by (1 - y^2)
+  }
+}
+
+/*
+ * Performs gpu_data_out[i] = tanh(gpu_data_in[i]) for 0 <= i < size (no-op if size <= 0)
+ * where tanh(x) = sinh/cosh = (exp x - exp -x) / (exp x + exp -x) = (exp(2*x) - 1) / (exp(2*x) + 1)
+ */
+void Gpu::ElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return; // nb_threads would be 0 and is used as a divisor below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanh<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+/*
+ * Performs gpu_grad_in[i] = (1 - gpu_data_out[i]**2) * gpu_grad_out[i] for 0 <= i < size (no-op if size <= 0)
+ * which corresponds to the backpropagation of the gradient through tanh.
+ */
+void Gpu::ElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL* gpu_grad_out, REAL *gpu_grad_in) {
+  if (size <= 0) return; // nb_threads would be 0 and is used as a divisor below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanhGrad<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_out, gpu_grad_out, gpu_grad_in);
+}
+
+/*
+ * set GPU memory to a value: assigns val to each of the size REAL elements starting at adr
+ */
+__global__ void KernelMemSet(const int size, REAL *adr, REAL val) {
+  for (int idx=blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    adr[idx] = val;
+  }
+}
+
+void Gpu::MemSet(REAL *adr, REAL val, int size) {
+  if (size <= 0) return; // nothing to set; nb_threads would be 0 and is used as a divisor below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelMemSet<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, adr, val);
+}
+
+//-----------------------------------------------
+// Helpers
+//-----------------------------------------------
+
+void Gpu::ResSet(REAL val) { // writes val into the device-side scalar gpu_result, on Gpu::curStream
+  cudaMemcpyAsync(gpu_result, &val, sizeof(REAL), cudaMemcpyHostToDevice, Gpu::curStream); // NOTE(review): source is a by-value parameter and the status is ignored; presumably safe because pageable host memory is staged before the call returns - confirm
+}
+
+REAL Gpu::ResGet() {
+  REAL val = 0; // host copy of the device-side scalar gpu_result
+  cudaMemcpyAsync(&val, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); // same stream as ResSet() and the kernels, so the copy is ordered after them
+  cudaStreamSynchronize(Gpu::curStream); // wait until val has actually been written
+  return val;
+}
diff --git a/KENLM b/KENLM
deleted file mode 100644
index e69de29..0000000
diff --git a/NBest.cpp b/NBest.cpp
new file mode 100644
index 0000000..0b51e5a
--- /dev/null
+++ b/NBest.cpp
@@ -0,0 +1,585 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+
+#include "NBest.h"
+#include "Tools.h"
+
+#include <sstream>
+#include <algorithm>
+
+// blocks separated by '|||'
+//  0:	sentence id
+//  1:	hypothesis
+//  2:	feature functions
+//  3:	global score
+//  4:	phrase alignments, e.g. 0-1=0-1 2-4=2-3 5=4
+
+// Parses the next line of the n-best list inpf (blocks separated by NBEST_DELIM) and appends the
+// hypothesis to nbest; aux_dim auxiliary values per line are read from auxf. Returns false at the
+// end of input or when a line with a new sentence ID was read: that line (and its auxiliary data)
+// stays buffered in static storage and is parsed by the next call. Returns true otherwise.
+bool NBest::ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int n, const bool need_alignments, const int aux_dim)
+{
+  static string line; // used internally to buffer an input line
+  static int prev_id=-1; // used to detect a change of the n-best ID
+  int new_id;
+  vector<float> f;
+  vector<string> blocks;
+  static REAL* aux_data=NULL; // auxiliary values that belong to the buffered line
+  REAL AuxValue;
+  vector<REAL> aux_data_vec;
+
+  if (line.empty()) {
+     getline(inpf,line);
+     if (inpf.eof()) return false;
+     if (0 < aux_dim)
+     {
+        if (!auxf)  Error("Not enough auxiliary data available");
+        for (int i = 0 ; i<aux_dim ; i++)
+        {
+           auxf >> AuxValue;
+           aux_data_vec.push_back(AuxValue);
+           if (auxf.eof()) return false;
+        }
+     }
+  }
+  else {
+     if (aux_data)
+     {
+        for (int i = 0 ; i<aux_dim ; i++)
+        {
+           aux_data_vec.push_back(aux_data[i]);
+        }
+     }
+  }
+
+  debug1("NBest::ParseLine(): %s\n", line.c_str());
+    // split line into blocks
+  size_t pos=0, epos; // size_t (not a 32-bit uint) so that comparisons with string::npos are exact
+    // and lines of any length are split (the former test "<100000" stopped at that offset)
+  while ((epos=line.find(NBEST_DELIM,pos))!=string::npos) {
+    blocks.push_back(line.substr(pos,epos-pos));
+    pos=epos+strlen(NBEST_DELIM);
+  }
+  blocks.push_back(line.substr(pos,line.size()));
+
+  if (blocks.size()<4) {
+    cerr << "ERROR: can't parse the following line (skipped)" << endl << line << endl;
+    line.clear(); // force read of new line
+    return true;
+  }
+
+  if (need_alignments && blocks.size()<5) {
+    Error("alignments are needed when rescoring phrase-tables");
+  }
+
+    // parse ID
+  new_id=Scan<int>(blocks[0]);
+  if (prev_id>=0 && new_id!=prev_id) {
+      if (!aux_data) aux_data = new REAL[aux_dim];
+      int j=0;
+      for (vector<REAL>::iterator x = aux_data_vec.begin(); x != aux_data_vec.end(); x++) {
+         aux_data[j]= *x;
+         j++;
+      }
+      prev_id=new_id; return false;
+  } // new nbest list has started
+  prev_id=new_id;
+  id=new_id;
+
+  if (n>0 && nbest.size() >= (uint) n) {
+      // the requested number of hypotheses is already stored
+    line.clear();
+    return true; // skip parsing of unused hypos
+  }
+
+    // parse feature function scores: blank-separated tokens, feature names are skipped
+    // (a missing blank after the last token means that it runs to the end of the block)
+  pos=blocks[2].find_first_not_of(' ');
+  while (pos!=string::npos) {
+    epos=blocks[2].find(' ',pos);
+    const bool last=(epos==string::npos);
+    string feat=blocks[2].substr(pos, last ? string::npos : epos-pos);
+    if (feat.find(":",0)!=string::npos || feat.find("=",0)!=string::npos) {
+      // skip feature names (old or new Moses style)
+    }
+    else {
+      f.push_back(Scan<float>(feat));
+    }
+      // continue with the next token, if any
+    pos = last ? string::npos : blocks[2].find_first_not_of(' ',epos+1);
+  }
+  //cerr << " FOUND " << f.size() << " features" << endl;
+
+#ifdef BOLT_NBEST
+  if (blocks.size()>4) { // copy all additional fields to the output
+    string extra_info;
+    for (size_t bb=4; bb<blocks.size(); bb++) {
+      extra_info.append(NBEST_DELIM);
+      extra_info.append(blocks[bb]);
+    }
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), extra_info, aux_data_vec, aux_dim) );
+  }
+  else {
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), aux_data_vec, aux_dim) );
+  }
+#else
+    // eventually parse segmentation
+  if (blocks.size()>4) {
+    vector<Align> a;
+    pos=blocks[4].find_first_not_of(' ');
+
+    debug1("parsing alignment in: %s\n", blocks[4].c_str());
+    blocks[4].append(" "); // simplifies parsing
+
+      // thanks to the append() above, every alignment is followed by a blank
+    while (pos<blocks[4].size() && (epos=blocks[4].find(" ",pos))!=string::npos)
+    {
+      string align_txt=blocks[4].substr(pos,epos-pos);
+
+      debug1(" parsing alignmnent %s:\n",align_txt.c_str());
+      size_t tpos=align_txt.find('=');
+      if (tpos>align_txt.size()) {cerr << align_txt; Error("format error in alignment (no target phrase)"); }
+
+      size_t pos2;
+      int sb,se,tb,te;
+      pos2=align_txt.rfind('-',tpos);
+      if (pos2>align_txt.size()) {
+        debug2(" src: pos %d-%d\n",0,(int) tpos);
+        se=sb=Scan<int>(align_txt.substr(0,tpos));
+      }
+      else {
+        debug2(" sb: pos %d-%d\n",0,(int) pos2);
+        sb=Scan<int>(align_txt.substr(0,pos2));
+        pos=pos2+1; pos2=align_txt.find('=',pos);
+        debug2(" se: pos %d-%d\n",(int) pos,(int) pos2);
+        if (pos2>align_txt.size())  {cerr << align_txt; Error("format error in alignment (end of source phrase)"); }
+        se=Scan<int>(align_txt.substr(pos,pos2-pos));
+      }
+
+      tpos++;
+      pos2=align_txt.find('-',tpos);
+      if (pos2>align_txt.size()) {
+        debug1(" tgt: pos %d\n",(int) tpos);
+        te=tb=Scan<int>(align_txt.substr(tpos));
+      }
+      else {
+        debug2(" tb: pos %d-%d\n",(int) tpos,(int) pos2);
+        tb=Scan<int>(align_txt.substr(tpos,pos2-tpos));
+        te=Scan<int>(align_txt.substr(pos2+1));
+      }
+
+      if (sb<0 || se<0 || tb<0 || te<0 || sb>se || tb>te)  {cerr << align_txt; Error("wrong numbers in alignment"); }
+      debug4(" result %d-%d = %d-%d\n", sb,se,tb,te);
+      a.push_back(Align(sb,se,tb,te));
+
+      pos=blocks[4].find_first_not_of(' ',epos+1);
+    }
+
+    debug1("found %d phrases\n",(int) a.size());
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), a, aux_data_vec, aux_dim) );
+  }
+  else {
+    nbest.push_back(Hypo(id, blocks[1], f, Scan<float>(blocks[3]), aux_data_vec, aux_dim) );
+  }
+#endif
+
+  line.clear(); // force read of new line
+  return true;
+}
+
+
+NBest::NBest(inputfilestream &inpf, inputfilestream &auxf, const int n, const bool need_alignments, const int aux_dim) 
+  : max_req(262144), nreq(0), nb_diff_align(0)
+{
+  debug0("NBEST: constructor called\n");
+  areq = new AlignReq[max_req];  // buffer for the delayed translation model requests
+  bool more_lines = true;        // ParseLine() returns false once this n-best block is complete
+  while (more_lines) more_lines = ParseLine(inpf, auxf, n, need_alignments, aux_dim);
+}
+
+
+NBest::~NBest()
+{
+  debug0("NBEST: destructor called\n");
+  nbest.clear();
+  srcw.clear();
+    // release the request buffer allocated in the constructor (delete[] accepts NULL)
+  delete [] areq;
+}
+
+void NBest::Write(outputfilestream &outf, int n)
+{
+  const int total = (int) nbest.size();  // n<1 or n>total means: write all hypotheses
+  for (int h=0, limit=(n>=1 && n<=total) ? n : total; h<limit; h++) nbest[h].Write(outf);
+}
+
+
+void NBest::CalcGlobal(Weights &w)
+{
+    // forwards the weights w to Hypo::CalcGlobal() of every hypothesis
+  for (size_t h=0; h<nbest.size(); h++) {
+    nbest[h].CalcGlobal(w);
+  }
+}
+
+
+void NBest::Sort() {
+  std::sort(nbest.begin(), nbest.end()); // the order is defined by operator< of Hypo
+}
+
+
+void NBest::AddID(const int o)
+{
+    // forwards the offset o to Hypo::AddID() of every hypothesis
+  const size_t nhyp = nbest.size();
+  for (size_t h=0; h<nhyp; h++) nbest[h].AddID(o);
+}
+
+void NBest::RescoreLM(NbestLM &lm, const int lm_pos)
+{
+    // hand every hypothesis to the LM (feature position lm_pos) ...
+  for (size_t h=0; h<nbest.size(); h++) lm.RescoreHyp(nbest[h],lm_pos);
+    // ... and let it complete whatever it kept pending
+  lm.FinishPending();
+}
+
+#undef OLD // OLD is undefined right here, so the variant between #ifdef OLD and #else is never compiled
+#ifdef OLD
+void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos)
+{
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  for (vector<Hypo>::iterator i = nbest.begin(); i != nbest.end(); i++) {
+     pt.RescoreHyp(*i,srcw,tm_pos); // immediate rescoring, one hypothesis at a time (no request buffer)
+  }
+}
+#else
+
+// Rescores all hypotheses with the phrase table pt: reads the next source sentence from srcf,
+// queues one request per phrase alignment in areq and lets BlockFinish() cumulate the scores.
+void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos)
+{
+  string src; // the source line, segmented into words below
+    // test the read itself: eof() alone is also set after a valid last line without final newline
+  if (!getline(srcf,src))
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+  srcw = Moses::Tokenize<std::string>(src);
+  int nscores = pt.GetNscores();
+  debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos);
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+  vector<float> null_scores(nscores, 0.0);
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the features that will be modified in BlockFinish()
+      // we already append them here if requested
+    if (nscores>1) (*hi).SetFeature(null_scores, tm_pos);
+              else (*hi).SetFeature(0.0, tm_pos);
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    const int nw=(int) hi->trgw.size();
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      if ((*ali).tb<0 || (*ali).tb>=nw || (*ali).te<0 || (*ali).te>=nw) { // trgw[] would be read out of bounds
+        fprintf(stderr,"skipping line with targets out of bound in alignment %d-%d=%d-%d\n",(*ali).sb,(*ali).se,(*ali).tb,(*ali).te);
+        continue;
+      }
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]);
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(pt,tm_pos);
+    }
+  }
+  BlockFinish(pt,tm_pos);
+}
+#endif
+
+void NBest::RescorePtableInv(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos)
+{
+  Error("NBest::RescorePtableInv"); // NOTE(review): reported unconditionally; if Error() does not return, everything below is unreachable - confirm
+    // get a source line and segment into words
+  string src;
+  getline(srcf,src);
+  if (srcf.eof())
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  int nscores = pt.GetNscores();
+  debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos); // message names RescorePtable: the body below repeats that function
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+
+  vector<float> null_scores(nscores, 0.0);
+
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the features that will be modified in BlockFinish()
+      // we already append them here if requested
+    if (nscores>1) (*hi).SetFeature(null_scores, tm_pos);
+              else (*hi).SetFeature(0.0, tm_pos);
+    
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]); // tb..te are not checked against trgw.size() here
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(pt,tm_pos);
+    }
+  }
+  BlockFinish(pt,tm_pos);
+}
+
+  // qsort() comparison of two AlignReq: by sb, then se, then length of tgph, then the words of tgph
+int AlignReqComp(const void *v1, const void *v2)
+{
+  const AlignReq &lhs = *(const AlignReq*) v1;
+  const AlignReq &rhs = *(const AlignReq*) v2;
+
+  if (lhs.sb != rhs.sb) return (lhs.sb < rhs.sb) ? -1 : 1;
+  if (lhs.se != rhs.se) return (lhs.se < rhs.se) ? -1 : 1;
+
+  const size_t len = lhs.tgph.size();
+  if (len != rhs.tgph.size()) return (len < rhs.tgph.size()) ? -1 : 1;
+
+  for (size_t w=0; w<len; w++) {
+    if (lhs.tgph[w] < rhs.tgph[w]) return -1;
+    if (lhs.tgph[w] > rhs.tgph[w]) return  1;
+  }
+  return 0; // both are equal
+}
+
+  // qsort()-style comparison of two AlignReq on the source span only:
+  // by sb first, then by se; the target phrase is ignored
+int AlignReqCompSrc(const void *v1, const void *v2)
+{
+  const AlignReq &lhs = *(const AlignReq*) v1;
+  const AlignReq &rhs = *(const AlignReq*) v2;
+
+  if (lhs.sb != rhs.sb) return (lhs.sb < rhs.sb) ? -1 : 1;
+  if (lhs.se != rhs.se) return (lhs.se < rhs.se) ? -1 : 1;
+
+  return 0; // both are equal
+}
+  
+
+// Returns log(pt.GetProb()) for the phrase pair made of the source words aq.sb..aq.se of srcw and
+// the target phrase aq.tgph. With logP_v, the scores are stored there, replaced by their logs,
+// and the first one is returned.
+float NBest::GetAlignProb(PtableMosesPtree &pt, AlignReq &aq, const int tm_pos, vector<float> *logP_v) // TODO: param tm_pos is unused
+{
+  debug1("TGT: %s\n", aq.hyp->trg.c_str());
+  debug4("ALIGN %d-%d = %s-%s\n", aq.sb, aq.se, aq.tgph[0].c_str(), aq.tgph.back().c_str());
+
+  if (aq.se >= (int) srcw.size()) Error("phrase table rescoring: last source word in phrase is out of bounds\n");
+
+    // build up current source phrase pair, TODO: switch to reference ?
+  vector<string> srcph;
+  for (int sw=aq.sb; sw<=aq.se; sw++) srcph.push_back(srcw[sw]);
+
+  if (!logP_v) return log(pt.GetProb(srcph,aq.tgph)); // single score requested
+
+    // several scores: fetch them all, then take the log of each one in place
+  pt.GetProb(srcph,aq.tgph,logP_v);
+  vector<float> &scores = *logP_v;
+  for (size_t s=0; s<scores.size(); s++) scores[s] = log(scores[s]);
+  return scores[0];
+}
+
+// Processes the nreq requests cumulated in areq: sorts them, queries the phrase table once per
+// distinct phrase pair, adds the log-scores to the hypotheses and finally empties the buffer.
+void NBest::BlockFinish(PtableMosesPtree &pt, int tm_pos)
+{
+  debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size());
+  if (nreq==0) return;
+
+    // NOTE(review): tgph looks like a std::vector member; qsort() formally expects trivially copyable elements
+  qsort(areq, nreq, sizeof(AlignReq), AlignReqComp);
+
+  int nscores = pt.GetNscores();
+  int cnt=1;
+
+  if (tm_pos==0) tm_pos=areq[0].hyp->f.size()-nscores+1; // correct position in append mode
+  debug2("cumulating %d scores starting at position %d\n", nscores, tm_pos);
+
+    // request phrase probas for the first alignment
+  if (nscores>1) {
+    vector<float> logP_scores(nscores, 0.0);
+    debug4("request align 0: %d-%d %s-%s (several scores)\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str());
+    GetAlignProb(pt,areq[0],tm_pos, &logP_scores);
+    areq[0].hyp->AddFeature(logP_scores,tm_pos);
+    for (int n=1; n<nreq; n++) {
+      if (AlignReqComp(areq+n-1, areq+n) != 0) {
+          // new alignment pair -> calculate new logP
+        debug5("request align %d: %d-%d %s-%s\n", cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str());
+        GetAlignProb(pt,areq[n],tm_pos, &logP_scores);
+        cnt++;
+      }
+      areq[n].hyp->AddFeature(logP_scores,tm_pos);	// cumulate
+    }
+  }
+  else {
+    debug4("request align 0: %d-%d %s-%s\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str());
+    float logP = GetAlignProb(pt,areq[0],tm_pos);
+    areq[0].hyp->AddFeature(logP,tm_pos);
+    for (int n=1; n<nreq; n++) {
+      if (AlignReqComp(areq+n-1, areq+n) != 0) {
+          // new alignment pair -> calculate new logP
+        debug5("request align %d: %d-%d %s-%s\n", cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str());
+        logP = GetAlignProb(pt,areq[n],tm_pos);
+        cnt++;
+      }
+      areq[n].hyp->AddFeature(logP,tm_pos);	// cumulate
+    }
+  }
+  debug1(" %d different alignments\n", cnt);
+  nb_diff_align += cnt;
+    // all requests are done: empty the buffer, otherwise the callers keep writing behind
+    // areq[max_req-1] after a flush and a later rescoring pass would process these requests again
+  for (int n=0; n<nreq; n++) areq[n].tgph.clear();
+  nreq=0;
+}
+
+int NBest::NbPhrases()
+{
+    // sum of Hypo::NbPhrases() over all hypotheses of this n-best list
+  int total=0;
+  for (size_t h=0; h<nbest.size(); h++) {
+    total += nbest[h].NbPhrases();
+  }
+  return total;
+}
+
+//**********************************************************
+//
+// caching algorithm for TM rescoring with CSTM
+//
+//**********************************************************
+
+
+// this is identical to Moses ptable rescoring, we just call a different BlockFinish
+void NBest::RescorePtable(NbestCSTM &cstm, ifstream &srcf, const int tm_pos)
+{
+    // get a source line and segment into words
+  string src;
+    // test the read itself: eof() alone is also set after a valid last line without final newline
+  if (!getline(srcf,src))
+    ErrorN("EOF in source text for n-best hypothesis id=%d", id);
+
+  srcw.clear();
+  srcw = Moses::Tokenize<std::string>(src);
+
+  debug1("NBest::RescorePtable(): CSTM score at position %d\n", tm_pos);
+  debug2("SRC with %d words: %s\n", (int) srcw.size(),  src.c_str());
+
+  for (vector<Hypo>::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) {
+      // reset the feature that will be modified in BlockFinish()
+      // we already append it here if requested
+    (*hi).SetFeature(0.0, tm_pos);
+
+    hi->trgw = Moses::Tokenize<std::string>(hi->trg);
+    int nw=(int) hi->trgw.size();
+    debug2("CSTM token target: %s  %d words\n", hi->trg.c_str(), nw);
+    for (vector<Align>::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) {
+      areq[nreq].sb = (*ali).sb;
+      areq[nreq].se = (*ali).se;
+      debug5("CSTM process areq %d, src: %d-%d, tgt: %d-%d\n",nreq,(*ali).sb,(*ali).se,(*ali).tb,(*ali).te);
+      if ((*ali).tb<0 || (*ali).tb>=nw || ((*ali).te<0 || (*ali).te>=nw)) {
+        fprintf(stderr,"skipping line with targets out of bound in alignment %d-%d=%d-%d\n",(*ali).sb,(*ali).se,(*ali).tb,(*ali).te);
+        continue; // nreq is not advanced: the slot is simply reused by the next alignment
+      }
+      for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]);
+      cstm.LookupTarget(areq[nreq].tgph, areq[nreq].tgwid); // TODO: this is inefficient, the same target will appear many times
+      areq[nreq].hyp=&(*hi);
+      if (++nreq >= max_req) BlockFinish(cstm,tm_pos);
+    }
+  }
+  BlockFinish(cstm,tm_pos);
+}
+
+// inverse rescoring with a CSTM: no rescoring code here, the function only calls Error()
+void NBest::RescorePtableInv(NbestCSTM &cstm, ifstream &srcf, const int tm_pos)
+{
+  Error("NBest::RescorePtableInv()");
+}
+
+// Processes the nreq requests cumulated in areq with the CSTM: the requests are sorted, every distinct
+// source phrase gets one slot of the forward bunch, and the request buffer is emptied at the end.
+void NBest::BlockFinish(NbestCSTM &cstm, int tm_pos)
+{
+  debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size());
+  if (nreq==0) return;
+  int bsize=cstm.mach->GetBsize();
+  qsort(areq, nreq, sizeof(AlignReq), AlignReqComp);
+
+  if (tm_pos==0) tm_pos=areq[0].hyp->f.size(); // correct position in append mode
+  debug1("cumulating 1 score starting at position %d\n", tm_pos);
+
+  vector<string> srcph;				// one source phrase
+  vector< vector<string> > src_phrases;		// all possible source phrase in this block, size
+
+    // process first phrase pair
+  areq[0].bs=0;
+  cstm.AddToInput(0,srcw,areq[0].sb,areq[0].se);
+  srcph.clear();
+  for (int w=areq[0].sb; w<=areq[0].se; w++) srcph.push_back(srcw[w]);
+  src_phrases.push_back(srcph);
+
+  int cnt=1;
+  int req_beg=0;	// start of current CSLM block in large request array
+  int bs=0;             // current block index in forward bunch
+
+  for (int n=1; n<nreq; n++) {
+    if (AlignReqCompSrc(areq+n-1, areq+n) != 0) { // new source phrase 
+        // first process bunch if full
+      bs++;
+      debug1("   %d new context\n", bs);
+      if (bs >= bsize) {
+        cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,n-1,bs,tm_pos);
+        bs=0; req_beg=n;
+      }
+          // add new source phrase to bunch for forward pass
+          // REMARK: this is not perfect since some of the examples may be out of slist and we actually wouldn't
+          //         need a forward pass for them. However, all request of an n-best block must be performed before
+          //         we go to the next n-best block, In practice there are often less than 128 difference source phrases.
+          //         Therefore, we only do one forward pass anyway
+      areq[n].bs=bs;
+      cstm.AddToInput(bs,srcw,areq[n].sb,areq[n].se);
+      srcph.clear();
+      for (int w=areq[n].sb; w<=areq[n].se; w++) srcph.push_back(srcw[w]);
+      src_phrases.push_back(srcph);
+      cnt++;
+    }
+    else
+      areq[n].bs=bs;
+  }
+  cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,nreq-1,bs+1,tm_pos);
+  printf(" %d different source phrases\n", cnt);
+  nb_diff_align += cnt;
+    // all requests are done (assumes ForwAndCollect() has finished with areq on return - confirm):
+    // empty the buffer, otherwise the caller keeps writing behind areq[max_req-1] after a flush
+  for (int n=0; n<nreq; n++) areq[n].tgph.clear();
+  nreq=0;
+}
+
diff --git a/NBest.h b/NBest.h
new file mode 100644
index 0000000..51c4628
--- /dev/null
+++ b/NBest.h
@@ -0,0 +1,73 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ */
+
+#ifndef _NBEST_H_
+#define _NBEST_H_
+
+using namespace std;
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "Toolsgz.h"
+#include "Hypo.h"
+#include "NbestLM.h"
+#include "NbestCSTM.h"
+#include "PtableMosesPtree.h"
+
+#include "AlignReq.h"
+
+class NBest { // NOTE(review): areq is allocated with new[] in the constructor and freed in the destructor, but no copy constructor/assignment is declared - copying an NBest would free areq twice; confirm it is never copied
+  int 		   id;	// sentence ID of the lines parsed by ParseLine()
+  vector<string>   srcw;	// source sentence parsed into words (only available for TM rescoring)
+  vector<Hypo> nbest;	// the hypotheses of this n-best list
+  bool ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int, const bool, const int); // parses one line; false when the block is complete
+    // Delayed translation model rescoring
+  int max_req;			// max number of request cumulated before we perform them in a block
+  int nreq;			// current number of request cumulated
+  AlignReq *areq;		// array to allocate all requests
+  int nb_diff_align;		// stats
+ public:
+  NBest(inputfilestream&, inputfilestream& , const int=0, const bool =false , const int=0); // reads lines until ParseLine() returns false
+  ~NBest();
+  int NbNBest() {return nbest.size(); } // number of stored hypotheses
+  int NbPhrases(); // sum of NbPhrases() over all hypotheses
+  int NbDiffPhrases() {return nb_diff_align; } // counter cumulated by the BlockFinish() functions
+  void CalcGlobal(Weights&);
+  void Sort(); // largest values first
+  void Write(outputfilestream&, int=0); // writes the first n hypotheses, all of them if n<1
+  void AddID(const int offs);
+  void RescoreLM(NbestLM&, const int); // recalc LM score on hypothesis (uses optional auxiliary data)
+    // Delayed translation model rescoring with on disk phrase table
+  void RescorePtable(PtableMosesPtree&, ifstream&, const int);
+  void RescorePtableInv(PtableMosesPtree&, ifstream&, const int);
+  void BlockFinish(PtableMosesPtree&, int);
+  REAL GetAlignProb(PtableMosesPtree&, AlignReq&, const int, vector<float>* = NULL); // NOTE(review): defined with return type float in NBest.cpp - only consistent if REAL is float
+    // Delayed translation model rescoring with CSTM
+  void RescorePtable(NbestCSTM&, ifstream&, const int);
+  void RescorePtableInv(NbestCSTM&, ifstream&, const int);
+  void BlockFinish(NbestCSTM&, int);
+  void ForwAndCollect(int, int, int); // NOTE(review): no definition of this member is visible in NBest.cpp - confirm it is still needed
+};
+
+
+#endif
diff --git a/NbestCSTM.cpp b/NbestCSTM.cpp
new file mode 100644
index 0000000..d44dc8b
--- /dev/null
+++ b/NbestCSTM.cpp
@@ -0,0 +1,123 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+using namespace std;
+
+#include <string>
+
+#include "Tools.h"
+#include "Hypo.h"
+#include "NbestCSTM.h"
+#include "ErrFctSoftmCrossEntNgramMulti.h"
+
+
+NbestCSTM::~NbestCSTM() {
+  delete mach;		// deleting a NULL pointer is a no-op, no test needed
+  delete trainer;
+}
+
+
+void NbestCSTM::Read(char *fname, char *wl_src_fname, char *wl_tgt_fname, char *pt_fname, int nscores, char *scores_specif)
+{
+    // read the machine from a binary file and call its Info() function
+  ifstream ifs(fname, ios::binary);
+  CHECK_FILE(ifs,fname);
+  mach = Mach::Read(ifs);
+  ifs.close();
+  mach->Info();
+
+    // source vocabulary: the words must be in exactly the same order as in extract2bin !!!
+  cout << " - reading source word list from file " << wl_src_fname << flush;
+  src_wlist.SetSortBehavior(this->stable_sort);
+  src_wlist.Read(wl_src_fname);
+  cout << ", got " << src_wlist.GetSize() << " words" << endl;
+
+    // target vocabulary: the words must be in exactly the same order as in extract2bin !!!
+  cout << " - reading target word list from file " << wl_tgt_fname << flush;
+  tgt_wlist.SetSortBehavior(this->stable_sort);
+  tgt_wlist.Read(wl_tgt_fname);
+  cout << ", got " << tgt_wlist.GetSize() << " words" << endl;
+
+    // the trainer gets the machine, both word lists and the optional external phrase table
+  trainer = new TrainerPhraseSlist(mach, &src_wlist, &tgt_wlist, pt_fname, nscores, scores_specif);
+}
+
+void NbestCSTM::AddToInput(int b, vector<string> &vsrcw, int sb, int se)
+{ // stores the indices of the source words vsrcw[sb..se] as example number b in the input buffer of the trainer
+  int idim=mach->GetIdim();
+  if (se-sb+1 > idim) {	// the phrase sb..se (both included) has se-sb+1 words
+    ErrorN("NbestCSTM::AddToInput(): source phrase too long (%d) for machine (%d)\n", se-sb+1, idim);
+  }
+    // each example uses idim consecutive values of the input buffer
+  REAL *iptr=trainer->GetBufInput() + b*idim;
+  int i=0;	// number of values written for this example
+
+  // get index of each source word, words which are not in the source word list are replaced by the unknown word
+  debug0("NbestCSTM::AddToInput():");
+  REAL unk_wi = (REAL) src_wlist.GetIndex(WordList::WordUnknown);
+  for (int w=sb; w<=se; w++) {
+    WordList::WordIndex wi = src_wlist.GetIndex(vsrcw[w].c_str());
+    if (wi==WordList::BadIndex) {
+      fprintf(stderr, "ERROR: source word not found: %s\n", vsrcw[w].c_str());
+      *iptr++ = unk_wi;
+    }
+    else 
+      *iptr++ = (REAL) wi;
+    debug2(" %s->%f", vsrcw[w].c_str(), iptr[-1]);
+    i++;
+  }
+  debug0("\n");
+
+  // fill up input phrase to the dimension of the machine
+  for (; i<idim; i++) *iptr++=NULL_WORD;
+}
+ 
+void NbestCSTM::LookupTarget(vector<string> &vtrgw, WordID *wid)
+{ // maps the target words vtrgw to their indices in wid, wid must provide room for trainer->GetTgtNbPhr() values
+  int nph=trainer->GetTgtNbPhr();
+  int vdim=(int) vtrgw.size();
+
+  if (vdim>nph) {
+    ErrorN("NbestCSTM::LookupTarget(): phrase (%d) exceeds length of machine (%d)\n",vdim, nph);
+  }
+  
+  int i;	// declared outside of the loop since it is also used to fill up wid
+  debug0("NbestCSTM::LookupTarget():");
+  for (i=0; i<vdim; i++) {
+    WordList::WordIndex wi = tgt_wlist.GetIndex(vtrgw[i].c_str());
+    if (wi==WordList::BadIndex) {
+      //ErrorN("ERROR: target word not found: %s\n", vtrgw[i].c_str());
+      // TODO: count these events
+      
+      // this has as effect that the word won't be processed by the CSTM (out of short list)
+      // maybe the external phrase table knows it?
+      wid[i]=trainer->GetSlistLen();
+    }
+    else
+      wid[i] = (WordID) wi;
+    debug2(" %s->%d", vtrgw[i].c_str(), wid[i]);
+  }
+  debug0("\n");
+
+  // fill up the remaining positions (up to the number of words the machine can predict)
+  for (; i<nph; i++) wid[i] = NULL_WORD;
+}
diff --git a/NbestCSTM.h b/NbestCSTM.h
new file mode 100644
index 0000000..8ebf495
--- /dev/null
+++ b/NbestCSTM.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+
+#ifndef _NBESTCSTM_H_
+#define _NBESTCSTM_H_
+
+using namespace std;
+
+#include "Mach.h" // from the CSTM toolkit
+#include "TrainerPhraseSlist.h" 
+#include "WordList.h"
+
+class NbestCSTM {
+private:
+  WordList src_wlist;		// source vocabulary, filled by Read()
+  WordList tgt_wlist;		// target vocabulary, filled by Read()
+  Mach *mach;			// the machine, NULL until Read() was called, deleted by the destructor
+  TrainerPhraseSlist *trainer;	// created by Read(), NULL before, deleted by the destructor
+  bool stable_sort;	// use stable sort (default=true), set to false for compatibility with CSLM <= V3.0
+public:
+  NbestCSTM() : src_wlist(true), tgt_wlist(true), mach(NULL), trainer(NULL), stable_sort(true) {}
+  virtual ~NbestCSTM();
+  virtual void SetSortBehavior(bool val) {stable_sort=val;}	// Read() passes this value to both word lists
+  virtual void Read (char*, char*, char* , char*, int, char*);
+  virtual void AddToInput(int, vector<string> &, int, int);
+  virtual void LookupTarget(vector<string> &v, WordID *);
+  virtual void Stats() {if (trainer) trainer->BlockStats();}	// no trainer (and no stats) before Read() was called
+  friend class NBest;
+};
+
+#endif
diff --git a/Ptable.h b/Ptable.h
new file mode 100644
index 0000000..6b518f7
--- /dev/null
+++ b/Ptable.h
@@ -0,0 +1,49 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _Ptable_h
+#define _Ptable_h
+
+using namespace std;
+
+#include <string>
+#include <vector>
+#include "Tools.h"		// for type REAL
+//#include "DataNgramBin.h"	// for type WordID
+
+// interface class to classical phrase tables
+//
+//
+
+#define NULL_LN_PROB (1.0)   // this value must not be possible as a normal return value of ln Prob
+
+class Ptable {
+ private:
+ public:
+  Ptable(const int, const int=2, const bool=false) {}				// initialize (the arguments are not used here)
+  virtual ~Ptable() {}
+  virtual void Read(const string &) {}						// read from file (does nothing here)
+  virtual REAL GetProb(vector<string>&, vector<string>&) {return 0;}		// get proba for two sequences of words (always 0 here)
+  //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} 
+};
+
+#endif
diff --git a/PtableMosesPtree.cpp b/PtableMosesPtree.cpp
new file mode 100644
index 0000000..40efd99
--- /dev/null
+++ b/PtableMosesPtree.cpp
@@ -0,0 +1,194 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#include "PtableMosesPtree.h"
+
+
+PtableMosesPtree::~PtableMosesPtree ()
+{   // each phrase table was allocated with new in Read(): release its memory and delete the object itself
+  for (size_t p=0; p<ptree.size(); p++) { ptree[p]->FreeMemory(); delete ptree[p]; }
+  ptree.clear();
+}
+
+//
+// read a new phrase table
+//
+void PtableMosesPtree::Read(const string &fname, const int p_nscores, const char *scores_specif)
+{
+    // scores_specif must have the format "<n>:<pos>": n scores (1..4) are returned, starting at position pos (one digit)
+    // the length is tested first, so that scores_specif[2] is never read behind the end of the string
+  if (strlen(scores_specif)<3 || scores_specif[1]!=':' || scores_specif[2]<'0' || scores_specif[2]>'9')
+    Error("format error in the specification of the TM scores");
+  if (scores_specif[0]<'1' || scores_specif[0]>'4')
+    Error("wrong value for the number of TM scores");
+  const int n=scores_specif[0]-'0', pos=scores_specif[2]-'0';
+
+  if (!ptree.empty() && nscores!=n)	// all the phrase tables must return the same number of scores
+    Error("PtableMosesPtree::Read(): inconsistent number of scores to be returned from multiple phrase tables");
+  nscores=n;
+  if (nscores > p_nscores)	// NOTE(review): pos+nscores is not compared with p_nscores - confirm whether this is wanted
+    Error("PtableMosesPtree::Read(): the number of scores to be returned exceeds the number of available ones");
+
+  ptree.push_back(new Moses::PhraseDictionaryTree);
+  pos_scores.push_back(pos);
+
+  ptree.back()->NeedAlignmentInfo(false);
+  cout << " - loading Moses binary phrase table from file " << fname << " with " << p_nscores << " scores" << endl;
+  ptree.back()->Read(fname);
+  cout << "   using " << nscores << " scores starting at position " << pos_scores.back() << endl;
+  tgtcands.clear();
+}
+
+
+//
+// Get probabilities from the phrase-tables
+//  - scores=NULL:	return either one value as a function result
+//  - scores!=NULL:	return a sequence of values in that vector (as many as the vector has space)
+//
+
+REAL PtableMosesPtree::GetProb(vector<string> &src, vector<string> &tgt, vector<float> *scores)
+{
+    // the phrase tables are searched in the order in which they were read, the first one which knows the phrase pair wins
+    // the function result is always the score at the starting position configured for that phrase table
+  uint w;	// word position, used by the debug output and by the comparison of the phrases
+
+#ifdef DEBUG
+  cout << "Ptable prob:";
+  for (w=0; w<src.size(); w++) cout << " " << src[w];
+  cout << " |||";
+  for (w=0; w<tgt.size(); w++) cout << " " << tgt[w];
+  cout << " ||| " << endl;
+#endif
+
+    // an optional result vector must request between 1 and nscores values
+  if (scores) {
+    if (scores->size() == 0)
+      Error("PtableMosesPtree::GetProb() parameter scores has zero dimension");
+    if ((int) scores->size() > nscores)
+      Error("PtableMosesPtree::GetProb() requesting too much scores form the phrase table");
+  }
+
+  for (uint tab=0; tab<ptree.size(); tab++) {
+      // ask the current phrase table for all the translations of the source phrase
+    tgtcands.clear();
+    ptree[tab]->GetTargetCandidates(src, tgtcands);
+    debug2(" - phrase table %u has %d candidates:\n", tab, (int) tgtcands.size());
+    const size_t pos=pos_scores[tab];	// position of the first score to return
+
+      // look for the candidate which consists of exactly the words of our target phrase
+    for (uint cand=0; cand<tgtcands.size(); cand++) {
+      if (tgt.size() != tgtcands[cand].tokens.size()) continue;
+
+        // stop at the first word which differs
+      for (w=0; w<tgt.size(); w++)
+        if (tgt[w] != *(tgtcands[cand].tokens[w])) break;
+      if (w<tgt.size()) continue;	// not the same phrase, try the next candidate
+
+      debug5("     found phrase of length %u/%u at pos %d out of %d, p=%f\n", (uint) src.size(), (uint) tgt.size(), cand, (int) tgtcands.size(), tgtcands[cand].scores[pos]);
+      if (scores) {
+        for (uint k=0; k<scores->size(); k++) {
+          (*scores)[k]=tgtcands[cand].scores[pos+k]; // return sequence of scores
+          debug2(" score[%u]: %f\n",k, (*scores)[k]);
+        }
+      }
+      return tgtcands[cand].scores[pos];
+    }
+  }
+
+    // the phrase pair wasn't found in any phrase table:
+    // do we have an unknown word which was copied to the target ?
+  const bool unk_copied = (src.size()==1 && tgt.size()==1 && src[0]==tgt[0]);
+  if (unk_copied) {
+    debug0("     UNK: source copied to target\n");
+    if (scores)
+      scores->assign(scores->size(), PROBA_COPY_UNK);	// same value for all the requested scores
+    return PROBA_COPY_UNK;
+  }
+
+#ifdef DEBUG
+  cout << "ERROR: can't find the following phrase pair in the external phrase tables: SETTING PROBA TO " << PROBA_NOT_IN_PTABLE << endl;
+  for (w=0; w<src.size(); w++) cout << " " << src[w];
+  cout << " |||";
+  for (w=0; w<tgt.size(); w++) cout << " " << tgt[w];
+  cout << " ||| " << endl;
+#endif
+
+    // unknown phrase pair: all the requested scores get the same default value
+  if (scores)
+    scores->assign(scores->size(), PROBA_NOT_IN_PTABLE);
+  return PROBA_NOT_IN_PTABLE;
+}
+
+/*
+void PtableMosesPtree::BlockEval (Hypo &hyp, vector<string> &srcw, const int pos)
+{
+}
+*/
+
+// Helper of RescoreHyp(): checks the limits of the alignment al and copies the aligned
+// source words srcw[sb..se] into srcph and the aligned target words trgw[tb..te] into trgph
+static void PtableCopyPhrasePair(const Align &al, vector<string> &srcw, vector<string> &trgw,
+				 vector<string> &srcph, vector<string> &trgph)
+{
+  const int nws=srcw.size(), nwt=trgw.size();
+  if (al.sb<0) Error("phrase table rescoring: first source word in phrase out of bounds\n");
+  if (al.tb<0) Error("phrase table rescoring: first target word in phrase out of bounds\n");
+  if (al.se>=nws) Error("phrase table rescoring: last source word in phrase out of bounds\n");
+  if (al.te>=nwt) Error("phrase table rescoring: last target word in phrase out of bounds\n");
+
+  debug4("ALIGN %d-%d = %d-%d\n", al.sb, al.se, al.tb, al.te);
+  srcph.clear();
+  for (int w=al.sb; w<=al.se; w++) srcph.push_back(srcw[w]);
+  trgph.clear();
+  for (int w=al.tb; w<=al.te; w++) trgph.push_back(trgw[w]);
+}
+
+// Sums the log of the phrase table scores over all the phrase pairs of the hypothesis hyp
+// (source words in srcw) and stores the result with hyp.SetFeature() at position pos
+void PtableMosesPtree::RescoreHyp (Hypo &hyp, vector<string> &srcw, const int pos)
+{
+  debug1("TGT: %s\n", hyp.trg.c_str());
+  vector<string> trgw = Moses::Tokenize<std::string>(hyp.trg);
+  int nws=srcw.size(), nwt=trgw.size();
+  debug3("Ptable rescoring with %d source and %d target words, %d phrases\n", nws, nwt, (int) hyp.a.size());
+  vector<string> srcph, trgph;  // needed to build up current phrase pair
+
+  if (nscores>1) { 
+    vector<float> res(nscores,0.0);	// the scores of the current phrase pair
+    vector<float> logP(nscores,0.0);	// sum of the log scores of all the phrase pairs
+
+    for (vector<Align>::iterator al=hyp.a.begin(); al!=hyp.a.end(); al++) {
+      PtableCopyPhrasePair(*al, srcw, trgw, srcph, trgph);
+      GetProb(srcph,trgph,&res); // TODO: this is very inefficient, we should group together request for the same source phrase
+      for (int i=0; i<nscores; i++) logP[i] += log(res[i]);
+    }
+    hyp.SetFeature(logP,pos);
+  }
+  else {
+    REAL logP=0;	// we request only one score from the phrase table
+    for (vector<Align>::iterator al=hyp.a.begin(); al!=hyp.a.end(); al++) {
+      PtableCopyPhrasePair(*al, srcw, trgw, srcph, trgph);
+      logP+=log(GetProb(srcph,trgph)); // TODO: this is very inefficient, we should group together request for the same source phrase
+    }
+    hyp.SetFeature(logP,pos);
+  }
+}
diff --git a/PtableMosesPtree.h b/PtableMosesPtree.h
new file mode 100644
index 0000000..53f0632
--- /dev/null
+++ b/PtableMosesPtree.h
@@ -0,0 +1,77 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _PtableMosesPtree_h
+#define _PtableMosesPtree_h
+
+using namespace std;
+
+#include "Ptable.h"
+#include "Hypo.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+// from Moses:
+#include <TranslationModel/PhraseDictionaryTree.h>
+#include <Util.h>
+
+
+// interface class to Moses binary on-disk phrase tables
+// (implementation with a prefix tree)
+
+const REAL PROBA_COPY_UNK = 1;	// translation probability which is used when an unknown word was copied from source to target
+const REAL PROBA_NOT_IN_PTABLE = 1e-20;	// translation probability which is used when a phrase pair is not found in the Moses phrase table
+					// (this can happen when some words are mapped to <unk> because of limited source or target vocabularies)
+
+//
+// helper class to store and compare Phrase requests
+// ugly C-style structure, but this seems to be more efficient
+
+/*
+struct PhraseReq {
+  Align	a;
+  vector<string>  &trgw;
+  int cnt;
+  REAL *res_ptr;
+};
+*/
+
+class PtableMosesPtree {
+ private:
+   vector<Moses::PhraseDictionaryTree*> ptree;	// main and eventually secondary phrase tables
+   vector<int> pos_scores;			// starting position of the scores to be returned from each phrase table
+   int nscores;					// number of scores to be returned (must be same for all phrase-tables), 0 until Read() was called
+   vector<Moses::StringTgtCand> tgtcands;		// candidates of the last request, filled by GetProb()
+ public:
+  PtableMosesPtree() : nscores(0) {}
+  virtual ~PtableMosesPtree();
+  virtual void Read(const string &, const int, const char*);		// read next phrase table from file
+  virtual REAL GetProb(vector<string>&, vector<string>&, vector<float> * =NULL);		// return one proba for a tokenized phrase-pair or vector of scores
+  //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} 
+  virtual void RescoreHyp (Hypo&, vector<string> &, const int);
+  virtual int GetNscores() {return nscores; }
+};
+
+#endif
diff --git a/TrainerPhraseSlist.cpp b/TrainerPhraseSlist.cpp
new file mode 100644
index 0000000..88010f9
--- /dev/null
+++ b/TrainerPhraseSlist.cpp
@@ -0,0 +1,1164 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ */
+
+using namespace std;
+#include <iostream>
+#include <algorithm>
+#include <unistd.h>
+#include <time.h>
+
+#include "Tools.h"
+#include "Mach.h"
+#include "MachTab.h"
+#include "MachPar.h"
+#include "MachSeq.h"
+#include "MachSplit.h"
+#include "TrainerPhraseSlist.h"
+#include "ErrFctSoftmCrossEntNgram.h"
+
+#include "NBest.h" 
+#include "sort.cpp" 
+
+// activate mapping of input
+// not really necessary, may only speed up calculations due to cache locality
+// if you activate this option, you must do so for all your networks
+#undef TRAINER_PHASE_SLIST_MAP_INPUT
+
+void TrainerPhraseSlist::DoConstructorWork()
+{ // initialization which is shared by all the constructors: buffers, checks of the machine architecture, word lists
+  idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize();
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer");
+  host_output = new REAL[odim*bsize];
+#endif
+  buf_target_wid = new WordID[odim*bsize];	// TODO: those are actually too big, we need tg_nbphr*bsize ??
+  buf_target_ext = new WordID[odim*bsize];
+  buf_target_in_blocks = new REAL[odim*bsize];
+
+    // set up vector to outputs of the target phrases
+  if (mach->GetMType() != file_header_mtype_mseq)
+    Error("CSTM: sequential machine needed\n");
+  MachSeq *mseq=(MachSeq*) mach;
+  if (mseq->MachGetNb()<2)
+    Error("CSTM: the number of machines is suspiciously small");
+
+    // check input layer: the first machine must be a parallel machine which starts with a table
+  if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar)
+    Error("CSTM: the input layer has the wrong architecture\n");
+  MachPar *mpar = (MachPar*) mseq->MachGet(0);
+  if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab)
+    Error("CSTM: the input layer has the wrong architecture\n");
+  MachTab *mtab = (MachTab*) mpar->MachGet(0);
+  max_inp_idx = mtab->GetMaxInpVal();
+
+    // check output layer: the last machine must be a split machine with one sub-machine per target word
+  if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit)
+    Error("CSTM: the output layer has the wrong architecture\n");
+  MachSplit *msp = (MachSplit*) mseq->MachGet(mseq->MachGetNb()-1);
+  tg_nbphr=msp->MachGetNb();
+  if (data_train && (data_train->GetOdim() != tg_nbphr)) {
+    ErrorN("CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim());
+  }
+
+  cout << " - using cross entropy for each output vector" << endl;
+  phrase_mach.clear();
+  mach_errfct.clear();
+  for (int m=0; m<tg_nbphr; m++) {
+    phrase_mach.push_back(msp->MachGet(m));
+    if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim())
+      Error("CSTM: the output layer dimension must be identical for all phrases\n");
+    //ErrFctSoftmCrossEntNgram *tmp=dynamic_cast<ErrFctSoftmCrossEntNgram*>(errfct);
+    //mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*tmp));	// create copy of user specified error function
+    mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*phrase_mach[m]));	// each machine gets its own error function with local mem for grad
+#ifdef BLAS_CUDA
+    Gpu::SetConfig(mach_errfct[m]->GetGpuConfig());
+    gpu_target.push_back(Gpu::Alloc(bsize*sizeof(REAL), "targets in Trainer"));	// NOTE(review): gpu_input above is allocated without sizeof(REAL) - confirm the unit expected by Gpu::Alloc()
+#endif
+  }
+  dim_per_phrase = phrase_mach[0]->GetOdim();
+  cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl;
+  tg_slist_len = dim_per_phrase-1;
+
+
+    // get source word list (if it wasn't provided to the constructor, it is taken from the dev or the training data)
+  if (sr_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetSrcWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetSrcWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      sr_wlist = &(vect_wlist->front());
+  }
+  if (sr_wlist == NULL)
+    Error("no source word list available");
+  if ((int) sr_wlist->GetSize() > max_inp_idx)
+    ErrorN("the size of the source word list (%d) exceeds the number of input words the machine was trained for (%d)",(int) sr_wlist->GetSize(),max_inp_idx);
+  debug1("* using source word list with %d words\n",(int)sr_wlist->GetSize());
+
+    // get target word list (if it wasn't provided to the constructor, it is taken from the dev or the training data)
+  if (tg_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetTgtWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetTgtWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      tg_wlist = &(vect_wlist->front());
+  }
+  if (tg_wlist == NULL)
+    Error("no target word list available");
+  if (!tg_wlist->FrequSort())
+    Error("the target word list doesn't contain word counts");
+  if (tg_wlist->GetSize() <= tg_slist_len)
+    Error("TrainerPhraseSlist: the output layer is larger than the target word list");
+  debug1("* using target word list with %d words\n",(int)tg_wlist->GetSize());
+
+  ulong sum_sl=0, sum=0;
+  tg_wlist->SetShortListLength(tg_slist_len);
+  tg_wlist->CountWords(sum_sl, sum);
+  printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, 100.0*sum_sl/sum);
+
+#ifdef DEBUG2
+  cout << "Words in slist:" << endl;
+  WordID ci=tg_slist_len;
+  WordList::const_iterator iter, end = tg_wlist->End();
+  for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+  cout << "Words not in slist:" << endl;
+  for (; iter!=end; iter++)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+#endif
+
+#ifdef DEBUG2
+   // just needed for debugging (resize and not reserve, since the elements are accessed by their index)
+  words.resize(tg_wlist->GetSize());
+  for (iter=tg_wlist->Begin(); iter!=end; iter++) words[iter->id] = strdup(iter->word);
+#endif
+  
+  debug0(" + done init TrainerPhraseSlist\n");
+}
+
+//
+// constructor for training
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, Lrate *lrate, ErrFct *perrfct,
+	const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores,
+	REAL p_wd, int p_maxep, int p_ep)
+ : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   ptable(NULL),
+   nb_ex_slist(0), nb_ex_short_tgt(0),
+   nb_forw(0)
+{
+  debug2("*** Constructor TrainerPhraseSlist for training idim=%d, odim=%d ***\n",idim,odim);
+  cout << "Setting up CSTM training with short list" << endl;
+
+  if (train_fname) {	// optional training data
+    data_train = new Data(train_fname);
+    if (idim != data_train->GetIdim()) {
+      ErrorN("TrainerPhraseSlist: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim);
+    }
+    if (data_train->GetOdim()<1 || data_train->GetOdim()>32) {
+      ErrorN("TrainerPhraseSlist: output dimension of the training data should be 1..32, found %d\n", data_train->GetOdim());
+    }
+    auxdim = data_train->GetAuxdim();
+  }
+  else 
+    data_train=NULL;
+
+  if (dev_fname) {	// optional validation data, allocated (and flagged as such) by this class
+    data_dev = new Data(dev_fname);
+    data_dev_alloc=true;
+    if (idim != data_dev->GetIdim()) {
+      ErrorN("TrainerPhraseSlist: input dimension of the validation data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+    }
+    if (data_dev->GetOdim()<1 || data_dev->GetOdim()>32) {
+      ErrorN("TrainerPhraseSlist: output dimension of the validation data should be 1..32, found %d\n", data_dev->GetOdim());
+    }
+    int auxdim_dev = data_dev->GetAuxdim();
+    if (0 >= auxdim)	// NOTE(review): auxdim is not set in this constructor when there is no training data - confirm that Trainer initializes it
+      auxdim = auxdim_dev;
+    else if (auxdim != auxdim_dev)
+      ErrorN("TrainerPhraseSlist: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev);
+  }
+  else {
+    data_dev=NULL;
+    data_dev_alloc=false;
+  }
+  iaux = (idim - auxdim);
+
+  DoConstructorWork();
+
+  if (data_dev) {	// the external phrase table is only loaded when there is validation data
+    if (pt_fname) {
+      ptable = new(PtableMosesPtree);
+      ptable->Read(pt_fname,5,"1:2");	// NOTE(review): p_nscores is not used, the number of scores is fixed to 5 - confirm
+    }
+    else
+      cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+  }
+}
+
+//
+// constructor for testing
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, ErrFct *perrfct,
+	Data *data, char *pt_fname, int p_nscores)
+ : Trainer(pmach,NULL,perrfct,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   ptable(NULL),
+   nb_ex_slist(0), nb_ex_short_tgt(0),
+   nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist for testing ***\n");
+  cout << "Setting up testing with short list" << endl;
+
+  data_train=NULL;
+  data_dev=data;
+  data_dev_alloc=false; // the test data was provided by the caller, it must not be freed by this class !
+  if (data_dev->GetIdim() != idim) {
+    ErrorN("TrainerPhraseSlist: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+  }
+  auxdim = data_dev->GetAuxdim();
+  iaux = idim - auxdim;
+
+  DoConstructorWork();
+
+#ifdef BACKWARD_TM
+  const char *pt_scores="1:0";	// backward TM prob
+#else
+  const char *pt_scores="1:2";	// forward TM prob
+#endif
+  if (!pt_fname)
+    cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+  else {
+    ptable = new PtableMosesPtree;
+    ptable->Read(pt_fname,5,pt_scores);
+  }
+}
+
+//
+// constructor for nbest rescoring
+//
+
+TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach,
+    WordList *p_sr_wlist, WordList *p_tg_wlist,
+	char *pt_fname, int nscores, char *scores_specif)
+ : Trainer(pmach,NULL,NULL,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist),
+   ptable(NULL),
+   nb_ex_slist(0), nb_ex_short_tgt(0), nb_forw(0)	// all the counters start at zero, like in the other constructors
+{
+  debug0("*** Constructor TrainerPhraseSlist for block operations ***\n");
+  cout << "Setting up CSTM with short list" << endl;
+  // TODO: init with TrainerNgram before
+  data_train=NULL;
+  data_dev=NULL; data_dev_alloc=false;	// no data at all: the word lists are provided by the caller
+  DoConstructorWork();
+
+  if (pt_fname) {
+    ptable = new(PtableMosesPtree);
+    ptable->Read(pt_fname, nscores, scores_specif);
+  }
+  else
+    cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl;
+}
+
+//**************************************************************************************
+
+TrainerPhraseSlist::~TrainerPhraseSlist ()
+{ 
+  debug0("*** Destructor TrainerPhraseSlist ***\n");
+
+  if (buf_target_wid) delete [] buf_target_wid;
+  if (buf_target_ext) delete [] buf_target_ext;
+  if (buf_target_in_blocks) delete [] buf_target_in_blocks;
+    // buf_input and buf_target will be deleted by ~Trainer()
+
+#ifdef BLAS_CUDA
+    // free local gpu_target buffer on each GPU
+  for (vector<REAL*>::iterator it=gpu_target.begin(); it!=gpu_target.end(); ++it)
+    if (*it) cudaFree(*it);
+  gpu_target.clear();
+#endif
+    // the machines are part of the MachSplit, but the error functions were created with new in DoConstructorWork()
+  phrase_mach.clear();
+  for (size_t m=0; m<mach_errfct.size(); m++) delete mach_errfct[m];
+  mach_errfct.clear();
+#ifdef DEBUG2
+  vector<char*>::const_iterator iter, end = words.end();
+  for (iter=words.begin(); iter!=end; iter++) free(*iter);	// the words were duplicated with strdup()
+  words.clear();
+#endif
+}
+
+
+//**************************************************************************************
+//
+// We have MachSplit() at the output
+// this means that each machine has its own error function with its own gradient
+//   these error functions point to the outputs in the individual machines
+//   and the gradients stored in this Trainer
+
+REAL TrainerPhraseSlist::Train()
+{
+  if (!data_train) return -1;
+#ifdef DEBUG
+  printf("*****************\n");
+  printf("TrainerPhraseSlist::Train():\n");
+  printf(" - idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr);
+  printf(" -          data_in: %p \n", (void*) buf_input);
+  printf(" -           target: %p \n", (void*) buf_target);
+  printf(" - target_in_blocks: %p \n", (void*) buf_target_in_blocks);
+  printf(" -          tgt WID: %p \n", (void*) buf_target_wid);
+#endif
+
+  Timer ttrain;		// total training time
+  //Timer tload;		// total time to select examples
+  //Timer ttransfer;      // total transfer time of data to GPU
+  //Timer tforw;          // total forw time
+  //Timer tgrad;          // total time fr gradient
+  //Timer tbackw;         // total backw time
+
+  ttrain.start();
+  data_train->Rewind();
+
+  REAL log_sum=0;
+  int i;
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+
+    // set input 
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  debug1(" - gpu_input %p\n", gpu_input);
+#else
+  mach->SetDataIn(buf_input);
+  debug1(" - buf_input %p\n", buf_input);
+#endif
+
+    // connect the error functions for each individual machine
+    // buf_target does sequentially contain all the targets for block0, than block1 and so on
+    // buf_target_in_blocks
+    //  targets are arranged by blocks of bsize, i.e. first bsize targets for 1st machine, than 2nd and so on
+    //  by these means we don't need to copy or re-arrange data later in the GPU
+#ifdef BLAS_CUDA
+  REAL *tptr;
+#else
+  REAL *tptr=buf_target_in_blocks;
+#endif
+  debug0("Error functions of the individual machines:\n");
+  for (i=0; i<tg_nbphr; i++) {
+    mach_errfct[i]->SetOutput(phrase_mach[i]->GetDataOut());
+#ifdef BLAS_CUDA
+    tptr=gpu_target[i];	// we copy later from buf_target_in_blocks to gpu_target
+#endif
+    mach_errfct[i]->SetTarget(tptr);
+    phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad());
+    debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad());
+#ifndef BLAS_CUDA
+    tptr += bsize;	// each example provides 1 target for each output machine (the word ID)
+#endif
+  }
+
+  eos_src = eos_tgt = NULL_WORD;
+  if (sr_wlist->HasEOS()) {
+    eos_src=sr_wlist->GetEOSIndex();
+    printf(" - using a special token for short source sequences (%d)\n", eos_src);
+  }
+  if (tg_wlist->HasEOS()) {
+    eos_tgt=tg_wlist->GetEOSIndex();
+    printf(" - using a special token for short target sequences (%d)\n", eos_tgt);
+  }
+
+    // master loop on all training data
+  bool data_available;
+  do {
+    //tload.start();
+
+      // get a bunch of data and map all the words
+    int n=0;
+    data_available = true;
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_train->Next();
+      if (!data_available) break;
+      debug0("TRAIN DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_train->input[i];
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);
+#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else {
+          buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::Train(): input");       // map context words IDs
+          if (inp==eos_src) at_least_one_short=true;
+        }
+#else
+        buf_input[n*idim + i] = inp;
+        if (inp == NULL_WORD || inp==eos_src)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_train->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0("\n - > mapped output: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      at_least_one_short=false;
+      int nbtgsl=0;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_train->target[i];
+        int idx=i+n*tg_nbphr;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::Train(): output");  // TODO: not really needed during training, just the current value
+        if (outp==NULL_WORD
+            || (at_least_one_short && outp==eos_tgt))	// we only predict the FIRST EOS, the other ones are set to NULL_WORD
+        {   // NULL_WORDS are mapped, they will be detected in gradient calculation
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target[idx]);
+        }
+        else {
+	    // map normal word or EOS
+          nb_tg_words++; // also count EOS since we need to predict them at the output
+          if (outp==eos_tgt) at_least_one_short=true;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            nbtgsl++;
+          }
+          else {
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            all_in_slist=false;
+          }
+        }
+      }
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short_tgt++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+    //tload.stop();
+
+#ifdef DEBUG2
+    printf("network data:\n");
+    REAL *iptr=buf_input;
+    for (int nn=0;nn<n;nn++) {
+       for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+       for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+    }
+#endif
+
+      // process the bunch by the neural network
+      // TODO: a lot of this code is identical with testing -> factor
+    if (n>0) {
+        // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine
+      
+      debug0("re-arrange targets\n");
+      for (i=0; i<tg_nbphr; i++) {
+        tptr=buf_target_in_blocks + i*bsize;	// destination start is always at full bsize blocks
+        debug2(" %d starts at %p\n",i,(void*)tptr);
+        REAL *tptr_src=buf_target+i;
+        for (int b=0; b<n; b++) {	// be careful with bsize and current n !
+          *tptr++=*tptr_src;
+          tptr_src+=tg_nbphr;
+        }
+      }
+   
+#ifdef BLAS_CUDA
+      //ttransfer.start();
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      REAL *tptr=buf_target_in_blocks;
+      for (i=0; i<tg_nbphr; i++) {
+        Gpu::MemcpyAsync(gpu_target[i], tptr , n*sizeof(REAL), cudaMemcpyHostToDevice);
+        tptr += n;
+      }
+      Gpu::StreamSynchronize();
+      //ttransfer.stop();
+#endif
+
+      //tforw.start();
+      mach->Forw(n,true);
+      //tforw.stop();
+
+      //tgrad.start();
+      debug0("call Error functions of the individual machines:\n");
+      for (i=0; i<tg_nbphr; i++) {
+        debug2(" %d: %p\n",i,(void*)mach_errfct[i]);
+#ifdef BLAS_CUDA
+        debug2("#### CUDA: calc gradient for output %d on GPU %d\n", i, Gpu::GetCudaDevice(Gpu::GetDevice(mach_errfct[i]->GetGpuConfig())));
+#endif
+          // the returned log_sum is cumulated over a full batch for one specific output word
+        log_sum += mach_errfct[i]->CalcGradNull(n);
+      }
+      //tgrad.stop();
+
+      debug1("  log_sum=%e\n",log_sum);
+#ifdef DEBUG2
+      int t=(int) data_train->target[0];
+# ifdef BLAS_CUDA
+      Gpu::SetConfig(mach->GetGpuConfig());
+      REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2");
+      cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1);
+      printf("OUTPUT:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1);
+      printf("TARGET:");
+      for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n");
+      //TODO check if we need odim or idim!
+      // TODO: cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1);
+      printf("  GRAD:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasFree(tmp);
+# else
+      printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n");
+      printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n");
+      printf("  GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n");
+# endif //BLAS_CUDA
+#endif //DEBUG2
+
+      lrate->UpdateLrateOnForw(mach->GetNbForw());
+      //tbackw.start();
+      mach->Backw(lrate->GetLrate(), wdecay, n);
+      //tbackw.stop();
+    }
+
+    nb_ex += n;
+  } while (data_available);
+#ifdef BLAS_CUDA
+  Gpu::StreamSynchronize();
+#endif
+
+  ttrain.stop();
+  ttrain.disp(" - training time: ");
+  //tload.disp(" including load: ");
+  //ttransfer.disp(" transfer: ");
+  //tforw.disp(" forw: ");
+  //tgrad.disp(" grad: ");
+  //tbackw.disp(" backw: ");
+  printf("\n");
+  
+  printf(" - CSTM log_sum=%.2f%s, target words=%d, in shortlist=%d, nb_tg_words_slist=%d\n",
+	log_sum, tg_wlist->HasEOS() ? " including EOS" : "", nb_tg_words, nb_ex_slist, nb_tg_words_slist);
+  if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words);  // when normalizing consider that all examples lead to a forward pass 
+
+  return -1;
+}
+
+//**************************************************************************************
+// Write the n-best target phrases for input example ni of the current bunch to fspt,
+// one line per hypothesis: "src words ||| tgt words ||| exp(score)"
+// optr is handed unchanged to prepare_hypotheses(), which is defined outside this function
+void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int max_hyps=100;			// number of hypotheses requested from the helpers
+  REAL *src_row=buf_input + ni*idim;	// input values of example ni
+
+    // the source phrase ends at the first NULL_WORD (or after iaux words)
+  int src_len=0;
+  while (src_len<iaux && src_row[src_len] != NULL_WORD) src_len++;
+
+    // score candidates per output position, then combine them into an n-best list
+  typedef std::pair<float, std::vector<std::size_t> > ScoredPhrase;
+  std::vector<std::vector<std::pair<float, std::size_t> > > cand_scores
+   = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, max_hyps);
+  std::vector<ScoredPhrase> nbest = sort_ngrams(cand_scores, src_len, max_hyps);
+
+  for (std::size_t h = 0; h < nbest.size(); ++h) {
+    const ScoredPhrase &hyp = nbest[h];
+
+      // source side (repeated on every output line)
+    for (int s=0; s<src_len; s++)
+      fspt << sr_wlist->GetWordInfo(src_row[s]).word << " ";
+
+      // target side
+    fspt << "|||";
+    const std::vector<std::size_t> &tgt_ids = hyp.second;
+    for (std::size_t t = 0; t < tgt_ids.size(); ++t)
+      fspt << " " << tg_wlist->GetWordInfoMapped(tgt_ids[t]).word;
+
+      // score: exp() of the value delivered by sort_ngrams()
+    fspt << " ||| " << exp(hyp.first) << "\n";
+  }
+
+}
+
+//**************************************************************************************
+// Disabled (#if 0) variant: writes the source words and, per output position, the highest scoring word
+#if 0
+void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int i;
+	  // Find most likely outputs
+        for (i=0;i<iaux;i++) {	// source words of example ni, up to the first NULL_WORD
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        
+        for (i=0; i<tg_nbphr; i++) {	// one output block of dim_per_phrase values per target position
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+  tgrad.disp(" including ");	// NOTE(review): no tgrad is declared in this function - looks like a stray paste, confirm before re-enabling
+  tgrad.disp(" including ");
+	    // find max of current word
+	  REAL *sptr=optr+i*dim_per_phrase, max=*sptr++; int max_idx=0;
+          for (int s=1; s<dim_per_phrase; s++, sptr++) {
+            if (*sptr>max) { max=*sptr; max_idx=s; }
+          }
+          fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] ";	// word followed by its score in brackets
+        }
+  fspt << endl;
+}
+#endif
+ 
+//**************************************************************************************
+// Evaluate the machine on data_dev and return px (see end of function), or -1 if data_dev is not set
+// fname: if not NULL this file is opened; values are only written to it in the DIRECT_PROBA_CALCULATION code
+REAL TrainerPhraseSlist::TestDev(char *fname)
+{
+  if (!data_dev) return -1;
+
+  vector<string> src_phrase;	// interface with classical phrase tables
+  vector<string> tgt_phrase;
+  vector<bool> done_by_cstm;
+
+  ofstream fs;
+  if (fname) {
+    cout << " - dumping phrase probability stream to file '" << fname << "'" << endl;
+    fs.open(fname,ios::out);
+    CHECK_FILE(fs,fname);
+  }
+
+#undef DUMP_PHRASE_TABLE
+#ifdef DUMP_PHRASE_TABLE
+  char *ptfname = (char*) "alltrans.txt";
+  ofstream fspt;
+  fspt.open(ptfname,ios::out);
+  CHECK_FILE(fspt,ptfname);
+  cout << " - dumping new phrase table to file '" << ptfname << "'" << endl;
+#endif
+
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0;
+  nb_tg_words=nb_tg_words_slist=0;
+  int nb_not_in_ptable=0;	// this counts the number of phrase pairs which were not found in the external phrase table
+  int nb_src_words=0;
+  REAL log_sum=0;
+  REAL log_sum_notunk=0;	// all known phrase pairs, either CSTM or ptable (count=nb+_ex - nb_not_in_ptable)
+  REAL log_sum_cstm=0;		// only CSLM, i.e. considering phrases done by CSTM
+  REAL log_sum_cstm_short=0;	// like CSTM, limited to short n-grams, i.e. we do not count the prediction of (multiple) EOS
+
+  uint idx;
+
+    // set input
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  debug1(" - gpu_input %p\n", gpu_input);
+#else
+  mach->SetDataIn(buf_input);
+  debug1(" - buf_input %p\n", buf_input);
+#endif
+
+    // connect the error functions for each individual machine
+    // buf_target does sequentially contain all the targets for block0, then block1 and so on
+    // buf_target_in_blocks
+    //  targets are arranged by blocks of bsize, i.e. first bsize targets for 1st machine, then 2nd and so on
+    //  by these means we don't need to copy or re-arrange data later in the GPU
+#ifdef BLAS_CUDA
+  REAL *tptr;
+#else
+  REAL *tptr=buf_target_in_blocks;
+#endif
+  debug0("Error functions of the individual machines:\n");
+  for (int i=0; i<tg_nbphr; i++) {
+    mach_errfct[i]->SetOutput(phrase_mach[i]->GetDataOut());
+#ifdef BLAS_CUDA
+    tptr=gpu_target[i];	// we copy later from buf_target_in_blocks to gpu_target
+#endif
+    mach_errfct[i]->SetTarget(tptr);
+    phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad());
+    debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad());
+#ifndef BLAS_CUDA
+    tptr += bsize;	// each example provides 1 target for each output machine (the word ID)
+#endif
+  }
+
+    // how do we handle short sequences ?
+  eos_src = eos_tgt = NULL_WORD;
+  if (sr_wlist->HasEOS()) {
+    eos_src=sr_wlist->GetEOSIndex();
+    printf(" - using a special token for short source sequences (%d)\n", eos_src);
+  }
+  if (tg_wlist->HasEOS()) {
+    eos_tgt=tg_wlist->GetEOSIndex();
+    printf(" - using a special token for short target sequences (%d)\n", eos_tgt);
+  }
+
+  bool data_available;
+  data_dev->Rewind();
+  do {
+      // get a bunch of data
+    int n=0, i;
+    data_available = true;
+    debug0("start bunch\n");
+    done_by_cstm.clear();
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_dev->Next();
+      if (!data_available) break;
+
+      debug0("DEV DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_dev->input[i];
+        idx=n*idim + i;
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);	// input IDs belong to the source word list
+#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else {
+          buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::TestDev(): input");       // map context words IDs
+          nb_src_words++;
+          if (inp==eos_src) at_least_one_short=true;
+        }
+#else
+        buf_input[idx] = inp;
+        if (inp == NULL_WORD || inp==eos_src)
+          at_least_one_short=true;
+        else {
+          if (inp<0 || inp>=(int)sr_wlist->GetSize())
+            ErrorN("TrainerPhraseSlist::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+          nb_src_words++;
+        }
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_dev->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0("\n - > mapped output: ");
+
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      int nbtgsl=0;
+      at_least_one_short=false;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_dev->target[i];
+        idx=i+n*tg_nbphr;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::TestDev(): output");
+        buf_target_ext[idx] = outp;		// keep unmapped target word ID for Moses phrase-table
+        if (outp==NULL_WORD
+            || (at_least_one_short && outp==eos_tgt))   // we only predict the FIRST EOS, the other ones are set to NULL_WORD
+        {   // NULL_WORDS are mapped, they will be detected in gradient calculation
+          buf_target_wid[idx] = NULL_WORD;
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target_wid[idx]);
+        }
+        else {
+            // map normal word or EOS
+          nb_tg_words++; // also count EOS since we need to predict them at the output
+          if (outp==eos_tgt) at_least_one_short=true;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+	    nbtgsl++;
+          }
+          else {
+	      // TODO: we actually don't need a forward pass for words in the short lists or short n-grams
+	      //       this could be used to save some time (5-10%)
+            buf_target_wid[idx] = tg_slist_len;
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]);
+            all_in_slist=false;
+          }
+        }
+      }
+      done_by_cstm.push_back(all_in_slist);
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short_tgt++;	// count short targets, same condition as in Train()
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+
+      // process the bunch by the neural network
+    if (n>0) {
+        // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine
+
+      debug0("re-arrange targets\n");
+      for (i=0; i<tg_nbphr; i++) {
+        tptr=buf_target_in_blocks + i*bsize;	// destination start is always at full bsize blocks
+        debug2(" %d starts at %p\n",i,(void*)tptr);
+        REAL *tptr_src=buf_target+i;
+        for (int b=0; b<n; b++) {	// be careful with bsize and current n !
+          *tptr++=*tptr_src;
+          tptr_src+=tg_nbphr;
+        }
+      }
+
+#ifdef BLAS_CUDA
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      REAL *tblock=buf_target_in_blocks;	// host block of machine i starts at i*bsize (see re-arrangement above)
+      for (i=0; i<tg_nbphr; i++) {
+        Gpu::MemcpyAsync(gpu_target[i], tblock , n*sizeof(REAL), cudaMemcpyHostToDevice);
+        tblock += bsize;	// not n: a short last bunch only fills the first n values of each block
+      }
+      Gpu::StreamSynchronize();
+#endif
+      mach->Forw(n,false);
+      for (i=0; i<tg_nbphr; i++) {
+          // the returned log_sum is cumulated over a full batch for one specific output word
+        //log_sum += mach_errfct[i]->CalcValueNull(n);
+        log_sum += mach_errfct[i]->CalcGradNull(n);	// TODO: should use CalcValueNull()
+      }
+    }
+
+#if DIRECT_PROBA_CALCULATION
+      // get probas from CSLM or back-off LM
+#ifdef BLAS_CUDA
+      // host output is of dim bsize*odim - bsize*tg_nphr*dim_per_phrase
+      // it contains the whole bunch of the 1st output, then whole bunch of 2nd output, etc
+    for (int i=0; i<tg_nbphr; i++) {
+      Gpu::MemcpyAsync(host_output+i*bsize*dim_per_phrase,phrase_mach[i]->GetDataOut(), n*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost);
+      // TODO: we actually copy too much data, for each output vector we only need one value !
+    }
+    Gpu::StreamSynchronize();
+#endif
+
+    debug1("Collect n=%d\n", n);
+    if (n!=(int) done_by_cstm.size())
+      Error("TrainerPhraseSlist::TestDev(): internal error, number of phrases done by CSTM does not match");
+
+    REAL *ptr_input = buf_input;	// n times idim values
+    for (int ni=0; ni<n; ni++) {
+      REAL logP=0.0, logP_short=0.0;
+      if (done_by_cstm[ni]) {
+          // get proba from CSTM (removed renorm)
+
+        for (i=0; i<tg_nbphr; i++) {
+          WordID cur_tg=buf_target_wid[i+ni*tg_nbphr];
+          if (cur_tg == NULL_WORD) break;
+		// get proba from output i for bunch ni
+#ifdef BLAS_CUDA
+	  REAL *optr=host_output+i*bsize*dim_per_phrase + ni*dim_per_phrase;
+#else
+	  REAL *optr=phrase_mach[i]->GetDataOut() + ni*dim_per_phrase;
+#endif
+          logP += safelog(optr[cur_tg]); // no error check on indices necessary here
+          if (buf_target_ext[i+ni*tg_nbphr] != eos_tgt) { // exclude the (easy) prediction of EOS from stats
+            logP_short += safelog(optr[cur_tg]); // no error check on indices necessary here
+          }
+          debug5("n=%3d, pos=%d, tg_w=%d (unmapped %d), P=%f\n",ni,i,cur_tg,buf_target_ext[i+ni*tg_nbphr],optr[cur_tg]);
+        }
+        debug2(" -      -> logP=%f, logP_short=%f\n",logP,logP_short);	// two values, so debug2 with two conversions
+
+#ifdef DUMP_PHRASE_TABLE
+          // create output phrase table
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        for (i=0;i<tg_nbphr;i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == eos_tgt) break;
+          fspt << tg_wlist->GetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " ";
+        }
+        fspt << "||| " << logP << endl;
+#endif
+
+#ifdef DUMP_PHRASE_TABLE_NBEST
+	Error("GetMostLikelyTranslations() change to work with multiple output vectors");
+        GetMostLikelyTranslations(fspt,optr,ni);
+#endif
+
+        debug1(" CSLM: logP=%e\n", logP);
+        log_sum_cstm += logP;
+        log_sum_cstm_short += logP_short;
+        log_sum_notunk += logP;
+        log_sum += logP;
+      }
+      else {
+Error("not done by CSTM");	// NOTE(review): presumably aborts before the phrase-table fallback below is reached - confirm this is intended
+
+       if (ptable) {
+          // request proba from Moses phrase-table
+         debug0("create textual phrase pair for external phrase table (word + index)\n");
+         src_phrase.clear();
+         debug0("  source:");
+         for (i=0; i<iaux && ptr_input[i]!=NULL_WORD; i++) {
+           src_phrase.push_back(sr_wlist->GetWordInfo((uint) ptr_input[i]).word);	// TODO: char* to string
+           debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]);
+#ifdef DUMP_PHRASE_TABLE
+           fspt << src_phrase.back() << " ";
+#endif
+         }
+
+#ifdef DUMP_PHRASE_TABLE
+         fspt << "|P| ";
+#endif
+         tgt_phrase.clear();
+         debug0("  target:");
+         for (i=0; i<tg_nbphr && buf_target_ext[i+ni*tg_nbphr]!=eos_tgt; i++) {
+           tgt_phrase.push_back(tg_wlist->GetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word);	// TODO: char* to string
+           debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]);
+#ifdef DUMP_PHRASE_TABLE
+           fspt << tgt_phrase.back() << " ";
+#endif
+         }
+# ifdef BACKWARD_TM
+         logP = ptable->GetProb(tgt_phrase, src_phrase);
+# else
+         logP = ptable->GetProb(src_phrase, tgt_phrase);
+# endif
+         if (logP == PROBA_NOT_IN_PTABLE) nb_not_in_ptable++;
+                                     else log_sum_notunk += logP;
+         logP = safelog(logP); // take log now
+         debug1("  => logP=%e\n",logP);
+         log_sum += logP;
+       }
+       else { // no ptable was specified
+         logP=0; // flag output that it wasn't done by CSTM
+       }
+#ifdef DUMP_PHRASE_TABLE
+       fspt << "||| " << logP << endl;
+#endif
+      } // not done by CSTM
+
+      ptr_input += idim;  // next example in bunch at input
+      if (fname) {
+        fs << logP << endl;
+      }
+    }
+#endif // old proba calculation
+
+    nb_ex += n;
+    debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex));
+  } while (data_available);
+
+  printf(" - %d phrases, %d target words, avr length src=%.1f tgt=%.1f, CSTM: %d phrases (%.2f), %d target words (%.2f)\n",
+	 nb_ex, nb_tg_words, (REAL) nb_src_words/nb_ex, (REAL) nb_tg_words/nb_ex,
+	 nb_ex_slist, 100.0*nb_ex_slist/nb_ex, nb_tg_words_slist, 100.0 * nb_tg_words_slist/nb_tg_words);
+  if (ptable) {
+    printf(" - %d words were looked up in external phrase table, %d (%.2f%% were not found)\n",
+	nb_ex-nb_ex_slist, nb_not_in_ptable, 100.0*nb_not_in_ptable/(nb_ex-nb_ex_slist));
+  }
+
+#ifdef DIRECT_PROBA_CALCULATION
+  REAL px = (nb_ex>0) ? exp(-log_sum / (REAL) nb_ex) : -1;
+  printf("   cstm px=%.2f, ln_sum=%.2f, cstm_short_px=%.2f, ln_sum=%.2f, overall px=%.2f, with unk=%.2f\n",
+        (nb_ex_slist>0) ? exp(-log_sum_cstm / (REAL) nb_ex_slist) : -1, log_sum_cstm,
+        (nb_ex_slist>0) ? exp(-log_sum_cstm_short / (REAL) nb_ex_slist) : -1, log_sum_cstm_short,
+        (nb_ex-nb_not_in_ptable>0) ? exp(-log_sum_notunk / (REAL) (nb_ex-nb_not_in_ptable)) : -1,
+        px);
+#else
+  REAL px = (nb_ex>0) ? exp(-log_sum / (REAL) nb_tg_words_slist) : -1;	// NOTE(review): guard tests nb_ex but divides by nb_tg_words_slist; Train() divides by nb_tg_words - confirm
+  printf("   px=%.2f, ln_sum=%.2f\n", px, log_sum);
+#endif
+
+  if (fname) fs.close();
+#ifdef DUMP_PHRASE_TABLE
+  fspt.close();
+#endif
+
+  return px;
+}
+
+
+//**************************************************************************************
+// information after finishing an epoch
+// NOTE(review): an earlier comment here said that an EOS predicted by the NN is not counted as short - confirm against Train()
+void TrainerPhraseSlist::InfoPost ()
+{
+    // all percentages are relative to nb_ex, the number of phrases counted in this epoch
+  const double short_src_pct = 100.0*nb_ex_short_inp/nb_ex;
+  const double short_tgt_pct = 100.0*nb_ex_short_tgt/nb_ex;
+  const double cstm_pct = 100.0*nb_ex_slist/nb_ex;
+  printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n",
+	nb_tg_words, nb_ex, short_src_pct, short_tgt_pct);
+  printf("   CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n", nb_tg_words_slist, nb_ex_slist, cstm_pct, err_train);
+}
+
+//**************************************************************************************
+// request one n-gram probability, usually the call will be delayed
+// and processed later
+
+
+//**************************************************************************************
+// collect all delayed probability requests
+// Forwards the bs examples found in buf_input, then adds one feature (at tm_pos) to the hypothesis of each request
+// areq[req_beg..req_end]: averaged log-proba of the CSTM outputs, or safelog() of the external phrase table value
+void TrainerPhraseSlist::ForwAndCollect(vector< vector<string> > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos)
+{
+  if (bs<=0) return;
+  debug3("TrainerPhraseSlist::ForwAndCollect(): collecting outputs %d .. %d from bunch of size %d\n", req_beg, req_end, bs);
+  debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim);
+
+  if (bs != (int) src_phrases.size())
+    ErrorN("TrainerPhraseSlist::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs);
+
+#ifdef DEBUG
+  printf("bunch of %d\n",bs);
+  for (int b=0; b<bs; b++) {
+    printf("%3d:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %.2f", buf_input[b*idim+ii]); printf("\n");
+  }
+#endif
+
+  nb_forw++;
+#ifdef BLAS_CUDA	// same macro as everywhere else in this file (the output download below also uses it)
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);
+  Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+#else
+  mach->SetDataIn(buf_input);
+#endif
+  mach->Forw(bs,false);
+
+#ifdef BLAS_CUDA
+  for (int tw=0; tw<tg_nbphr; tw++)
+    Gpu::MemcpyAsync(host_output + tw*bsize*dim_per_phrase, phrase_mach[tw]->GetDataOut(), bs*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost);
+  Gpu::StreamSynchronize();
+#endif
+
+    // stats
+  int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0;
+
+  for (int n=req_beg; n<=req_end; n++) {
+    REAL logP=0;
+    int b=areq[n].bs;	// position of this request's source phrase in the bunch
+
+    if ((int) areq[n].tgph.size() > tg_nbphr)
+      ErrorN("TrainerPhraseSlist::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr);
+
+#ifdef DEBUG
+    printf("collect b=%3d \n input:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %f",buf_input[b*idim+ii]); printf("\n");
+#endif
+
+      // map target words
+    debug0(" output:");
+    bool all_in_slist=true;
+    int tw;
+    for (tw=0; all_in_slist && tw<tg_nbphr; tw++) {
+      WordID outp = areq[n].tgwid[tw];
+      debug1(" %d",outp);
+      if (outp==eos_tgt) break;
+      cnt_tg_words++;
+      buf_target_wid[tw] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::ForwAndCollect() output");
+      debug1("->%d",buf_target_wid[tw]);
+      all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]);
+    }
+      // fill up
+    for (; tw<tg_nbphr; tw++) {
+      debug0(" fill");
+      buf_target_wid[tw]=eos_tgt;
+    }
+    debug1("    slist=%d\n",all_in_slist);
+
+    if (!all_in_slist) {
+      if (!ptable) Error("TrainerPhraseSlist::ForwAndCollect(): target word not in short list and no external phrase table available");
+      logP=safelog(ptable->GetProb(src_phrases[areq[n].bs], areq[n].tgph));	// get proba from external phrase table
+      debug1(" ptable: logP=%f\n", logP);
+    }
+    else {
+        // get proba from CSLM
+      debug0(" -  in slist CSLM:");
+      logP=0; int cnt=0;
+      for (int tw=0; tw<tg_nbphr; tw++) {
+        if (buf_target_wid[tw] == eos_tgt) break;
+#ifdef BLAS_CUDA
+        //old;  REAL *optr=host_output + b*odim;
+        //test: REAL *optr=host_output+i*bsize*dim_per_phrase + ni*dim_per_phrase;
+        REAL *optr=host_output+tw*bsize*dim_per_phrase + b*dim_per_phrase;
+#else
+        //old: REAL *optr=mach->GetDataOut() + b*odim;
+        //test: REAL *optr=phrase_mach[i]->GetDataOut() + ni*dim_per_phrase;
+        //TODO: it would be much more efficient to do all the examples of one machine and then switch to the next one
+        REAL *optr=phrase_mach[tw]->GetDataOut() + b*dim_per_phrase;
+#endif
+        debug1(" %e", optr[buf_target_wid[tw]]);
+        logP += safelog(optr[buf_target_wid[tw]]);
+        cnt++;
+      }
+      if (cnt==0) Error("no target phrases when collecting output");
+      logP /= cnt; // TODO: is this normalization correct ?
+      debug1(" -> log avr=%f\n",logP);
+
+      cnt_ex_slist++;
+      cnt_tg_words_slist += cnt;
+    }
+
+        // store LM proba
+    areq[n].hyp->AddFeature(logP,tm_pos);
+  } // for (ni=...)
+
+  printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n",
+	 req_end-req_beg+1, cnt_tg_words, cnt_ex_slist, (float) 100.0* cnt_ex_slist / (req_end-req_beg+1), (float) cnt_tg_words_slist/cnt_ex_slist);
+  nb_ex += (req_end-req_beg+1);
+  nb_ex_slist += cnt_ex_slist;
+  nb_tg_words_slist += cnt_tg_words_slist;
+  nb_tg_words += cnt_tg_words;
+}
+
+
+void TrainerPhraseSlist::BlockStats() {
+     // print the number of forward passes and the share of all counted phrases that is attributed to the CSTM
+   const double cstm_share = 100.0 * nb_ex_slist/nb_ex;	// in percent of nb_ex
+   printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n",
+	nb_forw, nb_ex_slist, cstm_share);
+}
diff --git a/TrainerPhraseSlist.h b/TrainerPhraseSlist.h
new file mode 100644
index 0000000..81a83f8
--- /dev/null
+++ b/TrainerPhraseSlist.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _TrainerPhraseSlist_h
+#define _TrainerPhraseSlist_h
+
+#include <ostream>
+#include "Tools.h"
+#include "Mach.h"
+#include "ErrFct.h"
+#include "DataPhraseBin.h"
+#include "Trainer.h"
+#include "WordList.h"
+
+#include "PtableMosesPtree.h"
+#include "AlignReq.h"
+
+//
+// Class to train neural networks to predict phrase probabilities
+//  - we use a short list of target words for which the NN predicts the probability
+//  - the probabilities of the other target words are obtained from a classical Moses phrase table
+//  - the NN also predicts the probability mass of ALL the words which are not in the short list;
+//    for this we use the last output neuron of the network
+
+
+// Trainer for continuous space translation models (CSTM) with a target short list.
+// It offers three modes, one per constructor: training, testing on binary data
+// and general probability calculation (block mode, used for n-best rescoring).
+class TrainerPhraseSlist : public Trainer
+{
+private:
+  int		max_inp_idx;		// largest index -1 of a word at the input (# of entries in projection table)
+  int		tg_nbphr;		// number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr
+  int		dim_per_phrase;		// output dimension of each phrase prediction layer (must be equal size)
+  WordID	tg_slist_len;		// length of slist (this is set to dim_per_phrase MINUS ONE)
+  WordList	*sr_wlist;		// source word list
+  WordList	*tg_wlist;		// target word list
+  vector<Mach*> phrase_mach;		// pointer to the output machine for each phrase
+  vector<ErrFct*> mach_errfct;		// each individual machine has its own error function with local memory
+					// in this version of the Trainer the error function is identical to all machines
+					// (we use the one in the local variable of the mother class Trainer)
+ 
+  PtableMosesPtree	*ptable;	// classical phrase table
+
+    // handling of short sequences
+    // 			input		output	
+    // NULL_WORD	set proj=0	set grad=0
+    // EOS		as normal word	as normal word
+    //
+  WordID eos_src, eos_tgt;		// defaults to NULL_WORD if no special symbol in word list
+
+    // various stats
+  int		nb_ex_slist;		// total number of examples processed in slist
+  int		nb_ex_short_inp;	// total number of incomplete input phrases
+  int		nb_ex_short_tgt;	// total number of incomplete target phrases
+  int		nb_tg_words;		// total number of target words (there can be several target words for a phrase pair)
+  int		nb_tg_words_slist;	// total number of target words which are in short list
+// TODO: use WordID vector for targets in order to make less casts 
+  WordID	*buf_target_wid;	// used instead of buf_target to avoid casts between REAL and WordID
+					// size is odim x bsize
+  WordID	*buf_target_ext;	// similar to buf_target_wid[], but keep even word id out side of short list
+					// needed to request probas from external phrase table
+  REAL		*buf_target_in_blocks;	// same data than in buf_target of Trainer class, but re-arranged in blocks for individual machines
+#ifdef BLAS_CUDA
+  vector<REAL*> gpu_target;	// copied from trainer to GPU
+#endif
+#ifdef DEBUG
+  vector<char*>  words;			// give UTF8 word for a given CSLM internal index
+#endif
+  REAL DoTestDev(char*, bool);	// internal helper function
+  void DoConstructorWork();	// internal helper function for the various constructors
+    // data and functions for block processing
+  int	nb_forw;		// stats on total number of forward passes
+  void GetMostLikelyTranslations(ofstream&,REAL*,int);
+protected:
+  virtual void InfoPost();			// dump information after finishing a training epoch
+public:
+  TrainerPhraseSlist(Mach*, Lrate*, ErrFct*,	// mach, lrate, errfct
+	  const char*, const char*, const char*, int,	// train, dev, external phrase table, number of scores
+	  REAL =0, int =10, int =0);			// wdecay, max epochs, current epoch
+  TrainerPhraseSlist(Mach*, ErrFct*, Data*,	// for testing only: mach, errfct, binary data
+	  char*, int);				// external phrase table, number of scores
+  TrainerPhraseSlist(Mach*, WordList*, WordList*,	// for general proba calculation: mach, src word list, tgt word list
+	  char*, int , char*);			// external phrase table, number of scores, score specif
+  virtual ~TrainerPhraseSlist();
+  virtual REAL Train();				// train for one epoch
+  virtual REAL TestDev(char* =NULL);		// test current network on dev data and save outputs into file
+    // fast block evaluation functions
+    // Store value 'val' as input dimension 'd' of example 'b' of the current bunch.
+    // The input buffer holds idim values per example, so the stride between two
+    // examples is idim (using bsize as stride addresses the wrong cell and can
+    // run past the end of the buffer when bsize > idim).
+    // NOTE(review): assumes b is the example index and d the input dimension - confirm with callers
+  virtual void StoreInput(int b, int d, REAL val) {buf_input[b*idim+d]=val;}
+  virtual void ForwAndCollect(vector< vector<string> > &, AlignReq*, int,int,int,int);	// for nbest rescoring
+  virtual void BlockStats();				// display some stats on Block mode
+    // interface functions
+  virtual int GetTgtNbPhr() {return tg_nbphr; }		// number of target phrases (words) predicted at the output
+  virtual int GetSlistLen() {return tg_slist_len; }	// length of the target short list
+  virtual REAL *GetBufInput() {return buf_input; }	// direct access to the input buffer
+};
+
+#endif
diff --git a/TrainerPhraseSlist1.cpp b/TrainerPhraseSlist1.cpp
new file mode 100644
index 0000000..5f369d4
--- /dev/null
+++ b/TrainerPhraseSlist1.cpp
@@ -0,0 +1,951 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ */
+
+using namespace std;
+#include <iostream>
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <unistd.h>
+#include <time.h>
+
+#include "Tools.h"
+#include "Mach.h"
+#include "MachTab.h"
+#include "MachPar.h"
+#include "MachSeq.h"
+#include "MachSplit1.h"
+#include "TrainerPhraseSlist1.h"
+
+#include "NBest.h" 
+#include "sort.cpp" 
+
+//
+// Initialization shared by all constructors:
+//  - reads the dimensions from the machine and allocates the target word id buffers
+//  - verifies the architecture: sequential machine whose first layer is a parallel
+//    machine starting with a table and whose last layer is a MachSplit1
+//  - collects the output machine of each target phrase position
+//  - selects the source and target word lists and sets up the target short list
+// All problems are reported through Error().
+//
+void TrainerPhraseSlist1::DoConstructorWork()
+{
+  char	msg[1024];
+
+  idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize();
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer");
+  gpu_target = Gpu::Alloc(odim*bsize, "targets in Trainer");
+  host_output = new REAL[odim*bsize];
+#endif
+  buf_target_wid = new WordID[odim*bsize];
+  buf_target_ext = new WordID[odim*bsize];
+
+    // set up vector to outputs of the target phrases
+  if (mach->GetMType() != file_header_mtype_mseq)
+    Error("CSTM: sequential machine needed\n");
+  MachSeq *mseq=(MachSeq*) mach;
+  if (mseq->MachGetNb()<2)
+    Error("CSTM: the number of machines is suspeciously small");
+
+    // check input layer
+  if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar)
+    Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n");
+  MachPar *mpar = (MachPar*) mseq->MachGet(0);
+  if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab)
+    Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n");
+  MachTab *mtab = (MachTab*) mpar->MachGet(0);
+  max_inp_idx = mtab->GetMaxInpVal();
+
+    // check output layer
+  if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit1)
+    Error("CSTM: the output layer has the wrong architecture\n");
+  MachSplit1 *msp = (MachSplit1*) mseq->MachGet(mseq->MachGetNb()-1);
+  tg_nbphr=msp->MachGetNb();
+    // phrase_mach[0] is accessed below, so at least one output machine is required
+  if (tg_nbphr<1)
+    Error("CSTM: the output layer does not contain any phrase machine\n");
+  if (data_train && (data_train->GetOdim() != tg_nbphr)) {
+    snprintf(msg,sizeof(msg),"CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim());
+    Error(msg);
+  }
+
+  phrase_mach.clear();
+  for (int m=0; m<tg_nbphr; m++) {
+    phrase_mach.push_back(msp->MachGet(m));
+    if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim())
+      Error("CSTM: the output layer dimension must be identical for all phrases\n");
+  }
+  dim_per_phrase = phrase_mach[0]->GetOdim();
+  cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl;
+  tg_slist_len = dim_per_phrase-1;	// the last output neuron is reserved for all the words not in the short list
+
+
+    // get source word list (unless it was provided by the constructor),
+    // the validation data is preferred over the training data
+  if (sr_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetSrcWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetSrcWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      sr_wlist = &(vect_wlist->front());
+  }
+  if (sr_wlist == NULL)
+    Error("no source word list available");
+  if ((int) sr_wlist->GetSize() > max_inp_idx)
+    Error("the size of the source word list exceeds the number of input words the machine was trained for");
+
+    // get target word list (same policy as for the source side)
+  if (tg_wlist == NULL) {
+    vector<WordList> *vect_wlist = NULL;
+    if (data_dev != NULL)
+      vect_wlist = data_dev->GetTgtWList();
+    else if (data_train != NULL)
+      vect_wlist = data_train->GetTgtWList();
+    if ((vect_wlist != NULL) && !vect_wlist->empty())
+      tg_wlist = &(vect_wlist->front());
+  }
+  if (tg_wlist == NULL)
+    Error("no target word list available");
+  if (!tg_wlist->FrequSort())
+    Error("the target word list don't contain word count");
+  if (tg_wlist->GetSize() <= tg_slist_len)
+    Error("TrainerPhraseSlist1: the output layer is larger than the target word list");
+
+  ulong sum_sl=0, sum=0;
+  tg_wlist->SetShortListLength(tg_slist_len);
+  tg_wlist->CountWords(sum_sl, sum);
+  printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, 100.0*sum_sl/sum);
+
+#ifdef DEBUG2
+  cout << "Words in slist:" << endl;
+  WordID ci=tg_slist_len;
+  WordList::const_iterator iter, end = tg_wlist->End();
+  for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+
+  cout << "Words not in slist:" << endl;
+  for (; iter!=end; iter++)
+    printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id);
+
+   // just needed for debugging
+   // the vector must really contain the elements before we index it:
+   // reserve() only allocates capacity, writing to words[id] needs resize()
+  words.resize(tg_wlist->GetSize(), NULL);
+  for (iter=tg_wlist->Begin(); iter!=end; iter++) {
+    if ((size_t) iter->id < words.size()) words[iter->id] = strdup(iter->word);
+  }
+#endif
+  
+  debug0(" + done init TrainerPhraseSlist1\n");
+}
+
+//
+// constructor for training
+//
+
+// Opens the optional training and validation data, checks that their dimensions
+// are compatible with the machine, performs the common set-up and finally loads
+// the external phrase table when validation data and a file name are available.
+//   train_fname, dev_fname: data description files, may be NULL
+//   pt_fname:               external phrase table, may be NULL
+// NOTE(review): p_nscores is not used here, the phrase table is always read
+//               with 5 scores and the specification "1:2" - confirm this is intended
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, Lrate *lrate, ErrFct *perrfct,
+	const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores,
+	REAL p_wd, int p_maxep, int p_ep)
+ : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   nb_ex_slist(0), nb_ex_short(0),
+   nb_forw(0)
+{
+  debug2("*** Constructor TrainerPhraseSlist1 for training idim=%d, odim=%d ***\n",idim,odim);
+  cout << "Setting up CSTM training with short list" << endl;
+  char msg[1024];
+
+    // these counters are incremented by Train()/TestDev(); TestDev() does not
+    // reset nb_ex_short_inp itself, so give all of them a defined start value
+  nb_ex_short_inp=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+  if (train_fname) {
+    data_train = new Data(train_fname);
+    if (idim != data_train->GetIdim()) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim);
+      Error(msg);
+    }
+    if (data_train->GetOdim()<1 || data_train->GetOdim()>10) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: output dimension of the training data should be 1..10, found %d\n", data_train->GetOdim());
+      Error(msg);
+    }
+    auxdim = data_train->GetAuxdim();
+  }
+  else 
+    data_train=NULL;
+
+  if (dev_fname) {
+    data_dev = new Data(dev_fname);
+    data_dev_alloc=true;
+    if (idim != data_dev->GetIdim()) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the validation data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+      Error(msg);
+    }
+    if (data_dev->GetOdim()<1 || data_dev->GetOdim()>10) {
+      snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: output dimension of the validation data should be 1..10, found %d\n", data_dev->GetOdim());
+      Error(msg);
+    }
+      // the auxiliary dimension of training and validation data must agree
+    int auxdim_dev = data_dev->GetAuxdim();
+    if (0 >= auxdim)
+      auxdim = auxdim_dev;
+    else if (auxdim != auxdim_dev)
+      ErrorN("TrainerPhraseSlist1: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev);
+  }
+  else {
+    data_dev=NULL;
+    data_dev_alloc=false;
+  }
+  iaux = (idim - auxdim);	// the first iaux input values are word indices, the rest is auxiliary data
+
+  DoConstructorWork();
+
+    // the external phrase table is only loaded together with validation data
+  if (data_dev) {
+    if (pt_fname) {
+      cout << " - loading external phrase table from " << pt_fname << endl;
+      ptable.Read(pt_fname,5,"1:2");
+    }
+    else
+      cout << " - no external phrase table provided" << endl;
+  }
+}
+
+//
+// constructor for testing
+//
+
+// Uses already opened binary data as validation data (it is not freed by this
+// class), checks the input dimension, performs the common set-up and loads the
+// external phrase table.
+//   data:     test data, must not be NULL
+//   pt_fname: external phrase table, may be NULL
+// NOTE(review): p_nscores is not used here, the phrase table is always read
+//               with 5 scores - confirm this is intended
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, ErrFct *perrfct,
+	Data *data, char *pt_fname, int p_nscores)
+ : Trainer(pmach,NULL,perrfct,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(NULL), tg_wlist(NULL),
+   nb_ex_slist(0), nb_ex_short(0),
+   nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist1 for testing ***\n");
+  cout << "Setting up testing with short list" << endl;
+  char	msg[1024];
+
+    // TestDev() increments nb_ex_short_inp without resetting it
+  nb_ex_short_inp=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+  if (data == NULL)
+    Error("TrainerPhraseSlist1: no test data provided");
+
+  data_train=NULL;
+  data_dev=data;
+  data_dev_alloc=false; // do not free it by this class !
+
+  if (idim != data_dev->GetIdim()) {
+    snprintf(msg,sizeof(msg),"TrainerPhraseSlist1: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim);
+    Error(msg);
+  }
+  auxdim = data_dev->GetAuxdim();
+  iaux = (idim - auxdim);
+
+  DoConstructorWork();
+
+    // same policy as the constructor for training: a missing phrase table
+    // is reported instead of streaming a NULL pointer
+  if (pt_fname) {
+    cout << " - loading external phrase table from " << pt_fname << endl;
+#ifdef BACKWRAD_TM
+    ptable.Read(pt_fname,5,"1:0"); // backward TM prob
+#else
+    ptable.Read(pt_fname,5,"1:2"); // forward TM prob
+#endif
+  }
+  else
+    cout << " - no external phrase table provided" << endl;
+}
+
+//
+// constructor for nbest rescoring
+//
+
+// The word lists are provided by the caller instead of being taken from data
+// files; no training or validation data is attached.
+//   pt_fname:                external phrase table, may be NULL
+//   nscores, scores_specif:  passed unchanged to the phrase table reader
+TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach,
+    WordList *p_sr_wlist, WordList *p_tg_wlist,
+	char *pt_fname, int nscores, char *scores_specif)
+ : Trainer(pmach,NULL,NULL,NULL,NULL), // TODO; should I call:  TrainerNgram(pmach,NULL,NULL,NULL),
+   tg_nbphr(0), tg_slist_len(0), 
+   sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist),
+   nb_ex_slist(0), nb_ex_short(0),	// nb_ex_slist was left uninitialized, unlike in the other constructors
+   nb_forw(0)
+{
+  debug0("*** Constructor TrainerPhraseSlist1 for block operations ***\n");
+  cout << "Setting up CSTM with short list" << endl;
+
+    // give the remaining statistics a defined start value
+  nb_ex_short_inp=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+  // TODO: init with TrainerNgram before
+  DoConstructorWork();
+
+  if (pt_fname) {
+    cout << " - loading external phrase table from " << pt_fname << endl;
+    ptable.Read(pt_fname, nscores, scores_specif);
+  }
+  else
+    cout << " - no external phrase table provided" << endl;
+}
+
+//**************************************************************************************
+
+// Releases the target word id buffers allocated by DoConstructorWork().
+// The machines referenced by phrase_mach are not deleted here, only the
+// vector of pointers is cleared.
+TrainerPhraseSlist1::~TrainerPhraseSlist1 ()
+{ 
+  debug0("*** Destructor TrainerPhraseSlist1 ***\n");
+
+    // delete [] on a NULL pointer is a no-op, no test needed
+  delete [] buf_target_wid;
+  delete [] buf_target_ext;
+    // buf_input and buf_target will be deleted by ~Trainer()
+
+  phrase_mach.clear();
+
+#ifdef DEBUG2
+    // the words were duplicated with strdup(), i.e. malloc(): they must be
+    // released with free(), not with delete
+  for (vector<char*>::iterator w=words.begin(); w!=words.end(); ++w) free(*w);
+  words.clear();
+#endif
+}
+
+
+//**************************************************************************************
+
+//
+// Train the machine for one pass over the training data.
+//
+// The data is processed in bunches of at most mach->GetBsize() examples:
+//  - the first iaux input values of each example are copied as word indices
+//    (NULL_WORD is allowed, other values must be within the source word list),
+//    the remaining input values are copied as auxiliary data
+//  - each of the tg_nbphr target words is mapped with the target word list:
+//    NULL_WORD is kept, words in the short list keep their mapped index,
+//    all the other words get the index tg_slist_len (last output neuron)
+//  - forward pass, gradient calculation by the error function, backward pass
+//
+// returns: exp(-log_sum/nb_tg_words), where log_sum accumulates the values
+//          returned by errfct->CalcGrad(); -1 when there is no training data
+//          or when no target word was seen
+//
+REAL TrainerPhraseSlist1::Train()
+{
+  if (!data_train) return -1;
+#ifdef DEBUG
+  printf("*****************\n");
+  printf("TrainerPhraseSlist1::Train():\n");
+  printf(" -    idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr);
+  printf(" -  data_in: %p \n", (void*) buf_input);
+  printf(" -   target: %p \n", (void*) buf_target);
+  printf(" -  tgt WID: %p \n", (void*) buf_target_wid);
+  printf(" - grad_out: %p \n", (void*) errfct->GetGrad());
+#endif
+
+  Timer ttrain;		// total training time
+  Timer tload;
+  Timer ttransfer;      // total transfer time of data to GPU
+  Timer tforw;          // total forw time
+  Timer tgrad;          // total gradient time
+  Timer tbackw;         // total backw time
+  ttrain.start();
+
+  data_train->Rewind();
+
+  REAL log_sum=0;
+  int i;
+    // reset all the statistics of this pass
+  nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short=0;
+  nb_tg_words=nb_tg_words_slist=0;
+
+
+    // connect machine and error function to the input/target buffers
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  errfct->SetTarget(gpu_target);	// we copy from buf_target to gpu_target
+  debug1(" - gpu_input %p\n", gpu_input);
+  debug1(" - gpu_target %p\n", gpu_target);
+#else
+  mach->SetDataIn(buf_input);
+  errfct->SetTarget(buf_target);
+  debug1(" - buf_input %p\n", buf_input);
+  debug1(" - buf_target %p\n", buf_target);
+#endif
+  errfct->SetOutput(mach->GetDataOut());
+  mach->SetGradOut(errfct->GetGrad());
+  bool data_available;
+  do {
+    tload.start();
+      // get a bunch of data and map all the words
+    int n=0, nbtgsl=0;
+    data_available = true;
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_train->Next();
+      if (!data_available) break;
+      debug0("TRAIN DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_train->input[i];
+        debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp);
+#if TODO // should we map input data ?
+        buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::Train(): input");       // map context words IDs
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+#else
+        buf_input[n*idim + i] = inp;
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist1::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_train->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0(" - > mapped: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      at_least_one_short=false;
+      nbtgsl=0;
+        // map the target words of this example, the targets are stored with a stride of tg_nbphr
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_train->target[i];
+        int idx=i+n*tg_nbphr;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::Train(): output");     // TODO: not really needed during training, just the current value
+        if (outp==NULL_WORD) {
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;
+          debug1(" -[%d->NULL]",(int) buf_target[idx]);
+        }
+        else {
+          nb_tg_words++;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            nbtgsl++;
+          }
+          else {
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            all_in_slist=false;
+          }
+        }
+      }
+        // the short list statistics only count examples whose target words are ALL in the short list
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nbtgsl;
+      }
+      if (at_least_one_short) nb_ex_short++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch of examples
+    debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+    tload.stop();
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+      // process the bunch: forward pass, gradient, backward pass
+    if (n>0) {
+#ifdef BLAS_CUDA
+      ttransfer.start();
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::StreamSynchronize();
+      ttransfer.stop();
+#endif
+      tforw.start();
+      mach->Forw(n,true);
+      tforw.stop();
+
+      tgrad.start();
+      log_sum += errfct->CalcGrad(n);
+      tgrad.stop();
+
+      debug1("  log_sum=%e\n",log_sum);
+#ifdef DEBUG2
+      int t=(int) data_train->target[0];
+#ifdef BLAS_CUDA
+      Gpu::SetConfig(mach->GetGpuConfig());
+        // NOTE(review): tmp is allocated with 5 elements by Gpu::Alloc() but receives odim
+        //               and odim*bsize elements below - verify before enabling DEBUG2 with CUDA
+      REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2");
+      cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1);
+      printf("OUTPUT:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1);
+      printf("TARGET:");
+      for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n");
+      //TODO check if we need odim or idim!
+      cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1);
+      printf("  GRAD:");
+      for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n");
+      cublasFree(tmp);
+#else
+printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n");
+printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n");
+printf("  GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n");
+#endif //BLAS_CUDA
+#endif //DEBUG2
+      lrate->UpdateLrateOnForw(mach->GetNbForw());
+      tbackw.start();
+      mach->Backw(lrate->GetLrate(), wdecay, n);
+      tbackw.stop();
+    }
+
+    nb_ex += n;
+  } while (data_available);
+#ifdef BLAS_CUDA
+  Gpu::StreamSynchronize();
+#endif
+
+    // display the timing information
+  ttrain.stop();
+  ttrain.disp(" - training time: ");
+  tload.disp(" including load: ");
+#ifdef BLAS_CUDA
+  ttransfer.disp(" transfer: ");
+#endif
+  tforw.disp(" forw: ");
+  tgrad.disp(" grad: ");
+  tbackw.disp(" backw: ");
+  printf("\n");
+  
+  printf(" = log_sum=%.2f, nb_tg_words=%d, nb_ex_slist=%d, nb_tg_words_slist=%d\n", log_sum, nb_tg_words, nb_ex_slist, nb_tg_words_slist);
+  if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words);  // when normalizing consider that all examples lead to a forward pass 
+
+  return -1;
+}
+
+//**************************************************************************************
+// 
+
+// Write the most likely translations (up to 100, as returned by sort_ngrams())
+// of one example of the current bunch, one line per hypothesis in the format
+//   source words ||| target words ||| exp(score)
+//
+//   fspt: output stream
+//   optr: outputs of the network which are passed to prepare_hypotheses()
+//   ni:   index of the example in the input buffer
+void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int Nbest=100;
+  const REAL *src_ids=buf_input + ni*idim;	// word indices of the source phrase
+
+    // get input length: the source phrase ends at the first NULL_WORD
+  int input_length=0;
+  while (input_length<iaux && src_ids[input_length] != NULL_WORD) input_length++;
+
+  std::vector<std::vector<std::pair<float, std::size_t> > > prepared_scores
+   = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, Nbest);
+  std::vector<std::pair<float, std::vector<std::size_t> > > best
+   = sort_ngrams(prepared_scores, input_length, Nbest);
+  if (best.empty()) return;
+
+    // the source side is identical for all the hypotheses: look up the words only once
+  std::string src_text;
+  for (int j=0; j<input_length; j++) {
+    src_text += sr_wlist->GetWordInfo(src_ids[j]).word;
+    src_text += " ";
+  }
+
+  for(std::size_t i = 0; i < best.size(); ++i) {
+      // source
+    fspt << src_text;
+
+      // target
+    fspt << "|||";
+    for(std::size_t j = 0; j < best[i].second.size(); ++j) {
+      fspt << " " << tg_wlist->GetWordInfoMapped(best[i].second[j]).word;
+    }
+
+      // score
+    fspt << " ||| " << exp(best[i].first);
+    fspt << "\n";
+  }
+
+}
+
+//**************************************************************************************
+// 
+// Disabled variant of GetMostLikelyTranslations(): it writes the source words
+// and then, for each target position, the word with the largest output value
+// together with that value.
+// NOTE(review): the two tgrad.disp() lines use a name which is not declared in
+//               this function, presumably a stray paste - remove them before
+//               enabling this code again
+#if 0
+void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni)
+{
+  int i;
+	  // Find most likely outputs
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        
+        for (i=0; i<tg_nbphr; i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+  tgrad.disp(" including ");
+  tgrad.disp(" including ");
+	    // find max of current word
+	  REAL *sptr=optr+i*dim_per_phrase, max=*sptr++; int max_idx=0;
+          for (int s=1; s<dim_per_phrase; s++, sptr++) {
+            if (*sptr>max) { max=*sptr; max_idx=s; }
+          }
+          fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] ";
+        }
+  fspt << endl;
+}
+#endif
+ 
+//**************************************************************************************
+// 
+
+REAL TrainerPhraseSlist1::TestDev(char *fname)
+{
+  if (!data_dev) return -1;
+
+  vector<string> src_phrase;	// interface with classical phrase tables
+  vector<string> tgt_phrase;
+  vector<bool> done_by_cstm;
+
+  ofstream fs;
+  if (fname) {
+    cout << " - dumping phrase probability stream to file '" << fname << "'" << endl;
+    fs.open(fname,ios::out);
+    CHECK_FILE(fs,fname);
+  }
+
+  char *ptfname = (char*) "alltrans.txt";
+  ofstream fspt;
+  cout << " - dumping new phrase table to file '" << ptfname << "'" << endl;
+  fspt.open(ptfname,ios::out);
+  CHECK_FILE(fspt,ptfname);
+
+  nb_ex=nb_ex_slist=nb_ex_short=0;
+  nb_tg_words=nb_tg_words_slist=0;
+  int nb_probs=0;	// this counts the number of cumulated log probs.
+			// This increments by only one for external phrase tables, independently of the target phrase length
+  REAL logP, log_sum=0;
+  REAL log_sum_cstm=0;	// only CSLM, i.e. considering phrases done by CSTM
+
+  uint idx;
+
+#ifdef BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);		// we copy from buf_input to gpu_input
+  errfct->SetTarget(gpu_target);	// we copy from buf_target to gpu_target
+  debug1(" - gpu_input %p\n", gpu_input);
+  debug1(" - gpu_target %p\n", gpu_target);
+#else
+  mach->SetDataIn(buf_input);
+  errfct->SetTarget(buf_target);
+#endif
+  errfct->SetOutput(mach->GetDataOut());
+
+  bool data_available;
+  data_dev->Rewind();
+  do {
+      // get a bunch of data
+    int n=0, i;
+    data_available = true;
+    debug0("start bunch\n");
+    done_by_cstm.clear();
+    while (n < mach->GetBsize() && data_available) {
+      data_available = data_dev->Next();
+      if (!data_available) break;
+
+      debug0("DEV DATA: input: ");
+      bool at_least_one_short=false;
+      for (i=0; i<iaux; i++) { // copy word indexes
+        WordID inp=(WordID) data_dev->input[i];
+        idx=n*idim + i;
+        debug1(" %d", inp);
+#if TODO // should we map input data ?
+        buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::TestDev(): input");       // map context words IDs
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+#else
+        buf_input[idx] = inp;
+        if (inp == NULL_WORD)
+          at_least_one_short=true;
+        else if (inp<0 || inp>=(int)sr_wlist->GetSize())
+          ErrorN("TrainerPhraseSlist1::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize());
+#endif
+      }
+      for (; i < idim ; i++) // copy auxiliary data
+        buf_input[n * idim + i] = data_dev->input[i];
+      if (at_least_one_short) nb_ex_short_inp++;
+
+      debug0(" - > mapped: ");
+      
+      bool all_in_slist=true;  // ALL to be predicted words are in short list
+      int nb_words_not_null=0;
+      at_least_one_short=false;
+      for (i=0; i<tg_nbphr; i++) {
+        WordID outp=(WordID) data_dev->target[i];
+        idx=n*tg_nbphr + i;
+        buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::TestDev(): output");
+        buf_target_ext[idx] = buf_target_wid[idx];		// keep target word ID for Moses phrase-table
+        if (outp==NULL_WORD) {
+          buf_target[idx] = (REAL) NULL_WORD;
+          at_least_one_short=true;			// TODO: optimize: we should be able to stop the loop on "i"
+          debug1(" %d[NULL]",(int) buf_target_wid[idx]);
+        }
+        else {
+          nb_tg_words++;
+          nb_words_not_null++;
+          if (tg_wlist->InShortList(buf_target_wid[idx])) {
+            buf_target[idx] = (REAL) buf_target_wid[idx];
+            debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word, (int) buf_target_wid[idx], outp);
+	    //nbtgsl++;
+          }
+          else {
+	      // TODO: we actually don't need a forward pass for words in the short lists or short n-grams
+	      //       this could be used to save some time (5-10%)
+            buf_target_wid[idx] = tg_slist_len;
+	    buf_target[idx] = (REAL) tg_slist_len;	// words that are not in slist are ALL done by the last output neuron
+            debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp);
+            all_in_slist=false;
+          }
+        }
+      }
+      done_by_cstm.push_back(all_in_slist);
+      if (all_in_slist) {
+        nb_ex_slist++;
+        nb_tg_words_slist += nb_words_not_null;
+        //nb_tg_words_slist += nbtgsl;
+      }
+      if (!at_least_one_short) nb_ex_short++;
+      debug1("     all_slist=%d\n",all_in_slist);
+
+      n++;
+    }  // loop to get a bunch ef examples
+    debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n));
+
+#ifdef DEBUG2
+printf("network data:\n");
+REAL *iptr=buf_input;
+REAL *tptr=buf_target;
+for (int nn=0;nn<n;nn++) {
+   for (i=0;i<idim;i++) printf(" %f", *iptr++); printf(" -> ");
+   for (i=0;i<tg_nbphr;i++) printf(" %f", *tptr++); printf("\n");
+}
+#endif
+
+
+      // process the bunch by the neural network
+    if (n>0) {
+#ifdef BLAS_CUDA
+      Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+      Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice);
+#endif
+      mach->Forw(n,false); 
+      log_sum_cstm += errfct->CalcValue(n);
+    }
+
+      // get probas from CSLM or back-off LM
+#ifdef BLAS_CUDA
+    cudaMemcpy(host_output, mach->GetDataOut(), n*odim*sizeof(REAL), cudaMemcpyDeviceToHost);
+    REAL *optr=host_output;
+    Error("TrainerPhraseSlist1::TestDev TODO CUDA");
+#else
+    REAL *optr=mach->GetDataOut();	// n times (tg_nbphr*tg_slen) = odim values
+#endif
+
+    debug1("Collect n=%d\n", n);
+    if (n!=(int) done_by_cstm.size())
+      Error("TrainerPhraseSlist1::TestDev(): internal error, number of phrases done by CSTM does not match");
+
+    REAL *ptr_input = buf_input;	// n times idim values
+    for (int ni=0; ni<n; ni++) {
+      int nb_tg=0; // for normalization
+      if (done_by_cstm[ni]) {
+          // get proba from CSTM (removed renorm)
+          
+#define DUMP_PHRASE_TABLE
+#ifdef DUMP_PHRASE_TABLE
+          // create output phrase table
+        for (i=0;i<iaux;i++) {
+          if (buf_input[ni*idim+i] == NULL_WORD) break;
+          fspt << sr_wlist->GetWordInfo(buf_input[ni*idim+i]).word << " ";
+        }
+        fspt << "||| ";
+        for (i=0;i<tg_nbphr;i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+          fspt << tg_wlist->GetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " ";
+        }
+        fspt << "||| ";
+#endif
+
+        logP=0;
+        REAL *optr2=optr;
+        for (i=0; i<tg_nbphr; i++) {
+          if (buf_target_wid[i+ni*tg_nbphr] == NULL_WORD) break;
+          logP += safelog(optr2[buf_target_wid[i+ni*tg_nbphr]]); // no error check on indices necessary here
+          nb_tg++;
+#ifdef DUMP_PHRASE_TABLE2
+	  fspt << optr2[buf_target_wid[i+ni*tg_nbphr]] << " ";  
+#endif
+          optr2+=dim_per_phrase;
+        }
+#ifdef DUMP_PHRASE_TABLE
+        fspt << logP/nb_tg << endl;
+#endif
+
+#ifdef DUMP_PHRASE_TABLE_NBEST
+        GetMostLikelyTranslations(fspt,optr,ni);
+#endif
+
+        nb_probs+=i;
+        debug1(" CSLM: logP=%e\n", logP);
+      }
+      else {
+          // request proba from Moses phrase-table
+#if 1
+        debug0("create textual phrase pair for external phrase table (word + index)\n");
+        src_phrase.clear();
+        debug0("  source:");
+        for (i=0; i<iaux && ptr_input[i]!=NULL_WORD; i++) {
+          src_phrase.push_back(sr_wlist->GetWordInfo((uint) ptr_input[i]).word);	// TODO: char* to string
+          debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]);
+        }
+        tgt_phrase.clear();
+        debug0("  target:");
+        for (i=0; i<tg_nbphr && buf_target_ext[i+ni*tg_nbphr]!=NULL_WORD; i++) {
+          tgt_phrase.push_back(tg_wlist->GetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word);	// TODO: char* to string
+          debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]);
+        }
+#ifdef BACKWRAD_TM
+        logP = safelog(ptable.GetProb(tgt_phrase, src_phrase));
+#else
+        logP = safelog(ptable.GetProb(src_phrase, tgt_phrase));
+#endif
+        nb_probs++;
+        debug1("  => logP=%e\n",logP);
+#else
+        logP=1;
+#endif
+      }
+
+      log_sum += logP;
+      ptr_input += idim;  // next example in bunch at input
+      optr += odim;  // next example in bunch at output
+      if (fname) {
+        fs << ((nb_tg>0) ? logP/nb_tg : -1) << endl;
+      }
+    }
+
+    nb_ex += n;
+    debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex));
+  } while (data_available);
+
+  printf(" %d target words in %d phrases (%d=%.2f%% uncomplete), CSTM: %d target words in %d phrases (%.2f%%)\n",
+         nb_tg_words, nb_ex, 
+         nb_ex_short, 100.0*nb_ex_short/nb_ex,
+         nb_tg_words_slist, nb_ex_slist, 100.0*nb_ex_slist/nb_ex);
+
+ 
+  REAL px = (nb_probs>0) ? exp(-log_sum / (REAL) nb_probs) : -1;
+  printf("   cstm px=%.2f, ln_sum=%.2f, overall px=%.2f (%d values)\n",
+        (nb_tg_words_slist>0) ? exp(-log_sum_cstm / (REAL) nb_tg_words_slist) : -1, log_sum_cstm, px, nb_probs);
+
+  if (fname) fs.close();
+  fspt.close();
+
+  return px;
+}
+
+
+//**************************************************************************************
+// information after finishing an epoch
+
+void TrainerPhraseSlist1::InfoPost ()
+{   // prints the epoch summary on stdout; the "short" ratios are relative to nb_ex_slist, the CSTM ratio to nb_ex
+  const double pct_short_src = 100.0*nb_ex_short_inp/nb_ex_slist, pct_short_tgt = 100.0*nb_ex_short/nb_ex_slist;
+  const double pct_cstm = 100.0*nb_ex_slist/nb_ex;	// share of all phrases which is counted in nb_ex_slist
+  printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n",
+	nb_tg_words, nb_ex, pct_short_src, pct_short_tgt);
+  printf("   CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n",
+	nb_tg_words_slist, nb_ex_slist, pct_cstm, err_train);
+}
+
+//**************************************************************************************
+// request one n-gram probability; usually the call will be delayed
+// and processed later
+
+
+//**************************************************************************************
+// collect all delayed probability requests
+
+
+void TrainerPhraseSlist1::ForwAndCollect(vector< vector<string> > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos)
+{   // forwards the bs examples stored in buf_input, then scores the requests areq[req_beg..req_end] (both inclusive)
+  if (bs<=0) return;
+  debug3("TrainerPhraseSlist1::ForwAndCollect(): collecting outputs %d .. %d from bunch of size %d\n", req_beg, req_end, bs);
+  debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim);
+    // src_phrases[b] is handed to the external phrase table for bunch entry b, so one entry per example is required
+  if (bs != (int) src_phrases.size())
+    ErrorN("TrainerPhraseSlist1::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs);
+
+#ifdef DEBUG
+  printf("bunch of %d\n",bs);
+  for (int b=0; b<bs; b++) {
+    printf("%3d:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %.2f", buf_input[b*idim+ii]); printf("\n");
+  }
+#endif
+    // one forward pass for the whole bunch; in GPU builds the inputs are first copied to gpu_input
+  nb_forw++;
+#if defined(BLAS_CUDA) || defined(CUDA)	// accept both macros: the other GPU sections of this function test BLAS_CUDA
+  Gpu::SetConfig(mach->GetGpuConfig());
+  mach->SetDataIn(gpu_input);
+  Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice);
+#else
+  mach->SetDataIn(buf_input);
+#endif
+  mach->Forw(bs,false);
+
+#ifdef BLAS_CUDA
+  Gpu::MemcpyAsync(host_output, mach->GetDataOut(), bs*odim*sizeof(REAL), cudaMemcpyDeviceToHost);
+  Gpu::StreamSynchronize();
+#endif
+
+    // stats of this call, added to the class-wide counters at the end
+  int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0;
+
+  for (int n=req_beg; n<=req_end; n++) {
+    REAL logP=0;
+    int b=areq[n].bs;	// index of this request's example in the bunch (selects the input, output and source phrase)
+    if (b<0 || b>=bs) ErrorN("TrainerPhraseSlist1::ForwAndCollect(): bunch index (%d) of request is outside of the bunch (size %d)", b, bs);
+    if ((int) areq[n].tgph.size() > tg_nbphr)
+      ErrorN("TrainerPhraseSlist1::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr);
+
+#ifdef DEBUG
+    printf("collect b=%3d \n input:", b);
+    for (int ii=0; ii<idim; ii++) printf(" %f",buf_input[b*idim+ii]); printf("\n");
+#endif
+
+      // map target words to internal indices; the loop stops after the first word which is not in the short list
+    debug0(" output:");
+    bool all_in_slist=true;
+    int tw;
+    for (tw=0; all_in_slist && tw<tg_nbphr; tw++) {
+      WordID outp = areq[n].tgwid[tw];
+      debug1(" %d",outp);
+      if (outp==NULL_WORD) break;
+      cnt_tg_words++;
+      buf_target_wid[tw] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::ForwAndCollect() output");
+      debug1("->%d",buf_target_wid[tw]);
+      all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]);
+    }
+      // fill up the remaining target positions with NULL_WORD
+    for (; tw<tg_nbphr; tw++) {
+      debug0(" fill");
+      buf_target_wid[tw]=NULL_WORD;
+    }
+    debug1("    slist=%d\n",all_in_slist);
+
+#ifdef BLAS_CUDA
+    REAL *optr=host_output + b*odim;
+#else
+    REAL *optr=mach->GetDataOut() + b*odim;
+#endif
+
+    if (!all_in_slist) {
+        // get proba from external phrase table; take the log like TestDev() does, the CSLM branch below yields a log-proba too
+      logP=safelog(ptable.GetProb(src_phrases[b], areq[n].tgph));
+      debug1(" ptable: logP=%f\n", logP);
+    }
+    else {
+        // get proba from CSLM: average log-proba of the target words, word tw is read from the tw-th block of dim_per_phrase outputs
+      debug0(" -  in slist CSLM:");
+      logP=0; int cnt=0;
+      for (tw=0; tw<tg_nbphr; tw++) {
+        if (buf_target_wid[tw] == NULL_WORD) break;
+        debug1(" %e", optr[buf_target_wid[tw]]);
+        logP += safelog(optr[buf_target_wid[tw]]);
+        optr+=dim_per_phrase;
+        cnt++;
+      }
+      if (cnt==0) Error("no target phrases when collecting output");
+      logP /= cnt; // TODO: is this normalization correct ?
+      debug1(" -> log avr=%f\n",logP);
+
+      cnt_ex_slist++;
+      cnt_tg_words_slist += cnt;
+    }
+
+        // store the score as feature tm_pos of the hypothesis
+    areq[n].hyp->AddFeature(logP,tm_pos);
+  } // for (n=...)
+  const int nb_req = req_end-req_beg+1;	// number of requests handled; the ratios below are guarded against empty denominators
+  printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n",
+	 nb_req, cnt_tg_words, cnt_ex_slist, (nb_req>0) ? (float) 100.0* cnt_ex_slist / nb_req : 0.0f, (cnt_ex_slist>0) ? (float) cnt_tg_words_slist/cnt_ex_slist : 0.0f);
+  nb_ex += nb_req;
+  nb_ex_slist += cnt_ex_slist;
+  nb_tg_words_slist += cnt_tg_words_slist;
+  nb_tg_words += cnt_tg_words;
+}
+
+
+void TrainerPhraseSlist1::BlockStats() {
+    // summary of the block mode (see ForwAndCollect()): number of forward passes and the share
+    // of all processed phrases (nb_ex) which was counted as predicted by the CSTM (nb_ex_slist)
+  const double pct_cstm = 100.0 * nb_ex_slist/nb_ex;
+  printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n", nb_forw, nb_ex_slist, pct_cstm);
+}
diff --git a/TrainerPhraseSlist1.h b/TrainerPhraseSlist1.h
new file mode 100644
index 0000000..dad0a95
--- /dev/null
+++ b/TrainerPhraseSlist1.h
@@ -0,0 +1,105 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+ *
+ *
+ */
+
+#ifndef _TrainerPhraseSlist1_h
+#define _TrainerPhraseSlist1_h
+
+#include <ostream>
+#include "Tools.h"
+#include "Mach.h"
+#include "ErrFct.h"
+#include "DataPhraseBin.h"
+#include "Trainer.h"
+#include "WordList.h"
+
+#include "PtableMosesPtree.h"
+#include "AlignReq.h"
+
+//
+// Class to train neural networks to predict phrase probabilities
+//  - we use a short list of target words for which the NN predicts the proba
+//  - the proba of the other target words are obtained by a classical Moses phrase table
+//  - the NN also predicts the proba mass of ALL the words not in the short slist
+//    for this we use the last output neuron of the network
+
+
+//
+// helper class to store and compare one ngram LM request
+// ugly C-style structure, but this seems to be more efficient
+
+class TrainerPhraseSlist1 : public Trainer
+{
+private:
+  int		max_inp_idx;		// largest index -1 of a word at the input (# of entries in projection table)
+  int		tg_nbphr;		// number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr
+  int		dim_per_phrase;		// output dimension of each phrase prediction layer (must be equal size)
+  WordID	tg_slist_len;		// length of slist (this is set to dim_per_phrase MINUS ONE)
+  WordList	*sr_wlist;		// source word list, used to get the word for an input index
+  WordList	*tg_wlist;		// target word list, used to map target indices and to test for the short list
+  vector<Mach*> phrase_mach;	// pointer to the output machine for each phrase
+ 
+    // classical phrase table, queried for phrase pairs which are not handled by the network
+  PtableMosesPtree	ptable;
+
+    // various stats
+  int		nb_ex_slist;		// total number of examples processed in slist
+  int		nb_ex_short_inp;	// total number of incomplete input phrases
+  int		nb_ex_short;		// total number of incomplete target phrases
+  int		nb_tg_words;		// total number of target words (there can be several target words for a phrase pair)
+  int		nb_tg_words_slist;	// total number of target words which are in short list
+// TODO: use WordID vector for targets in order to make less casts 
+  WordID	*buf_target_wid;	// used instead of buf_target to avoid casts between REAL and WordID
+					// size is odim x bsize
+  WordID	*buf_target_ext;	// similar to buf_target_wid[], but keeps even word ids outside of the short list
+					// needed to request probas from external phrase table
+#ifdef DEBUG
+  vector<char*>  words;		// give UTF8 word for a given CSLM internal index
+#endif
+  REAL DoTestDev(char*, bool);	// internal helper function
+  void DoConstructorWork();	// internal helper function for the various constructors
+    // data and functions for block processing
+  int	nb_forw;		// stats on total number of forward passes
+  void GetMostLikelyTranslations(ofstream&,REAL*,int);	// called with (output stream, network outputs, index in bunch) when DUMP_PHRASE_TABLE_NBEST is defined
+protected:
+  virtual void InfoPost();			// dump information after finishing a training epoch
+public:
+  TrainerPhraseSlist1(Mach*, Lrate*, ErrFct*,	// mach, lrate, errfct
+	  const char*, const char*, const char*, int,	// train, dev, external phrase table, number of scores
+	  REAL =0, int =10, int =0);			// wdecay, max epochs, current epoch
+  TrainerPhraseSlist1(Mach*, ErrFct*, Data*,	// for testing only: mach, errfct, binary data
+	  char*, int);				// external phrase table, number of scores
+  TrainerPhraseSlist1(Mach*, WordList*, WordList*,	// for general proba calculation: mach, src word list, tgt word list
+	  char*, int , char*);			// external phrase table, number of scores, score specif
+  virtual ~TrainerPhraseSlist1();
+  virtual REAL Train();				// train for one epoch
+  virtual REAL TestDev(char* =NULL);		// test current network on dev data and save outputs into file
+    // fast block evaluation functions
+  virtual void StoreInput(int b, int d, REAL val) {buf_input[b*idim+d]=val;}	// value d of example b; buf_input holds idim values per example (not bsize)
+  virtual void ForwAndCollect(vector< vector<string> > &, AlignReq*, int,int,int,int);	// for nbest rescoring
+  virtual void BlockStats();				// display some stats on Block mode
+    // interface functions
+  virtual int GetTgtNbPhr() {return tg_nbphr; }
+  virtual int GetSlistLen() {return tg_slist_len; }
+  virtual REAL *GetBufInput() {return buf_input; }
+};
+
+#endif
diff --git a/docs/Descritpion-of-features.txt b/docs/Description-of-features.txt
similarity index 100%
rename from docs/Descritpion-of-features.txt
rename to docs/Description-of-features.txt
diff --git a/sort.cpp b/sort.cpp
new file mode 100644
index 0000000..2965593
--- /dev/null
+++ b/sort.cpp
@@ -0,0 +1,82 @@
+#include<vector>
+#include<math.h>
+#include <iostream>
+#include <algorithm>
+
+//simple exponential decay as length penalty in the log domain: log(0.8) per word of length difference (input length = output length: no penalty)
+REAL weight_lengths(std::size_t input_length, std::size_t output_length) {
+    return log(0.8) * ((input_length > output_length) ? input_length-output_length : output_length-input_length); // |difference| without unsigned wrap-around
+}
+
+
+//change data structure (vector of vectors of pairs) and prune number of hypotheses per length to N
+std::vector<std::vector<std::pair<REAL, std::size_t> > > prepare_hypotheses(REAL* scores, std::size_t maxLength, std::size_t vocab_size, std::size_t Nbest) {
+    // scores is read as maxLength consecutive rows of vocab_size values; row i yields ret[i] = its (at most) Nbest largest (score, word index) pairs, in no particular order
+    // outermost vector: one item per length
+    std::vector<std::vector<std::pair<REAL, std::size_t> > > ret;
+    ret.reserve(maxLength);
+    // for each length
+    for(std::size_t i = 0; i < maxLength; ++i){
+        std::vector<std::pair<REAL, std::size_t> > vec (vocab_size);
+        // for each word in the vocabulary
+        const REAL* row = scores + i*vocab_size; // first score of row i
+        for(std::size_t idx = 0; idx < vocab_size; ++idx){
+            vec[idx] = std::make_pair(row[idx],idx); //store probability and index
+        }
+
+        // prune to N most probable members; clamp first, forming vec.begin()+Nbest beyond vec.end() would be undefined
+        const std::size_t keep = std::min(Nbest,vec.size());
+        std::nth_element(vec.begin(), vec.begin()+keep, vec.end(), std::greater<std::pair<REAL, std::size_t> >());
+        vec.resize(keep);
+
+        ret.push_back(vec);
+    }
+    return ret;
+}
+
+std::vector<std::pair<REAL, std::vector<std::size_t> > > sort_ngrams(std::vector<std::vector<std::pair<REAL, std::size_t> > > scores, std::size_t input_length, std::size_t Nbest) {
+    // builds word sequences of length 1..scores.size() from the (probability, word index) pairs in scores[i], keeping at most Nbest per length
+    //stack of hypotheses for building next greater length, seeded with the empty hypothesis (log score 0)
+    std::vector<std::pair<REAL, std::vector<std::size_t> > > seed;
+    std::vector<std::size_t> tmp;
+    seed.push_back(std::make_pair(0,tmp));
+
+    std::vector<std::pair<REAL, std::vector<std::size_t> > > ret;
+
+    // for each n-gram length
+    for(std::size_t i = 0; i < scores.size(); ++i){
+
+        std::vector<std::pair<REAL, std::vector<std::size_t> > > scores_current;
+        scores_current.reserve(scores[i].size()*seed.size());
+        //for each word in vocab (already pruned in prepare_hypotheses)
+        for(std::size_t j = 0; j < scores[i].size(); ++j){
+
+            //for each hypothesis we kept from (n-gram-length-1)
+            for(std::size_t k = 0; k < seed.size(); ++k){
+
+                std::vector<size_t> tempvect (seed[k].second);
+                tempvect.push_back(scores[i][j].second);
+
+                scores_current.push_back(std::make_pair(seed[k].first + log(scores[i][j].first), tempvect));
+            }
+        }
+        //we only need Nbest hypotheses; clamp first, forming scores_current.begin()+Nbest beyond end() would be undefined
+        const std::size_t keep = std::min(Nbest,scores_current.size());
+        std::nth_element(scores_current.begin(), scores_current.begin()+keep, scores_current.end(), std::greater<std::pair<REAL, std::vector<std::size_t> > >());
+        seed.resize(keep);
+
+        REAL length_penalty = weight_lengths(input_length,i+1);
+        for(std::size_t j = 0; j < keep; ++j) {
+            ret.push_back(std::make_pair((scores_current[j].first+length_penalty)/(i+1), scores_current[j].second)); // normalized by length
+            seed[j] = scores_current[j]; // unnormalized; used to generate longer hypotheses
+        }
+
+    }
+
+    // compare n-grams of different lengths and return Nbest
+    std::sort(ret.begin(), ret.end(), std::greater<std::pair<REAL, std::vector<std::size_t> > >());
+    ret.resize(std::min(ret.size(),Nbest));
+
+    return ret;
+}
+