diff --git a/AlignReq.h b/AlignReq.h new file mode 100644 index 0000000..ea50686 --- /dev/null +++ b/AlignReq.h @@ -0,0 +1,40 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#ifndef _ALIGNREQ_H_ +#define _ALIGNREQ_H_ + +using namespace std; + +#include +#include "Hypo.h" + +struct AlignReq { + int sb, se; // requested alignment, we can use the word indices only since the source is constant for all hyps + vector tgph; // target phrase + WordID tgwid[16]; // mpped target wordID; TODO: this is an hack, we map many times the same target phrase + Hypo *hyp; // corresponding hypothesis + int bs; // index into bunch that will be processed by NN + float *logP; // log proba (may be several scores) +}; + +#endif diff --git a/Gpu.cu b/Gpu.cu new file mode 100644 index 0000000..673197b --- /dev/null +++ b/Gpu.cu @@ -0,0 +1,1799 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. 
+ * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +using namespace std; +#include +#include +#include +#include +#define RAISE raise(SIGINT); + +typedef float REAL; +#define NULL_WORD (-1) // from WordList.h +#define LOG_PROBA_NONE 999 // from ErrFact.h +#define LOCK_FNAME "/tmp/gpu_lock.pid%d.gpu%d" +#define LOCK_FNAME_LEN 256 // Hack ;-) + +#include +#include +#include +#include +#include "nvml.h" +#include "Gpu.cuh" +#include "Tools.h" //For Error() + + +// global variables +curandGenerator_t cuda_gen; +string cuda_user_list; // user specified list of GPUs +static REAL *gpu_result; +#define GPU_BUF_DIM 65536 +static REAL *gpu_buf; + +size_t Gpu::curDevIndex = (size_t)-1; ///< current device index +size_t Gpu::curConfIndex = (size_t)-1; ///< current configuration index +cudaStream_t Gpu::curStream = NULL; ///< current stream +bool Gpu::useConcurrentStreams = false; ///< status of concurrent streams +#ifdef GPU_CUBLAS_V2 +cublasHandle_t Gpu::curCbHandle = NULL; ///< current Cublas handle +#endif +cudaDeviceProp* Gpu::curDevProps = NULL; ///< device properties +vector Gpu::vDevices; ///< vector of Gpu devices to be used +vector Gpu::vConfigs; ///< vector of Gpu configurations + +void HandlerSigTERM(int s) +{ + printf("Catched signal: removing 
lock-files\n"); + Gpu::Unlock(); + exit(1); +} + +/** + * initializes Cuda and creates lock files + * @note selects first device and stream + * @returns configuration index 0 + */ +size_t Gpu::Init() +{ + size_t stId = 0; + if (0 >= Gpu::vConfigs.size()) { + Gpu::vConfigs.resize(1); + + cout << "Initializing Nvidia GPU card" << endl; + int dev_max = 0; + cudaGetDeviceCount(&dev_max); + bool bSelAuto = (':' != cuda_user_list[0]); + Gpu::Device dev; + if (0 < dev_max) { + if (1 == dev_max) + cout << " - found 1 card:" << endl; + else + cout << " - found " << dev_max << " cards:" << endl; + if (bSelAuto) + nvmlInit(); + nvmlDevice_t nd; + nvmlUtilization_t nu; + multimap mSelDev; + for (dev.number = 0 ; dev.number < dev_max ; dev.number++) { + cudaGetDeviceProperties(&dev.props, dev.number); + int nb_cores_per_multiprocessor = -1; + if(dev.props.major == 1 && (dev.props.minor == 0||dev.props.minor == 1||dev.props.minor == 2||dev.props.minor == 3)) + nb_cores_per_multiprocessor = 8; + else if(dev.props.major == 2 && dev.props.minor == 0) + nb_cores_per_multiprocessor = 32; + else if(dev.props.major == 2 && dev.props.minor == 1) + nb_cores_per_multiprocessor = 48; + else if(dev.props.major == 3 && (dev.props.minor == 0||dev.props.minor == 5)) + nb_cores_per_multiprocessor = 192; + + + printf(" %d: %s with %d CPUs x %d threads running at %4.2f Ghz, %d MBytes of memory, use -arch=sm_%d%d", + dev.number, dev.props.name, dev.props.multiProcessorCount, nb_cores_per_multiprocessor, + dev.props.clockRate/1000000.0, (int) (dev.props.totalGlobalMem/1024/1024), + dev.props.major, dev.props.minor); + if (bSelAuto) { + if ( (nvmlDeviceGetHandleByIndex(dev.number, &nd) == NVML_SUCCESS) + && (nvmlDeviceGetUtilizationRates( nd , &nu) == NVML_SUCCESS) ) + printf(", utilization %d%%", nu.gpu); + mSelDev.insert(make_pair(nu.gpu, dev)); + } + printf("\n"); + } + if (bSelAuto) { // select devices automatically + nvmlShutdown(); + int iMaxDev = 
std::min(std::max(atoi(cuda_user_list.c_str()), 0), dev_max); + for (multimap::const_iterator mmci = mSelDev.begin() ; 0 < iMaxDev-- ; mmci++) + Gpu::vDevices.push_back(mmci->second); + } + } + + if (!bSelAuto) { // read devices specified by user + char c; + istringstream iss; + iss.str(cuda_user_list); + while (iss.good()) { + iss >> c >> dev.number; + Gpu::vDevices.push_back(dev); + cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number); + } + if (iss.fail()) + ErrorN("format error in the selection of CUDA devices \"%s\"", cuda_user_list.c_str() + 1); + } + size_t dev_sel = Gpu::vDevices.size(); + switch (dev_sel) { + case 0: printf(" - no GPU device selected\n"); + dev.number = 0; + Gpu::vDevices.push_back(dev); + dev_sel = 1; + cudaGetDeviceProperties(&Gpu::vDevices.back().props, dev.number); + case 1: printf(" - using device %d\n", Gpu::vDevices[0].number); + cudaSetDevice(Gpu::vDevices[0].number); + break; + default: + if (dev_sel > (size_t)dev_max) { + printf(" - requested more GPU devices than available, using %d first ones\n", dev_max); + dev_sel = dev_max; + Gpu::vDevices.resize(dev_sel); + } + printf(" - using %lu devices in parallel:", dev_sel); + for (size_t d = 0 ; d < dev_sel ; d++) { + int n = Gpu::vDevices[d].number; + printf(" %d", n); + if ((n < 0) || (n >= dev_max)) + Error("illegal device identifier"); + } + printf("\n"); + cudaSetDevice(Gpu::vDevices[0].number); + } + + // initialize cublas and random generator + cublasInit(); + Gpu::CheckError("initialization of card\n"); + curandCreateGenerator(&cuda_gen, CURAND_RNG_PSEUDO_DEFAULT); + // curandSetPseudoRandomGeneratorSeed(cuda_gen, CUDA_SEED); + Gpu::CheckError("initialization of random generator\n"); + + // allocate buffers + gpu_buf = Gpu::Alloc(GPU_BUF_DIM*sizeof(REAL),"internal buffer on GPU"); + + // locking devices + ofstream lfs; + char lfname[LOCK_FNAME_LEN] = LOCK_FNAME; + for (size_t d = 0 ; d < dev_sel ; d++) { + sprintf(lfname, LOCK_FNAME, getpid(), 
Gpu::vDevices[d].number); + lfs.open(lfname,ios::out); + CHECK_FILE(lfs, lfname); + lfs << "Runing job " << getpid() << " on GPU " << Gpu::vDevices[d].number << endl; + lfs.close(); + } + + // catch signals to clean up lock-files + signal(SIGINT , HandlerSigTERM); + signal(SIGHUP , HandlerSigTERM); + signal(SIGFPE , HandlerSigTERM); + signal(SIGSEGV, HandlerSigTERM); + signal(SIGTERM, HandlerSigTERM); + + // create default configuration + Gpu::Config& newConfig = Gpu::vConfigs.back(); + Gpu::curDevIndex = newConfig.devId = 0; + Gpu::curConfIndex = stId; + newConfig.stream = NULL; +#ifdef GPU_CUBLAS_V2 + cublasCreate(&newConfig.cbHandle); + Gpu::curCbHandle = newConfig.cbHandle; +#endif + Gpu::curDevProps = &Gpu::vDevices[0].props; + } + return stId; +} + +/** + * removes lock-files and deletes all configurations + */ +void Gpu::Unlock() +{ + // remove lock-files + Gpu::curDevIndex = (size_t)-1; + char lfname[LOCK_FNAME_LEN] = LOCK_FNAME; + for (std::vector::iterator id = Gpu::vDevices.begin() ; id != Gpu::vDevices.end() ; id++) { + sprintf(lfname, LOCK_FNAME, getpid(), id->number); + if (unlink(lfname)) + cerr << " - ERROR: removing lock file " << lfname << endl; + } + + // destroy streams + Gpu::curConfIndex = (size_t)-1; + Gpu::curStream = NULL; + Gpu::useConcurrentStreams = false; +#ifdef GPU_CUBLAS_V2 + Gpu::curCbHandle = NULL; +#endif + Gpu::curDevProps = NULL; + Gpu::vDevices.clear(); + for (std::vector::iterator igc = Gpu::vConfigs.begin() ; igc != Gpu::vConfigs.end() ; igc++) { + if (NULL != igc->stream) + cudaStreamDestroy(igc->stream); +#ifdef GPU_CUBLAS_V2 + if (NULL != igc->cbHandle) + cublasDestroy(igc->cbHandle); +#endif + } + Gpu::vConfigs.clear(); +} + + +/** + * creates a new Gpu stream on next device + * @note selects the next device and the new stream + * @returns new configuration index + */ +size_t Gpu::NewConfig() +{ + size_t stId = Gpu::vConfigs.size(); + if (0 < stId) { + Gpu::useConcurrentStreams |= (Gpu::vDevices.size() <= (0.8 * (stId + 
1))); + Gpu::vConfigs.resize(stId + 1); + Gpu::Config& newConfig = Gpu::vConfigs.back(); + newConfig.devId = ((Gpu::curDevIndex + 1) % Gpu::vDevices.size()); + newConfig.stream = NULL; +#ifdef GPU_CUBLAS_V2 + newConfig.cbHandle = NULL; +#endif + Gpu::ChangeConfig(stId); + return stId; + } + else + return Gpu::Init(); +} + +/** + * changes current configuration + * @param stCfg index of configuration to use + */ +void Gpu::ChangeConfig(size_t stCfg) +{ + Gpu::curConfIndex = stCfg; + Gpu::Config& config = Gpu::vConfigs[Gpu::curConfIndex]; + if (Gpu::curDevIndex != config.devId) { + Gpu::curDevIndex = config.devId; + cudaSetDevice(Gpu::vDevices[Gpu::curDevIndex].number); + Gpu::curDevProps = &Gpu::vDevices[Gpu::curDevIndex].props; + } +#ifdef GPU_CUBLAS_V2 + if (NULL == config.cbHandle) + cublasCreate(&config.cbHandle); + if (Gpu::useConcurrentStreams && (NULL == config.stream)) { + cudaStreamSynchronize(NULL); + cudaStreamCreate(&config.stream); + cublasSetStream(config.cbHandle, config.stream); + } + if (Gpu::curStream != config.stream) { + Gpu::curStream = config.stream; + nppSetStream(Gpu::curStream); + } + Gpu::curCbHandle = config.cbHandle; + debug4("Gpu::ChangeConfig cfg=%zu dev=%d str=%x cbh=%x\n", Gpu::curConfIndex, Gpu::vDevices[Gpu::curDevIndex].number, Gpu::curStream, Gpu::curCbHandle); +#endif +} + +/** + * sets current device with default stream + * @param stDevId device index + */ +void Gpu::SetDevice(size_t stDevId) +{ + Gpu::curConfIndex = (size_t)-1; + if (Gpu::curDevIndex != stDevId) { + Gpu::curDevIndex = (stDevId % Gpu::vDevices.size()); + cudaSetDevice(Gpu::vDevices[Gpu::curDevIndex].number); + Gpu::curDevProps = &Gpu::vDevices[Gpu::curDevIndex].props; + } +#ifdef GPU_CUBLAS_V2 + if (NULL != Gpu::curStream) { + Gpu::curStream = NULL; + nppSetStream(Gpu::curStream); + } + Gpu::curCbHandle = NULL; +#endif +} +

/**
 * allocates memory on Gpu and checks error
 * @param dim number of elements to allocate, each of CUDA_SIZE bytes
 * @param msg description of the buffer, inserted into the error message
 * @returns pointer to the allocated memory, or NULL if dim is not positive
 * @note a failed allocation is reported through Gpu::CheckError() and Error()
 */
REAL* Gpu::Alloc(int dim, const char* msg) {
  if (dim <= 0)
    return NULL;

  // snprintf() keeps the message inside err_msg whatever the length of msg;
  // the byte count is computed before dividing so that small buffers are not all reported as 0MB
  char err_msg[1024];
  snprintf(err_msg, sizeof(err_msg), "CUDA: can't allocate memory (%dMB) for %s",
           (int)((size_t)dim * sizeof(REAL) / 1024 / 1024), msg);

  // start from NULL so that the test below is meaningful when cublasAlloc() fails
  // without writing the pointer
  void* gpu_mem = NULL;
  cublasAlloc(dim, CUDA_SIZE, &gpu_mem);
#ifdef DEBUG
  int dev = -1;
  cudaGetDevice(&dev);
  debug3("allocated %ld at %p on device %d\n", dim * CUDA_SIZE, gpu_mem, dev);
#endif
  Gpu::CheckError(err_msg);
  if (NULL == gpu_mem)
    Error(err_msg);
  return (CUDA*)gpu_mem;
}

+/** + * checks error + * @param msg message to print in case of error + */ +void Gpu::CheckError(const char* msg) { + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + ErrorN("CUDA: ERROR %d in %s: %s\n", cublasGetError(), msg, cudaGetErrorString(err)); +} + + +// Corresponds to 2.0*numeric_limits::min() +__device__ REAL GPU_LOG_LOWER_BOUND = 2.35099e-38; +__device__ REAL gpu_safelog(REAL x) { return (xmaxThreadsDim[0], odim); + int n_blocks = std::min(Gpu::curDevProps->maxGridSize[0], bsize); + KernelMachTabForw<<>>(bsize, odim, gpu_data_in, gpu_t, gpu_data_out); +} + + +//----------------------------------------------- +// backward pass for MachTab +//----------------------------------------------- + +__global__ +void KernelMachTabBackw(const REAL lrate, const int bsize, const int odim, + REAL *gpu_data_in, REAL *gpu_t, REAL *gpu_grad_out) +{ + for (int b=blockIdx.x; bmaxThreadsDim[0], odim); + int n_blocks = std::min(Gpu::curDevProps->maxGridSize[0], bsize); + KernelMachTabBackw<<>>(lrate, bsize, odim, gpu_data_in, gpu_t, gpu_grad_out); +} + + +//----------------------------------------------- +// Softmax normalization +//----------------------------------------------- + +__global__ void KernelSoftmax(int M, int N, + const REAL * x, const int sx0, const int sx1, + REAL * sm, const int sm_s0, const int sm_s1) +{ + extern __shared__ REAL buf[]; + for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += 
gridDim.x) { + REAL sum = 0; +#pragma unroll 16 + for (int i = threadIdx.x; i< N; i += blockDim.x){ + sum += exp(x[blockIDX * sx0 + i * sx1]); + } + buf[threadIdx.x] = sum; + __syncthreads(); + + // This function trashes buf[1..warpsize], leaving the reduction result in buf[0]. + if (threadIdx.x < warpSize){ +#pragma unroll 8 + for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize){ + buf[threadIdx.x] += buf[i]; + } + if (threadIdx.x < 16){ + //reduce so that threadIdx.x 0 has the sum of everything + if(threadIdx.x + 16 < N) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16]; + if(threadIdx.x + 8 < N) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8]; + if(threadIdx.x + 4 < N) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4]; + if(threadIdx.x + 2 < N) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2]; + if(threadIdx.x + 1 < N) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1]; + } + } + __syncthreads(); + REAL row_sum = buf[0]; +#pragma unroll 16 + for (int i = threadIdx.x; i< N; i += blockDim.x){ + sm[blockIDX * sm_s0 + i * sm_s1] = exp(x[blockIDX * sx0 + i * sx1]) / row_sum; + } + __syncthreads(); + } +} +void Gpu::MachSoftmaxForw(const int bsize, const int odim, REAL *gpu_data_out) +{ + if(0){ + //This is the original code that is know to work correctly in all case, + //But is slower. + nppsExp_32f_I(gpu_data_out, bsize*odim); + + REAL sum, *optr=gpu_data_out; + + for (int b=0; b we can use the sum_i (ABS(x_i)) + nppsMulC_32f_I(1.0/sum,optr,odim); + } + return; + } + + //int warpSize = 32; +//The follwing check need to access the GPU properties to do it. +//To don't do this access each time, we have done it in MachSoftmax.cpp +// if(warpSize != 32){ +// Error("Gpu::MachSoftmaxForw suppose the warpSize is 32. If run with a GPU with other warpSize" +// " like the current GPU, it will return wrong Results. 
You must update the reduction in KernelSoftmax"); +// } + int n_blocks = std::min(bsize, 32 * 1024); + int n_threads = std::min(odim, 512); + int n_shared_bytes = n_threads * sizeof(REAL); + if (bsize > 0){ + KernelSoftmax<<>>( + bsize, + odim, + gpu_data_out, + odim, //x.stride[0 + 1, //x.stride[1] + gpu_data_out, + odim, //sm.stride[0] + 1//sm.stride[1] + ); + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + printf("KernelSoftmax: n_blockn=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n", + n_blocks, n_threads, n_shared_bytes, odim); + Error(cudaGetErrorString(err)); + } + } +} + +//----------------------------------------------- +// Softmax stable normalization +//----------------------------------------------- + +__global__ void KernelSoftmaxStable(int M, int N, + const REAL * x, const int sx0, const int sx1, + REAL * sm, const int sm_s0, const int sm_s1) +{ + extern __shared__ REAL buf[]; + for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) { + REAL max_ = x[blockIDX * sx0 + threadIdx.x * sx1]; + for (int i = threadIdx.x + blockDim.x; i< N; i += blockDim.x) { + max_ = max(max_, x[blockIDX * sx0 + i * sx1]); + }; + buf[threadIdx.x] = max_; + __syncthreads(); + + // This function trashes buf[1..n_threads], leaving the reduction result in buf[0]. 
+ // Find the max to stabilize the softmax + if (threadIdx.x < warpSize) + { + for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize) { + buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]); + } + if (threadIdx.x < 16) { + //reduce so that threadIdx.x 0 has the max of everything + if(threadIdx.x + 16 < N) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+16]); + if(threadIdx.x + 8 < N) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+8]); + if(threadIdx.x + 4 < N) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+4]); + if(threadIdx.x + 2 < N) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+2]); + if(threadIdx.x + 1 < N) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+1]); + } + } + + __syncthreads(); + REAL row_max = buf[0]; + __syncthreads(); + REAL sum = 0; + for(int i=threadIdx.x; i we can use the sum_i (ABS(x_i)) + nppsMulC_32f_I(1.0/sum,optr,odim); + } + return; + } + //int warpSize = 32; +//The follwing check need to access the GPU properties to do it. +//To don't do this access each time, we have done it in MachSoftmaxStable.cpp +// if(warpSize != 32){ +// Error("Gpu::MachSoftmaxStableForw suppose the warpSize is 32. If run with a GPU with other warpSize" +// " like the current GPU, it will return wrong Results. 
You must update the reduction in KernelSoftmaxStable"); +// } + int n_blocks = std::min(bsize, 32 * 1024); + int n_threads = std::min(odim, 512); + int n_shared_bytes = n_threads * sizeof(REAL); + if (bsize > 0){ + KernelSoftmaxStable<<>>( + bsize, + odim, + gpu_data_out, + odim, //x.stride[0] + 1, //x.stride[1] + gpu_data_out, + odim, //sm.stride[0] + 1//sm.stride[1] + ); + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + printf("n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n", + n_blocks, n_threads, n_shared_bytes, odim); + Error(cudaGetErrorString(err)); + } + } +} + +//----------------------------------------------- +// Linear Rectifier units +//----------------------------------------------- + +__global__ +void KernelLinRectifForw(const int n, REAL *gpu_data_out) +{ + int tx = threadIdx.x; + int bx = blockIdx.x; + int n_threads = blockDim.x * gridDim.x; + int id = tx * blockDim.x + bx * gridDim.x; + for(int i = id; i < n; i += n_threads){ + if (gpu_data_out[i]<0) gpu_data_out[i]=0; + } +} + +void Gpu::LinRectifForw(const int n, REAL *gpu_data_out) +{ + int nb_thread = std::min(n, 256); + int nb_block = n / 256; + KernelLinRectifForw<<>>(n, gpu_data_out); +} + +__global__ +void KernelLinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out) +{ + int tx = threadIdx.x; + int bx = blockIdx.x; + int n_threads = blockDim.x * gridDim.x; + int id = tx * blockDim.x + bx * gridDim.x; + for(int i = id; i < n; i += n_threads){ + if (gpu_data_out[i]<0) gpu_grad_out[i]=0; else gpu_grad_out[i]=1; + } +} + +void Gpu::LinRectifBackw(const int n, REAL *gpu_data_out, REAL *gpu_grad_out) +{ + int nb_thread = std::min(n, 256); + int nb_block = n / 256; + KernelLinRectifBackw<<>>(n, gpu_data_out, gpu_grad_out); +} + +//----------------------------------------------- +// Helper functions for drop-out +//----------------------------------------------- + +__global__ +void KernelDropOut(const int n, REAL *gpu_vect, REAL *rand, REAL thresh) +{ + int tx 
= threadIdx.x; + int bx = blockIdx.x; + int n_threads = blockDim.x * gridDim.x; + int id = tx * blockDim.x + bx * gridDim.x; + for (int i = id; i < n; i += n_threads) { + if (rand[i]>>(n, gpu_vect, rand, thresh); +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcValue +//----------------------------------------------- + +__global__ +void KernelErrFctSoftmCrossEntNgramCalcValue(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, + REAL *gpu_res) +{ + extern __shared__ REAL buf[]; + REAL err=0.0; + for (int b=threadIdx.x ; bmaxThreadsDim[0], bsize); + KernelErrFctSoftmCrossEntNgramCalcValue<<<1, n_threads, n_threads*sizeof(REAL), Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_result); + cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); + return res; +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcValueNull +//----------------------------------------------- + +__global__ +void KernelErrFctSoftmCrossEntNgramCalcValueNull(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, + REAL *gpu_res) +{ + extern __shared__ REAL buf[]; + REAL err=0.0; + for (int b=threadIdx.x ; bmaxThreadsDim[0], bsize); + KernelErrFctSoftmCrossEntNgramCalcValueNull<<<1, n_threads, n_threads*sizeof(REAL), Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_result); + cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); + return res; +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcValueBatch +//----------------------------------------------- + +__global__ +void KernelErrFctSoftmCrossEntNgramCalcValueBatch(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *tmp_buf) +{ + //extern __shared__ REAL buf[]; + for (int b=threadIdx.x ; b 
GPU_BUF_DIM) + Error("Gpu::ErrFctSoftmCrossEntNgramCalcValueBatch(): odim (%d) is larger than internal buffer (%d)"); //,odim,GPU_BUF_DIM); + int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize); + KernelErrFctSoftmCrossEntNgramCalcValueBatch<<<1, n_threads, 0, Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_target, gpu_buf); + cudaMemcpyAsync(res_vect, gpu_buf, bsize*sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcMax +//----------------------------------------------- + +void Gpu::ErrFctSoftmCrossEntNgramCalcMax(const int eff_bsize, const int dim, REAL *output, REAL *target, REAL *res, int *pos) +{ + Error("TODO: Gpu::ErrFctSoftmCrossEntNgramCalcMax()"); +} + +#if 0 // not used anymore, use CalcvalueBatch() instead +__global__ +void KernelErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target, REAL *gpu_res) +{ + int tidx = (int) gpu_target[idx]; // do not cast to uint ! Otherwise, nvcc will transform the -1 to 0! 
+ if (tdx<0) // NULL_WORD + *gpu_res=-1; + else + *gpu_res = gpu_safelog(gpu_data_out[idx*odim + tidx]); +} + + +REAL Gpu::ErrFctSoftmCrossEntNgramCalcValueNth(const int idx, const int odim, REAL *gpu_data_out, REAL *gpu_target) +{ + REAL res; + if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL)); + KernelErrFctSoftmCrossEntNgramCalcValueNth<<<1, 1, 1*sizeof(REAL), Gpu::curStream>>>(idx, odim, gpu_data_out, gpu_target, gpu_result); + cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); + return res; +#endif + + +//----------------------------------------------- +// ErrFctSoftmClassCrossEntNgram::CalcWordClassError +//----------------------------------------------- + +__global__ +void KernelErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target, + REAL *gpu_res) +{ + int class_err=0; + REAL *ocptr=gpu_class_out; + REAL *tcptr=gpu_class_target; + for (int b=0; b max_oclass) { + argmax = i; + max_oclass = oclass_i; + } + } + if ((int) *tcptr != argmax) + class_err++; + + ocptr += n_classes; + tcptr++; + } + *gpu_res = (REAL) class_err; +} + +__global__ void KernelErrFctSoftmClassError2(const int bsize, const int n_classes, + REAL *gpu_class_out, REAL *gpu_class_target, REAL *gpu_res) +{ + extern __shared__ REAL buf[]; + buf[threadIdx.x] = 0; + for (int i = threadIdx.x; i < bsize; i += blockDim.x) { + int argmax = 0; + REAL max_oclass = gpu_class_out[i*n_classes]; + for (int j = 1; j < n_classes; j++) { + REAL oclass_j = gpu_class_out[i*n_classes + j]; + if (oclass_j > max_oclass) { + argmax = j; + max_oclass = oclass_j; + } + } + if ((int) gpu_class_target[i] != argmax) + buf[threadIdx.x] += 1; + } + __syncthreads(); + // Reduce sum into buf[0] + if (threadIdx.x < warpSize) { + for (int i = threadIdx.x + warpSize; i < blockDim.x; i += warpSize) { + buf[threadIdx.x] += buf[i]; + } + if (threadIdx.x < 16) { + if (threadIdx.x + 16 < 
n_classes) + buf[threadIdx.x] += buf[threadIdx.x + 16]; + if (threadIdx.x + 8 < n_classes) + buf[threadIdx.x] += buf[threadIdx.x + 8]; + if (threadIdx.x + 4 < n_classes) + buf[threadIdx.x] += buf[threadIdx.x + 4]; + if (threadIdx.x + 2 < n_classes) + buf[threadIdx.x] += buf[threadIdx.x + 2]; + if (threadIdx.x + 1 < n_classes) + buf[threadIdx.x] += buf[threadIdx.x + 1]; + } + } + if (threadIdx.x == 0) + *gpu_res = buf[0]; +} + +REAL Gpu::ErrFctSoftmClassError(const int bsize, const int n_classes, REAL *gpu_class_out, REAL *gpu_class_target) +{ + REAL res; + if (gpu_result==NULL) cudaMalloc(&gpu_result,sizeof(REAL)); + int n_threads = std::min(bsize, 512); + int n_blocks = bsize / n_threads + ((bsize % n_threads) ? 1 : 0); + int n_shared_bytes = n_threads * sizeof(REAL); + KernelErrFctSoftmClassError2<<>>( + bsize, n_classes, gpu_class_out, gpu_class_target, gpu_result); + cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); + return res; +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcGrad +//----------------------------------------------- +/** + * @note This kernel need many block to compute the grad but also need to do a reduction. + * The first block will do the reduction and compute the grad associated with it + * and all the other will compute the grad for other words. 
+ */ +__global__ +void KernelErrFctSoftmCrossEntNgramCalcGrad(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target, + REAL *gpu_res) +{ + if (blockIdx.x == 0) { + // the first block computes the error and grad for used words + extern __shared__ REAL buf[]; + REAL err=0.0; + for (int b=threadIdx.x; bmaxGridSize[0], bsize + 1); + int nb_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize); + int n_shared_bytes = nb_threads * sizeof(REAL); + KernelErrFctSoftmCrossEntNgramCalcGrad<<>>( + bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res); + + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGrad: %s", cudaGetErrorString(err)); + } +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcGradNull +//----------------------------------------------- +/** + * @note This kernel need many block to compute the grad but also need to do a reduction. + * The first block will do the reduction and compute the grad associated with it + * and all the other will compute the grad for other words. 
+ */ +__global__ +void KernelErrFctSoftmCrossEntNgramCalcGradNull(const int bsize, const int odim, + REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target, + REAL *gpu_res) +{ + if (blockIdx.x == 0) { + // the first block computes the error and grad for non NULL words + extern __shared__ REAL buf[]; + REAL err=0.0; + for (int b=threadIdx.x; b output at %p is %f, update grad at %p\n", b, tidx, &(gpu_data_out[b*odim + tidx]), gpu_data_out[b*odim + tidx], &(gpu_grad[b*odim+tidx])); + if (tidx != NULL_WORD) { + gpu_grad[b*odim + tidx] = (1.0f - gpu_grad[b*odim + tidx]); + err += gpu_safelog(gpu_data_out[b*odim + tidx]); + } + } + buf[threadIdx.x] = err; + __syncthreads(); + if (threadIdx.x == 0) { + for (int i=1; imaxGridSize[0], bsize + 1); + int nb_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize); + int n_shared_bytes = nb_threads * sizeof(REAL); + KernelErrFctSoftmCrossEntNgramCalcGradNull<<>>( + bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res); + + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGradNull: %s", cudaGetErrorString(err)); + } +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgram::CalcGradCumul +//----------------------------------------------- +/** + * @note This kernel need many block to compute the grad but also need to do a reduction. + * The first block will do the reduction and compute the grad associated with it + * and all the other will compute the grad for other words. 
+ */ +__global__ +void KernelErrFctSoftmCrossEntNgramCalcGradCumul(const int bsize, const int odim, REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target, + REAL *gpu_res) +{ + if (blockIdx.x == 0) { + // the first block computes the error and grad for used words + extern __shared__ REAL buf[]; + REAL err=0.0; + unsigned int tidx; + + for (int b=threadIdx.x ; bmaxGridSize[0], bsize + 1); + int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize); + KernelErrFctSoftmCrossEntNgramCalcGradCumul<<>>(bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_result); + Error("Gpu::ErrFctSoftmCrossEntNgramCalcGradCumul not finished!"); + + //REAL res; + //cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + //cudaStreamSynchronize(Gpu::curStream); + //return res; +} + +//----------------------------------------------- +// ErrFctSoftmCrossEntNgramMulit::CalcGrad +//----------------------------------------------- +/** + * @note This kernel need many block to compute the grad but also need to do a reduction. + * The first part of blocks will do the reduction and compute the grad associated with it + * and all the other will compute the grad for other words. 
+ */ +__global__ +void KernelErrFctSoftmCrossEntNgramMultiCalcGrad(const int bsize, const int dim, const int nb, + REAL *gpu_data_out, REAL *gpu_grad, REAL *gpu_target, + REAL *gpu_res) +{ + if (blockIdx.y == 0) { + if (threadIdx.x < nb) { + // the first part of blocks computes the error and grad for non NULL words + extern __shared__ REAL buf[]; + REAL err=0.0; + for (int b=blockIdx.x; b err=%e, grad@target=%e\n", b, n, tidx, gpu_data_out[(b*nb+n)*dim + tidx], err, gpu_grad[(b*nb+n)*dim + tidx]); + } + else { + debug4("grad ngram-multi: b=%d, n=%d, tidx=NULL, out=%f -> err=%e\n", b, n, gpu_data_out[(b*nb+n)*dim + tidx], err); + } + } + buf[threadIdx.x] = err; + __syncthreads(); + if (threadIdx.x == 0) { + for (int i=1; (imaxThreadsDim[0]); + int n_shared_bytes = std::min(nb, nb_threads) * sizeof(REAL); + dim3 nb_blocks(std::min( bsize, Gpu::curDevProps->maxGridSize[0]), + std::min(nb + 1, Gpu::curDevProps->maxGridSize[1])); + KernelErrFctSoftmCrossEntNgramMultiCalcGrad<<>>( + bsize, dim, nb, gpu_data_out, gpu_grad, gpu_target, gpu_result); + sts = cudaGetLastError(); + if (cudaSuccess != sts) + { + printf(cudaGetErrorString(sts)); + Error("KernelErrFctSoftmCrossEntNgramMultiCalcGrad cuda error: "); + } + REAL res; + cudaMemcpyAsync(&res, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream); + cudaStreamSynchronize(Gpu::curStream); + + return res; +} + + +//----------------------------------------------- +// MachSoftmaxClass +//----------------------------------------------- +// Forw +/* This function performs the equivalent of various Gemv, with different sizes + and offsets for each example in a minibatch. */ +__global__ void KernelLinForwOffset(const int bsize, const int idim, const int odim, + REAL* input, REAL* weights, REAL* bias, REAL* output, + int* class_info) +{ + // Each block corresponds to one (or more) sub-vector of the output. Each thread + // corresponds to one of its elements. 
+ // Axis x of the grid corresponds to the output rows: if sizes takes large values, + // j will need to go beyond gridDim.x * blockDim.x + // Axis y of the grid corresponds to the batch size. + + extern __shared__ REAL buf[]; + + for (int i = blockIdx.y; i < bsize; i += gridDim.y) { + int offset = class_info[2*i]; + int size = class_info[2*i+1]; + REAL* in_vec = input + i*idim; + + // Copy in_vec into shared memory, so all threads in this block can access it faster + for (int k = threadIdx.x; k < idim; k += blockDim.x) { + buf[k] = in_vec[k]; + } + __syncthreads(); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < size; j += blockDim.x * gridDim.x) { + // Compute one (vector-vector) dot product + REAL dot = bias[offset + j]; + REAL* w_vec = weights + offset + j; + for (int k = 0; k < idim; k++) { + dot += buf[k] * w_vec[k*odim]; + } + output[i*odim + offset + j] = dot; + } + } +} + +void Gpu::MachSoftmaxClassLinForw(const int bsize, const int idim, const int odim, + REAL* input, REAL* weights, REAL* bias, REAL* output, + int* class_info, const int max_size) +{ + debug4("bsize: %d, idim: %d, odim: %d, max_size: %d\n", bsize, idim, odim, max_size); + int n_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], max_size); + int n_blocks_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize); + int n_blocks_x = std::min(Gpu::curDevProps->maxGridSize[0], max_size/n_threads + (max_size%n_threads==0?0:1)); + int n_shared_bytes = idim*sizeof(REAL); + dim3 n_blocks(n_blocks_x, n_blocks_y); + + debug3("n_threads: %d, n_blocks: (%d, %d)\n", n_threads, n_blocks_x, n_blocks_y); + KernelLinForwOffset<<>>( + bsize, idim, odim, input, weights, bias, output, class_info); + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + printf("KernelLinForwOffset: n_blocks=(%d, %d), n_threads=%d, shared=%d bytes\n", + n_blocks_x, n_blocks_y, n_threads, n_shared_bytes); + Error(cudaGetErrorString(err)); + } +} + +__global__ void KernelBatchedSoftmaxOffset(int M, + const 
REAL * x, const int sx0, const int sx1, + REAL * sm, const int sm_s0, const int sm_s1, + int * offsets, const int offsets_s, + int * sizes, const int sizes_s) +{ + extern __shared__ REAL buf[]; + for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) { + REAL sum = 0; + int offset = offsets[blockIDX * offsets_s]; + int size = sizes[blockIDX * sizes_s]; +#pragma unroll 16 + for (int i = threadIdx.x; i < size; i += blockDim.x) { + sum += exp(x[blockIDX * sx0 + (offset + i) * sx1]); + } + buf[threadIdx.x] = sum; + __syncthreads(); + + // This function trashes buf[1..warpsize], leaving the reduction result in buf[0]. + if (threadIdx.x < warpSize){ +#pragma unroll 8 + for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){ + buf[threadIdx.x] += buf[i]; + } + if (threadIdx.x < 16){ + //reduce so that threadIdx.x 0 has the sum of everything + if (threadIdx.x + 16 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16]; + if (threadIdx.x + 8 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8]; + if (threadIdx.x + 4 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4]; + if (threadIdx.x + 2 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2]; + if (threadIdx.x + 1 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1]; + } + } + __syncthreads(); + REAL row_sum = buf[0]; +#pragma unroll 16 + for (int i = threadIdx.x; i < size; i += blockDim.x){ + sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1]) / row_sum; + } + __syncthreads(); + } +} + +__global__ void KernelBatchedSoftmaxStableOffset(int M, + const REAL * x, const int sx0, const int sx1, + REAL * sm, const int sm_s0, const int sm_s1, + int * offsets, const int offsets_s, + int * sizes, const int sizes_s) +{ + extern __shared__ REAL buf[]; + for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x) { + int offset = offsets[blockIDX * offsets_s]; + int size = 
sizes[blockIDX * sizes_s]; + REAL max_ = x[blockIDX * sx0 + (offset + threadIdx.x) * sx1]; + for (int i = threadIdx.x + blockDim.x; i < size; i += blockDim.x) { + max_ = max(max_, x[blockIDX * sx0 + (offset + i) * sx1]); + }; + buf[threadIdx.x] = max_; + __syncthreads(); + + // This function trashes buf[1..n_threads], leaving the reduction result in buf[0]. + // Find the max to stabilize the softmax + if (threadIdx.x < warpSize) + { + for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize) { + buf[threadIdx.x] = max(buf[threadIdx.x], buf[i]); + } + if (threadIdx.x < 16) { + //reduce so that threadIdx.x 0 has the max of everything + if (threadIdx.x + 16 < size) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+16]); + if (threadIdx.x + 8 < size) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+8]); + if (threadIdx.x + 4 < size) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+4]); + if (threadIdx.x + 2 < size) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+2]); + if (threadIdx.x + 1 < size) + buf[threadIdx.x] = max(buf[threadIdx.x], buf[threadIdx.x+1]); + } + } + __syncthreads(); + REAL row_max = buf[0]; + __syncthreads(); + + REAL sum = 0; +#pragma unroll 16 + for (int i = threadIdx.x; i < size; i += blockDim.x) { + sum += exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max); + } + buf[threadIdx.x] = sum; + __syncthreads(); + + // This function trashes buf[1..warpsize], leaving the reduction result in buf[0]. 
+ if (threadIdx.x < warpSize){ +#pragma unroll 8 + for (int i = threadIdx.x + warpSize; i < blockDim.x && i < size; i += warpSize){ + buf[threadIdx.x] += buf[i]; + } + if (threadIdx.x < 16) { + //reduce so that threadIdx.x 0 has the sum of everything + if (threadIdx.x + 16 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+16]; + if (threadIdx.x + 8 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+8]; + if (threadIdx.x + 4 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+4]; + if (threadIdx.x + 2 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+2]; + if (threadIdx.x + 1 < size) + buf[threadIdx.x] = buf[threadIdx.x] + buf[threadIdx.x+1]; + } + } + __syncthreads(); + REAL row_sum = buf[0]; +#pragma unroll 16 + for (int i = threadIdx.x; i < size; i += blockDim.x){ + sm[blockIDX * sm_s0 + (offset + i) * sm_s1] = exp(x[blockIDX * sx0 + (offset + i) * sx1] - row_max) / row_sum; + } + __syncthreads(); + } +} + +void Gpu::MachSoftmaxClassSoftmForw(const int bsize, const int odim, REAL* gpu_data_out, + int* class_info, const int max_size, const int stable) +{ + int n_blocks = std::min(bsize, 32 * 1024); + int n_threads = std::min(max_size, 512); + int n_shared_bytes = n_threads * sizeof(REAL); + if (bsize > 0) { + if (stable) { + KernelBatchedSoftmaxStableOffset<<>>(bsize, + gpu_data_out, odim, 1, + gpu_data_out, odim, 1, + class_info, 2, + class_info + 1, 2); + } + else { + KernelBatchedSoftmaxOffset<<>>(bsize, + gpu_data_out, odim, 1, + gpu_data_out, odim, 1, + class_info, 2, + class_info + 1, 2); + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + printf("KernelBatchedSoftmaxOffset: n_blocks=%d, n_threads=%d, n_shared_bytes=%d odim=%d\n", + n_blocks, n_threads, n_shared_bytes, odim); + Error(cudaGetErrorString(err)); + } + } + } +} + +__global__ void KernelBatchedSoftmCrossEntGradOffset(int M, + const REAL* x, const int sx0, const int sx1, + REAL* grad, const int sg0, const int sg1, + REAL* target, 
const int st, + int* offsets, const int so, + int* sizes, const int ss, + REAL* res) +{ + extern __shared__ REAL buf[]; + REAL err = 0.0f; + for (int i = threadIdx.x; i < M; i += blockDim.x) { + int offset = offsets[i * so]; + int size = sizes[i * ss]; + for (int j = 0; j < size; j++) { + grad[i * sg0 + (offset + j) * sg1] = - x[i * sx0 + (offset + j) * sx1]; + } + unsigned int tidx = (uint) target[i * st] - offset; + grad[i * sg0 + (offset + tidx) * sg1] += 1.0f; + err += gpu_safelog(x[i * sx0 + (offset + tidx) * sx1]); + } + buf[threadIdx.x] = err; + __syncthreads(); + + if (threadIdx.x == 0) { + for (int i = 1; i < blockDim.x; i++) { + err += buf[i]; + } + *res = err; + } +} + +void Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad(const int bsize, const int odim, + REAL* gpu_data_out, REAL* gpu_grad, REAL* gpu_target, int* class_info, REAL* gpu_res) +{ + int nb_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize); + int n_shared_bytes = nb_threads * sizeof(REAL); + KernelBatchedSoftmCrossEntGradOffset<<<1, nb_threads, n_shared_bytes, Gpu::curStream>>>(bsize, + gpu_data_out, odim, 1, + gpu_grad, odim, 1, + gpu_target, 1, + class_info, 2, + class_info + 1, 2, + gpu_res); + + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + ErrorN("Error in Gpu::ErrFctSoftmClassCrossEntNgramCalcGrad: %s", cudaGetErrorString(err)); + } +} + +__global__ void KernelLinGradInOffset(const int bsize, const int idim, + REAL* grad_out, const int sgo0, const int sgo1, + REAL* weights, const int sw0, const int sw1, + REAL* grad_in, const int sgi0, const int sgi1, + int* offsets, const int so, + int* sizes, const int ss) +{ + /* + Computes the a dot product (equivalent of gemv) on each row of grad_in, + using a different part of grad_out and weights each time (determined + from offsets and sizes). + Each row of grad_in (index i) corresponds to one blockIdx.y. + Columns of grad_in (lines of weights, index j) are split in groups + indexed by blockIdx.x. 
Each group has blockDim.y indices, each index + corresponds to a value of threadIdx.y. + For each (i, j), a scalar (vector-vector) dot product is computed, over + two vectors of length sizes[i], this sum is indexed by k. blockDim.x partial + sums are computed in parallel and stored in buf[threadIdx.y][threadIdx.x], + then a reduction steps computes the final dot product. + We use threadIdx.x as the fast-moving index to maximize coalesced memory + reads and writes. + */ + extern __shared__ REAL buf[]; + for (int i = blockIdx.y; i < bsize; i += gridDim.y) { + int offset = offsets[i * so]; + int size = sizes[i * ss]; + + REAL* ograd_vec = grad_out + i * sgo0; + REAL* buf_y = buf + blockDim.x * threadIdx.y; + for (int j = blockDim.y * blockIdx.x + threadIdx.y; j < idim; j += gridDim.x * blockDim.y) { + // Perform partially-summed dot product, stored in buf[] + REAL* w_vec = weights + j * sw0 + offset * sw1; + REAL dot = 0; + for (int k = threadIdx.x; k < size; k += blockDim.x) { + dot += ograd_vec[(offset + k) * sgo1] * w_vec[k * sw1]; + } + buf_y[threadIdx.x] = dot; + __syncthreads(); + + // Perform the final summation into the first columns of buf[] + // and accumulate the final result in grad_in + if (threadIdx.x < 16 && threadIdx.x + 16 < size) + buf_y[threadIdx.x] += buf_y[threadIdx.x + 16]; + if (threadIdx.x < 8 && threadIdx.x + 8 < size) + buf_y[threadIdx.x] += buf_y[threadIdx.x + 8]; + if (threadIdx.x < 4 && threadIdx.x + 4 < size) + buf_y[threadIdx.x] += buf_y[threadIdx.x + 4]; + if (threadIdx.x < 2 && threadIdx.x + 2 < size) + buf_y[threadIdx.x] += buf_y[threadIdx.x + 2]; + if (threadIdx.x == 0) + grad_in[i * sgi0 + j * sgi1] += buf_y[0] + buf_y[1]; + } + } +} + +void Gpu::MachSoftmaxClassLinGradIn(const int bsize, const int idim, const int odim, + REAL* grad_out, REAL* weights, REAL* grad_in, + int* class_info, const int max_size) +{ + int n_threads_x = Gpu::curDevProps->warpSize; // one warp + int n_threads_y = std::min(Gpu::curDevProps->maxThreadsPerBlock 
/ n_threads_x, Gpu::curDevProps->maxThreadsDim[1]); // Maximum possible + int n_blocks_x = std::min(Gpu::curDevProps->maxGridSize[0], idim / n_threads_y + (idim%n_threads_y==0?0:1)); + int n_blocks_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize); + int n_shared_bytes = n_threads_x * n_threads_y * sizeof(REAL); + dim3 n_threads(n_threads_x, n_threads_y); + dim3 n_blocks(n_blocks_x, n_blocks_y); + + KernelLinGradInOffset<<>>( + bsize, idim, + grad_out, odim, 1, + weights, odim, 1, + grad_in, idim, 1, + class_info, 2, + class_info + 1, 2); + + cudaError_t err = cudaGetLastError(); + if(cudaSuccess != err){ + ErrorN("Error in Gpu::MachSoftmaxClassLinGrad: %s", cudaGetErrorString(err)); + } +} + +__global__ void KernelLinGradUpdate(const int bsize, const int idim, + REAL* input, const int si0, const int si1, + REAL* grad_out, const int sg0, const int sg1, + REAL* weights, const int sw0, const int sw1, + REAL* bias, const int sb, + int* offsets, const int so, + int* sizes, const int ss, + const REAL lrate, const REAL wdecay) +{ + /* + Computes a series of rank-1 updates (equivalent of ger) on sub-matrices + of weights. Also performs updates on bias directly proportional to + the relevant sub-vectors of grad_out. + Each row of grad_out and of input (index k) corresponds to one blockIdx.y. + Rows of weights (columns of inputs, index i) split in groups indexed by + blockIdx.x. Each group has blockDim.y indices, each index corresponds to a + value of threadIdx.y. + Columns of weights and grad_out (index j) are iterated over with blockDim.x + parallel threads, indexed by threadIdx.x. + + Using blockDim.x == 1 warp seems to maximize speed. + + NOTE: Applying weight decay on the whole weight matrix would be too slow + (in the order of +50% execution time), so we apply it in this kernel, + only on the weights that were used for this minibatch. 
+ Since there is no atomic multiplication primitive, the value of weights we + read before the update may have already been updated (by another example in + the same minibatch), or not. It should not make a large difference. + */ + + + for (int k = blockIdx.y; k < bsize; k += gridDim.y) { + int offset = offsets[k * so]; + int size = sizes[k * ss]; + REAL* in_vec = input + k * si0; + REAL* grad_vec = grad_out + k * sg0 + offset * sg1; + + for (int i = blockIdx.x * blockDim.y + threadIdx.y; i < idim; i += gridDim.x * blockDim.y) { + REAL* w_vec = weights + i * sw0 + offset * sw1; + for (int j = threadIdx.x; j < size; j += blockDim.x) + { + REAL update = lrate * (in_vec[i * si1] * grad_vec[j * sg1] + // TODO: if wdecay > 0, this "+" sign should probably be a "-", + // but this is the convention used in MachLin.cpp. + + wdecay * w_vec[j]); + atomicAdd(w_vec + j * sw1, update); + } + + // Block with i == 0 also updates the bias + if (i == 0) + { + for (int j = threadIdx.x; j < size; j += blockDim.x) + atomicAdd(bias + (offset + j) * sb, lrate * grad_vec[j * sg1]); + } + } + } +} + +void Gpu::MachSoftmaxClassLinGradUpdate(const int bsize, const int idim, const int odim, + REAL* input, REAL* grad_out, + REAL* weights, REAL* bias, + int* class_info, const int max_size, + const REAL lrate, const REAL wdecay) +{ + int n_threads_x = Gpu::curDevProps->warpSize; // one warp + int n_threads_y = std::min(Gpu::curDevProps->maxThreadsPerBlock / n_threads_x, Gpu::curDevProps->maxThreadsDim[1]); // Maximum possible + int n_blocks_x = std::min(Gpu::curDevProps->maxGridSize[0], idim / n_threads_y + (idim%n_threads_y==0?0:1)); + int n_blocks_y = std::min(Gpu::curDevProps->maxGridSize[1], bsize); + dim3 n_threads(n_threads_x, n_threads_y); + dim3 n_blocks(n_blocks_x, n_blocks_y); + int n_shared_bytes = 0; + + KernelLinGradUpdate<<>>( + bsize, idim, + input, idim, 1, + grad_out, odim, 1, + weights, odim, 1, + bias, 1, + class_info, 2, + class_info + 1, 2, + lrate, + wdecay); +} + 
+//-----------------------------------------------
+// Copy
+//-----------------------------------------------
+
+/*
+ * Writes a copy of the N-element vector 'vec' into each of the M rows of the
+ * row-major matrix 'mat'. Rows are spread over blocks, columns over threads.
+ *
+ * NOTE(review): in this patch everything after "b" in the outer loop header
+ * had been lost (text between '<' and '>' was stripped). The loop bounds and
+ * the store are reconstructed from the kernel name and its arguments;
+ * confirm against the upstream Gpu.cu.
+ */
+__global__
+void KernelCopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N)
+{
+  for(int b = blockIdx.x; b < M; b += gridDim.x)
+    for(int i = threadIdx.x; i < N; i += blockDim.x)
+      mat[b * N + i] = vec[i];
+}
+
+/*
+ * Host wrapper: launches KernelCopyVectorToMatrix with at most one block per
+ * row and one thread per column, then reports a failed launch via ErrorN().
+ *
+ * NOTE(review): the signature is reconstructed from the debug4()/ErrorN()
+ * argument lists; the launch configuration (no dynamic shared memory,
+ * Gpu::curStream) was stripped and is presumed - confirm.
+ */
+void Gpu::CopyVectorToMatrix(REAL * mat, REAL * vec, const int M, const int N)
+{
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  debug4("Gpu::CopyVectorToMatrix(%p, %p %d %d)\n", mat, vec, M, N);
+  KernelCopyVectorToMatrix<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(mat, vec, M, N);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess)
+  { ErrorN("CUDA: ERROR %d in Gpu::CopyVectorToMatrix(%p, %p %d %d): %s\n",
+           cuda_stat, mat, vec, M, N, cudaGetErrorString(cuda_stat));
+  }
+}
+
+/*
+ * Copies each N-element row of the contiguous matrix 'src' into 'dst', whose
+ * consecutive rows start 'row_stride' elements apart.
+ *
+ * NOTE(review): body reconstructed (see KernelCopyVectorToMatrix) - confirm.
+ */
+__global__
+void KernelCopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  for(int b = blockIdx.x; b < M; b += gridDim.x)
+    for(int i = threadIdx.x; i < N; i += blockDim.x)
+      dst[b * row_stride + i] = src[b * N + i];
+}
+
+/*
+ * Copies each N-element row of 'src', whose consecutive rows start
+ * 'row_stride' elements apart, into the contiguous matrix 'dst'.
+ *
+ * NOTE(review): this kernel is launched by Gpu::CopyMatrixStridedToMatrix but
+ * its definition was entirely missing from the patch text; reconstructed as
+ * the mirror image of KernelCopyMatrixToMatrixStrided - confirm.
+ */
+__global__
+void KernelCopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  for(int b = blockIdx.x; b < M; b += gridDim.x)
+    for(int i = threadIdx.x; i < N; i += blockDim.x)
+      dst[b * N + i] = src[b * row_stride + i];
+}
+
+/*
+ * This copies each line of a contiguous matrix to another matrix that is strided
+ *
+ * NOTE(review): signature reconstructed from the kernel call and from the
+ * sibling Gpu::CopyMatrixStridedToMatrix; launch configuration presumed.
+ */
+void Gpu::CopyMatrixToMatrixStrided(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixToMatrixStrided<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess){
+    ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixToMatrixStrided: %s\n",
+           cuda_stat, cudaGetErrorString(cuda_stat));
+  }
+}
+
+/*
+ * This copies each line of a strided matrix to another matrix that is contiguous
+ *
+ * NOTE(review): launch configuration was stripped from the patch; presumed.
+ */
+void Gpu::CopyMatrixStridedToMatrix(REAL * dst, REAL * src, const int M, const int N, const int row_stride)
+{
+  int nb_blocks = std::min(M, Gpu::curDevProps->maxGridSize[0]);
+  int nb_threads = std::min(N, Gpu::curDevProps->maxThreadsDim[0]);
+  KernelCopyMatrixStridedToMatrix<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(dst, src, M, N, row_stride);
+  cudaError_t cuda_stat=cudaGetLastError();
+  if (cuda_stat != cudaSuccess){
+    ErrorN("CUDA: ERROR %d in Gpu::CopyMatrixStridedToMatrix: %s\n",
+           cuda_stat, cudaGetErrorString(cuda_stat));
+  }
+}
+
+//-----------------------------------------------
+// Multiple AXPY input row on one output row
+//-----------------------------------------------
+
+// Each block computes a fixed number of columns for the whole batch.
+// This allows coalesced reads and avoids the need for atomic operations.
+//
+// NOTE(review): the inner loop and the head of Gpu::BatchedAXPY were missing
+// from this patch (text between '<' and '>' had been stripped). The body below
+// presumes x holds nb_batch rows of n elements (element stride incx) that are
+// all accumulated, scaled by a, into the single row y - confirm against the
+// upstream Gpu.cu.
+__global__
+void KernelBatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                       REAL * y, const int incy, const int nb_batch){
+  for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n;
+      idx += blockDim.x*gridDim.x){
+    for(int b = 0; b < nb_batch; b++){
+      y[idx * incy] += a * x[(b * n + idx) * incx];
+    }
+  }
+}
+
+// Host wrapper for KernelBatchedAXPY.
+// NOTE(review): signature and nb_threads are reconstructed (the kernel call
+// passes exactly these names); the launch configuration is presumed.
+void Gpu::BatchedAXPY(const int n, const REAL a, REAL * x, const int incx,
+                      REAL * y, const int incy, const int nb_batch){
+  if (n <= 0) return; // nothing to do, and avoids dividing by nb_threads == 0
+  int nb_threads = std::min(n, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(Gpu::curDevProps->maxGridSize[0], n/nb_threads+(n%nb_threads==0?0:1));
+  nb_blocks = std::max(nb_blocks, 1);
+  KernelBatchedAXPY<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(n, a, x, incx, y, incy, nb_batch);
+
+}
+
+
+//-----------------------------------------------
+// Element-wise exponential
+//-----------------------------------------------
+__global__ void KernelElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  for (int idx=blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    gpu_data_out[idx] = exp(gpu_data_in[idx]);
+  }
+}
+
+/*
+ * Performs gpu_data_out[i] = exp(gpu_data_in[i]) for 0 <= i < size
+ * NOTE(review): launch configuration was stripped from the patch; presumed.
+ */
+void Gpu::ElemwiseExp(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return; // nb_threads would be 0 -> integer division by zero below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseExp<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+//-----------------------------------------------
+// Tanh and its gradient
+//-----------------------------------------------
+__global__ void KernelElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  for (int idx=blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    gpu_data_out[idx] = tanh(gpu_data_in[idx]);
+  }
+}
+
+__global__ void KernelElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL *gpu_grad_out, REAL *gpu_grad_in) {
+  for (int idx=blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    REAL data_out = gpu_data_out[idx];
+    gpu_grad_in[idx] = (1.0f - data_out * data_out) * gpu_grad_out[idx];
+  }
+}
+
+/*
+ * Performs gpu_data_out[i] = tanh(gpu_data_in[i]) for 0 <= i < size
+ * where tanh(x) = sinh/cosh = (exp x - exp -x) / (exp x + exp -x)
+ *               = (exp(2*x) - 1) / (exp(2*x) + 1)
+ * NOTE(review): launch configuration was stripped from the patch; presumed.
+ */
+void Gpu::ElemwiseTanh(const int size, REAL *gpu_data_in, REAL *gpu_data_out) {
+  if (size <= 0) return; // nb_threads would be 0 -> integer division by zero below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanh<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_in, gpu_data_out);
+}
+
+/*
+ * Performs gpu_grad_in[i] = (1 - gpu_data_out[i]**2) * gpu_grad_out[i]
+ * for 0 <= i < size
+ * which corresponds to the backpropagation of the gradient through tanh.
+ * NOTE(review): launch configuration was stripped from the patch; presumed.
+ */
+void Gpu::ElemwiseTanhGrad(const int size, REAL *gpu_data_out, REAL* gpu_grad_out, REAL *gpu_grad_in) {
+  if (size <= 0) return; // nb_threads would be 0 -> integer division by zero below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelElemwiseTanhGrad<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, gpu_data_out, gpu_grad_out, gpu_grad_in);
+}
+
+/*
+ * set GPU memory to a value - equivalent to memset() on CPU
+ */
+
+__global__ void KernelMemSet(const int size, REAL *adr, REAL val) {
+  for (int idx=blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += blockDim.x * gridDim.x) {
+    adr[idx] = val;
+  }
+}
+
+// Fills adr[0..size) with val.
+// NOTE(review): launch configuration was stripped from the patch; presumed.
+void Gpu::MemSet(REAL *adr, REAL val, int size) {
+  if (size <= 0) return; // nb_threads would be 0 -> integer division by zero below
+  int nb_threads = std::min(size, Gpu::curDevProps->maxThreadsDim[0]);
+  int nb_blocks = std::min(size/nb_threads + ((size%nb_threads ) == 0 ? 0 : 1), Gpu::curDevProps->maxGridSize[0]);
+  KernelMemSet<<<nb_blocks, nb_threads, 0, Gpu::curStream>>>(size, adr, val);
+}
+
+//-----------------------------------------------
+// Helpers
+//-----------------------------------------------
+
+// Stores val in the device-side result cell gpu_result (queued on Gpu::curStream).
+void Gpu::ResSet(REAL val) {
+  cudaMemcpyAsync(gpu_result, &val, sizeof(REAL), cudaMemcpyHostToDevice, Gpu::curStream);
+}
+
+// Reads the device-side result cell gpu_result back to the host.
+// The copy is queued on Gpu::curStream, the same stream that is synchronized
+// right after it, so it is ordered behind the work already queued there.
+// It used to be issued on the default stream, which the synchronization below
+// does not wait for.
+REAL Gpu::ResGet() {
+  REAL val;
+  cudaMemcpyAsync(&val, gpu_result, sizeof(REAL), cudaMemcpyDeviceToHost, Gpu::curStream);
+  cudaStreamSynchronize(Gpu::curStream);
+  return val;
+}
diff --git a/KENLM b/KENLM
deleted file mode 100644
index e69de29..0000000
diff --git a/NBest.cpp b/NBest.cpp
new file mode 100644
index 0000000..0b51e5a
--- /dev/null
+++ b/NBest.cpp
@@ -0,0 +1,585 @@
+/*
+ * This file is part of the continuous space language and translation model toolkit
+ * for statistical machine translation and large vocabulary speech recognition.
+ *
+ * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
+ *
+ * The CSLM toolkit is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version 3 as
+ * published by the Free Software Foundation
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + + +#include "NBest.h" +#include "Tools.h" + +#include +#include + +// blocks separated by '|||' +// 0: sentence id +// 1: hypthesis +// 2: feature functions +// 3: global score +// 4: phrase alignments, e.g. 0-1=0-1 2-4=2-3 5=4 + +bool NBest::ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int n, const bool need_alignments, const int aux_dim) +{ + static string line; // used internally to buffer an input line + static int prev_id=-1; // used to detect a change of the n-best ID + int new_id; + vector f; + vector blocks; + static REAL* aux_data=NULL; + REAL AuxValue; + vector aux_data_vec; + + if (line.empty()) { + getline(inpf,line); + if (inpf.eof()) return false; + if (0 < aux_dim) + { + if (!auxf) Error("Not enough auxiliary data available"); + for (int i = 0 ; i> AuxValue; + aux_data_vec.push_back(AuxValue); + if (auxf.eof()) return false; + } + } + } + else { + if (aux_data) + { + for (int i = 0 ; i(blocks[0]); + if (prev_id>=0 && new_id!=prev_id) { + if (!aux_data) aux_data = new REAL[aux_dim]; + int j=0; + for (vector::iterator x = aux_data_vec.begin(); x != aux_data_vec.end(); x++) { + aux_data[j]= *x; + j++; + } + prev_id=new_id; return false; + } // new nbest list has started + prev_id=new_id; + id=new_id; + //cerr << "same ID " << id << endl; + + if (n>0 && nbest.size() >= (uint) n) { + //cerr << "skipped" << endl; + line.clear(); + return true; // skip parsing of unused hypos + } + + // parse feature function scores + //cerr << "PARSE features: '" << blocks[2] << "' size: " << blocks[2].size() << endl; + pos=blocks[2].find_first_not_of(' '); + while (pos(feat)); + //cerr << " value: " << f.back() << endl; + } + pos=blocks[2].find_first_not_of(' ',epos+1); + } + //cerr << " FOUND " << f.size() << " features" << 
endl; + +#ifdef BOLT_NBEST + if (blocks.size()>4) { // copy all additional fields to the output + string extra_info; + for (size_t bb=4; bb(blocks[3]), extra_info, aux_data_vec, aux_dim) ); + } + else { + nbest.push_back(Hypo(id, blocks[1], f, Scan(blocks[3]), aux_data_vec, aux_dim) ); + } +#else + // eventually parse segmentation + if (blocks.size()>4) { + vector a; + pos=blocks[4].find_first_not_of(' '); + + debug1("parsing alignment in: %s\n", blocks[4].c_str()); + blocks[4].append(" "); // simplifies parsing + + //while (posalign_txt.size()) {cerr << align_txt; Error("format error in alignment (no target phrase)"); } + + uint pos2; + int sb,se,tb,te; + pos2=align_txt.rfind('-',tpos); + if (pos2>align_txt.size()) { + debug2(" src: pos %d-%d\n",0,tpos); + se=sb=Scan(align_txt.substr(0,tpos)); + } + else { + debug2(" sb: pos %d-%d\n",0,pos2); + sb=Scan(align_txt.substr(0,pos2)); + pos=pos2+1; pos2=align_txt.find('=',pos); + debug2(" se: pos %d-%d\n",pos,pos2); + if (pos2>align_txt.size()) {cerr << align_txt; Error("format error in alignment (end of source phrase)"); } + se=Scan(align_txt.substr(pos,pos2-pos)); + } + + tpos++; + pos2=align_txt.find('-',tpos); + if (pos2>align_txt.size()) { + debug1(" tgt: pos %d\n",tpos); + te=tb=Scan(align_txt.substr(tpos)); + } + else { + debug2(" tb: pos %d-%d\n",tpos,pos2); + tb=Scan(align_txt.substr(tpos,pos2-tpos)); + te=Scan(align_txt.substr(pos2+1)); + } + + if (sb<0 || se<0 || tb<0 || te<0 || sb>se || tb>te) {cerr << align_txt; Error("wrong numbers in alignment"); } + debug4(" result %d-%d = %d-%d\n", sb,se,tb,te); + a.push_back(Align(sb,se,tb,te)); + + pos=blocks[4].find_first_not_of(' ',epos+1); + } + + debug1("found %d phrases\n",(int) a.size()); + nbest.push_back(Hypo(id, blocks[1], f, Scan(blocks[3]), a, aux_data_vec, aux_dim) ); + } + else { + nbest.push_back(Hypo(id, blocks[1], f, Scan(blocks[3]), aux_data_vec, aux_dim) ); + } +#endif + + line.clear(); // force read of new line + return true; +} + + 
+NBest::NBest(inputfilestream &inpf, inputfilestream &auxf, const int n, const bool need_alignments, const int aux_dim) + : max_req(262144), nreq(0), nb_diff_align(0) +{ + debug0("NBEST: constructor called\n"); + areq = new AlignReq[max_req]; + //areq.reserve(max_req); + while (ParseLine(inpf, auxf, n, need_alignments, aux_dim)); +} + + +NBest::~NBest() +{ + debug0("NBEST: destructor called\n"); + nbest.clear(); + srcw.clear(); + if (areq) delete [] areq; + //areq.clear(); +} + +void NBest::Write(outputfilestream &outf, int n) +{ + if (n<1 || (uint) n>nbest.size()) n=nbest.size(); + for (int i=0; i::iterator i = nbest.begin(); i != nbest.end(); i++) { + (*i).CalcGlobal(w); + } +} + + +void NBest::Sort() { + sort(nbest.begin(),nbest.end()); +} + + +void NBest::AddID(const int o) +{ + for (vector::iterator i = nbest.begin(); i != nbest.end(); i++) { + (*i).AddID(o); + } +} + +void NBest::RescoreLM(NbestLM &lm, const int lm_pos) +{ + for (vector::iterator i = nbest.begin(); i != nbest.end(); i++) { + lm.RescoreHyp(*i,lm_pos); + } + lm.FinishPending(); +} + +#undef OLD +#ifdef OLD +void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos) +{ + // get a source line and segment into words + string src; + getline(srcf,src); + if (srcf.eof()) + ErrorN("EOF in source text for n-best hypothesis id=%d", id); + + srcw.clear(); + srcw = Moses::Tokenize(src); + + for (vector::iterator i = nbest.begin(); i != nbest.end(); i++) { + pt.RescoreHyp(*i,srcw,tm_pos); + } +} +#else + +void NBest::RescorePtable(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos) +{ + // get a source line and segment into words + string src; + getline(srcf,src); + if (srcf.eof()) + ErrorN("EOF in source text for n-best hypothesis id=%d", id); + + srcw.clear(); + srcw = Moses::Tokenize(src); + + int nscores = pt.GetNscores(); + debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos); + debug2("SRC with %d words: %s\n", (int) srcw.size(), src.c_str()); + + 
vector null_scores(nscores, 0.0); + + for (vector::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) { + // reset the features that will be modified in BlockFinish() + // we already append them here if requested + if (nscores>1) (*hi).SetFeature(null_scores, tm_pos); + else (*hi).SetFeature(0.0, tm_pos); + + hi->trgw = Moses::Tokenize(hi->trg); + for (vector::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) { + areq[nreq].sb = (*ali).sb; + areq[nreq].se = (*ali).se; + for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]); + areq[nreq].hyp=&(*hi); + if (++nreq >= max_req) BlockFinish(pt,tm_pos); + } + } + BlockFinish(pt,tm_pos); +} +#endif + +void NBest::RescorePtableInv(PtableMosesPtree &pt, ifstream &srcf, const int tm_pos) +{ + Error("NBest::RescorePtableInv"); + // get a source line and segment into words + string src; + getline(srcf,src); + if (srcf.eof()) + ErrorN("EOF in source text for n-best hypothesis id=%d", id); + + srcw.clear(); + srcw = Moses::Tokenize(src); + + int nscores = pt.GetNscores(); + debug2("NBest::RescorePtable(): %d scores at position %d\n", nscores, tm_pos); + debug2("SRC with %d words: %s\n", (int) srcw.size(), src.c_str()); + + vector null_scores(nscores, 0.0); + + for (vector::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) { + // reset the features that will be modified in BlockFinish() + // we already append them here if requested + if (nscores>1) (*hi).SetFeature(null_scores, tm_pos); + else (*hi).SetFeature(0.0, tm_pos); + + hi->trgw = Moses::Tokenize(hi->trg); + for (vector::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) { + areq[nreq].sb = (*ali).sb; + areq[nreq].se = (*ali).se; + for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]); + areq[nreq].hyp=&(*hi); + if (++nreq >= max_req) BlockFinish(pt,tm_pos); + } + } + BlockFinish(pt,tm_pos); +} + + // compare source and target phrases +int AlignReqComp(const void *v1, const void *v2) +{ + AlignReq* 
a1=(AlignReq*) v1, *a2=(AlignReq*) v2; + + if (a1->sb < a2->sb) return -1; + if (a1->sb > a2->sb) return 1; + if (a1->se < a2->se) return -1; + if (a1->se > a2->se) return 1; + if (a1->tgph.size() < a2->tgph.size()) return -1; + if (a1->tgph.size() > a2->tgph.size()) return 1; + for (int w=0; w<(int)a1->tgph.size(); w++) { + if (a1->tgph[w] < a2->tgph[w]) return -1; + if (a1->tgph[w] > a2->tgph[w]) return 1; + } + + return 0; // both are equal +} + + // compare source phrases only +int AlignReqCompSrc(const void *v1, const void *v2) +{ + AlignReq* a1=(AlignReq*) v1, *a2=(AlignReq*) v2; + + if (a1->sb < a2->sb) return -1; + if (a1->sb > a2->sb) return 1; + if (a1->se < a2->se) return -1; + if (a1->se > a2->se) return 1; + + return 0; // both are equal +} + + +float NBest::GetAlignProb(PtableMosesPtree &pt, AlignReq &aq, const int tm_pos, vector *logP_v) // TODO: param tm_pos is unused +{ + debug1("TGT: %s\n", aq.hyp->trg.c_str()); + debug4("ALIGN %d-%d = %s-%s\n", aq.sb, aq.se, aq.tgph[0].c_str(), aq.tgph.back().c_str()); + + if (aq.se >= (int) srcw.size()) Error("phrase table rescoring: last source word in phrase is out of bounds\n"); + + // build up current source phrase pair, TODO: switch to reference ? 
+ vector srcph; + for (int w=aq.sb; w<=aq.se; w++) srcph.push_back(srcw[w]); + + //printf("get Prob for %s..%s || %s..%s -> %f\n",srcw[0].c_str(),srcw.back().c_str(),trgw[0].c_str(),trgw.back().c_str,pt.GetProb(srcph,trgph)); + //printf("ALIGN %d-%d = %s-%s -> P=%f\n",aq.sb,aq.se,aq.tb,aq.te,pt.GetProb(srcph,trgph)); + if (logP_v) { + pt.GetProb(srcph,aq.tgph,logP_v); + for (vector::iterator fi=logP_v->begin(); fi!=logP_v->end(); fi++) *fi = log(*fi); + return (*logP_v)[0]; + } + else { + return log(pt.GetProb(srcph,aq.tgph)); + } +} + +void NBest::BlockFinish(PtableMosesPtree &pt, int tm_pos) +{ + debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size()); + + if (nreq==0) return; + + qsort(areq, nreq, sizeof(AlignReq), AlignReqComp); + + int nscores = pt.GetNscores(); + int cnt=1; + + if (tm_pos==0) tm_pos=areq[0].hyp->f.size()-nscores+1; // correct position in append mode + debug2("cumulating %d scores starting at position %d\n", nscores, tm_pos); + + // request phrase probas for the first alignment + if (nscores>1) { + vector logP_scores(nscores, 0.0); + debug4("request align 0: %d-%d %s-%s (several scores)\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str()); + GetAlignProb(pt,areq[0],tm_pos, &logP_scores); + areq[0].hyp->AddFeature(logP_scores,tm_pos); + + for (int n=1; n calculate new logP + debug5("request align %d: %d-%d %s-%s\n", cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str()); + GetAlignProb(pt,areq[n],tm_pos, &logP_scores); + cnt++; + } + //printf("add %f to hyp %s\n",logP,areq[n].hyp->trg.c_str()); + areq[n].hyp->AddFeature(logP_scores,tm_pos); // cumulate + } + } + else { + debug4("request align 0: %d-%d %s-%s\n",areq[0].sb,areq[0].se,areq[0].tgph[0].c_str(),areq[0].tgph.back().c_str()); + float logP = GetAlignProb(pt,areq[0],tm_pos); + areq[0].hyp->AddFeature(logP,tm_pos); + + for (int n=1; n calculate new logP + debug5("request align %d: %d-%d %s-%s\n", 
cnt,areq[n].sb,areq[n].se,areq[n].tgph[0].c_str(),areq[n].tgph.back().c_str()); + logP = GetAlignProb(pt,areq[n],tm_pos); + cnt++; + } + //printf("add %f to hyp %s\n",logP,areq[n].hyp->trg.c_str()); + areq[n].hyp->AddFeature(logP,tm_pos); // cumulate + } + } + + debug1(" %d different alignments\n", cnt); + nb_diff_align += cnt; +} +/* Return the sum of NbPhrases() over all hypotheses stored in nbest. */ +int NBest::NbPhrases() +{ + int cnt=0; + for (vector::iterator i = nbest.begin(); i != nbest.end(); i++) { + cnt += (*i).NbPhrases(); + } + + return cnt; +} + +//********************************************************** +// +// caching algorithm for TM rescoring with CSTM +// +//********************************************************** + + +// this is identical to Moses ptable rescoring, we just call a different BlockFinish +// (reads one source line from srcf, resets feature tm_pos of every hypothesis and queues one request per alignment) +void NBest::RescorePtable(NbestCSTM &cstm, ifstream &srcf, const int tm_pos) +{ + // get a source line and segment into words + string src; + getline(srcf,src); + if (srcf.eof()) + ErrorN("EOF in source text for n-best hypothesis id=%d", id); + + srcw.clear(); + srcw = Moses::Tokenize(src); + + debug1("NBest::RescorePtable(): CSTM score at position %d\n", tm_pos); + debug2("SRC with %d words: %s\n", (int) srcw.size(), src.c_str()); + + for (vector::iterator hi=nbest.begin(); hi!= nbest.end(); hi++) { + // reset the feature that will be modified in BlockFinish() + // we already append it here if requested + (*hi).SetFeature(0.0, tm_pos); + + hi->trgw = Moses::Tokenize(hi->trg); + int nw=(int) hi->trgw.size(); + debug2("CSTM token target: %s %d words\n", hi->trg.c_str(), nw); + for (vector::iterator ali=(*hi).a.begin(); ali!=(*hi).a.end(); ali++) { + areq[nreq].sb = (*ali).sb; + areq[nreq].se = (*ali).se; + debug5("CSTM process areq %d, src: %d-%d, tgt: %d-%d\n",nreq,(*ali).sb,(*ali).se,(*ali).tb,(*ali).te); + if ((*ali).tb<0 || (*ali).tb>=nw || ((*ali).te<0 || (*ali).te>=nw)) { + fprintf(stderr,"skipping line with targets out of bound in alignment %d-%d=%d-%d\n",(*ali).sb,(*ali).se,(*ali).tb,(*ali).te); + 
continue; + } + for (int w=(*ali).tb; w<=(*ali).te; w++) areq[nreq].tgph.push_back((*hi).trgw[w]); + cstm.LookupTarget(areq[nreq].tgph, areq[nreq].tgwid); // TODO: this is inefficient, the same target will appear many times + areq[nreq].hyp=&(*hi); + if (++nreq >= max_req) BlockFinish(cstm,tm_pos); + } + } + BlockFinish(cstm,tm_pos); +} + +// inverse rescoring is not implemented for the CSTM: this function only calls Error() +void NBest::RescorePtableInv(NbestCSTM &cstm, ifstream &srcf, const int tm_pos) +{ + Error("NBest::RescorePtableInv()"); +} +/* Execute all delayed requests with the CSTM: the requests are sorted, source phrases are added to the forward bunch with AddToInput() and the bunch is evaluated through trainer->ForwAndCollect(). tm_pos==0 selects the append position. NOTE(review): nreq is not reset in the visible part of this function, although RescorePtable() keeps writing areq[nreq] after calling it once max_req is reached - confirm where the request array is reset (see the FreeReq TODO at the end of this function). */ +void NBest::BlockFinish(NbestCSTM &cstm, int tm_pos) +{ + debug2("BlockFinish(): processing %d delayed requests, source: %d words\n", nreq, (int)srcw.size()); + + if (nreq==0) return; + int bsize=cstm.mach->GetBsize(); + + qsort(areq, nreq, sizeof(AlignReq), AlignReqComp); + + if (tm_pos==0) tm_pos=areq[0].hyp->f.size(); // correct position in append mode + debug1("cumulating 1 score starting at position %d\n", tm_pos); + + vector srcph; // one source phrase + vector< vector > src_phrases; // all possible source phrase in this block, size + + // process first phrase pair + areq[0].bs=0; + cstm.AddToInput(0,srcw,areq[0].sb,areq[0].se); + srcph.clear(); + for (int w=areq[0].sb; w<=areq[0].se; w++) srcph.push_back(srcw[w]); + src_phrases.push_back(srcph); + + int cnt=1; + + int req_beg=0; // start of current CSLM block in large request array + int bs=0; // current block index in forward bunch + + for (int n=1; n= bsize) { + cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,n-1,bs,tm_pos); + bs=0; req_beg=n; + } + // add new source phrase to bunch for forward pass + // REMARK: this is not perfect since some of the examples may be out of slist and we actually wouldn't + // need a forward pass for them. However, all requests of an n-best block must be performed before + // we go to the next n-best block. In practice there are often less than 128 different source phrases. 
+ // Therefore, we only do one forward pass anyway + areq[n].bs=bs; + cstm.AddToInput(bs,srcw,areq[n].sb,areq[n].se); + srcph.clear(); + for (int w=areq[n].sb; w<=areq[n].se; w++) srcph.push_back(srcw[w]); + src_phrases.push_back(srcph); + cnt++; + } + else + areq[n].bs=bs; + } + cstm.trainer->ForwAndCollect(src_phrases,areq,req_beg,nreq-1,bs+1,tm_pos); /* final call for the remaining requests req_beg..nreq-1; bs is the index of the last bunch entry in use, hence bs+1 is passed */ + // FreeReq(); TODO + + printf(" %d different source phrases\n", cnt); + nb_diff_align += cnt; +} + diff --git a/NBest.h b/NBest.h new file mode 100644 index 0000000..51c4628 --- /dev/null +++ b/NBest.h @@ -0,0 +1,73 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + */ + +#ifndef _NBEST_H_ +#define _NBEST_H_ + +using namespace std; + +#include +#include +#include +#include + +#include "Toolsgz.h" +#include "Hypo.h" +#include "NbestLM.h" +#include "NbestCSTM.h" +#include "PtableMosesPtree.h" + +#include "AlignReq.h" +/* One n-best list: the hypotheses (nbest) together with a buffer of delayed alignment requests (areq, nreq, max_req) that are rescored in blocks, either with a Moses on-disk phrase table or with a CSTM. */ +class NBest { + int id; + vector srcw; // source sentence parsed into words (only available for TM rescoring) + vector nbest; + bool ParseLine(inputfilestream& inpf, inputfilestream& auxf, const int, const bool, const int); + // Delayed translation model rescoring + int max_req; // max number of request cumulated before we perform them in a block + int nreq; // current number of request cumulated + AlignReq *areq; // array to allocate all requests + int nb_diff_align; // stats + public: + NBest(inputfilestream&, inputfilestream& , const int=0, const bool =false , const int=0); + ~NBest(); + int NbNBest() {return nbest.size(); } + int NbPhrases(); + int NbDiffPhrases() {return nb_diff_align; } + void CalcGlobal(Weights&); + void Sort(); // largest values first + void Write(outputfilestream&, int=0); + void AddID(const int offs); + void RescoreLM(NbestLM&, const int); // recalc LM score on hypothesis (uses optional auxiliary data) + // Delayed translation model rescoring with on disk phrase table + void RescorePtable(PtableMosesPtree&, ifstream&, const int); + void RescorePtableInv(PtableMosesPtree&, ifstream&, const int); + void BlockFinish(PtableMosesPtree&, int); + REAL GetAlignProb(PtableMosesPtree&, AlignReq&, const int, vector* = NULL); + // Delayed translation model rescoring with CSTM + void RescorePtable(NbestCSTM&, ifstream&, const int); + void RescorePtableInv(NbestCSTM&, ifstream&, const int); + void BlockFinish(NbestCSTM&, int); + void ForwAndCollect(int, int, int); +}; + + 
+#endif diff --git a/NbestCSTM.cpp b/NbestCSTM.cpp new file mode 100644 index 0000000..d44dc8b --- /dev/null +++ b/NbestCSTM.cpp @@ -0,0 +1,123 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +using namespace std; + +#include + +#include "Tools.h" +#include "Hypo.h" +#include "NbestCSTM.h" +#include "ErrFctSoftmCrossEntNgramMulti.h" + +/* Delete the machine and the trainer if they have been created. */ +NbestCSTM::~NbestCSTM() { + if (mach) delete mach; + if (trainer) delete trainer; +} + +/* Load a CSTM: read the machine from the binary file fname, read the source and target word lists and create the TrainerPhraseSlist; pt_fname, nscores and scores_specif are passed on to the trainer unchanged. */ +void NbestCSTM::Read(char *fname, char *wl_src_fname, char *wl_tgt_fname, char *pt_fname, int nscores, char *scores_specif) +{ + ifstream ifs; + ifs.open(fname,ios::binary); + CHECK_FILE(ifs,fname); + mach = Mach::Read(ifs); + ifs.close(); + + mach->Info(); + + // create vocabulary from our source word list, this must be exactly the same order than in extract2bin !!! 
+ cout << " - reading source word list from file " << wl_src_fname << flush; + src_wlist.SetSortBehavior(this->stable_sort); + src_wlist.Read(wl_src_fname); + cout << ", got " << src_wlist.GetSize() << " words" << endl; + + // create vocabulary from our target word list, this must be exactly the same order than in extract2bin !!! + cout << " - reading target word list from file " << wl_tgt_fname << flush; + tgt_wlist.SetSortBehavior(this->stable_sort); + tgt_wlist.Read(wl_tgt_fname); + cout << ", got " << tgt_wlist.GetSize() << " words" << endl; + + trainer = new TrainerPhraseSlist(mach, &src_wlist, &tgt_wlist, pt_fname, nscores, scores_specif); +} + +void NbestCSTM::AddToInput(int b, vector &vsrcw, int sb, int se) +{ + int idim=mach->GetIdim(); + if (sb-se+1 > idim) { + ErrorN("NbestCSTM::AddToInput(): source phrase too long (%d) for machine (%d)\n", sb-se+1, idim); + } + + REAL *iptr=trainer->GetBufInput() + b*idim; + int i=0; + + // get index of each source word + debug0("NbestCSTM::AddToInput():"); + REAL unk_wi = (REAL) src_wlist.GetIndex(WordList::WordUnknown); + for (int w=sb; w<=se; w++) { + WordList::WordIndex wi = src_wlist.GetIndex(vsrcw[w].c_str()); + if (wi==WordList::BadIndex) { + fprintf(stderr, "ERROR: source word not found: %s\n", vsrcw[w].c_str()); + *iptr++ = unk_wi; + } + else + *iptr++ = (REAL) wi; + debug2(" %s->%f", vsrcw[w].c_str(), iptr[-1]); + i++; + } + debug0("\n"); + + // fill up input phrase to the dimension of the machine + for (; i &vtrgw, WordID *wid) +{ + int nph=trainer->GetTgtNbPhr(); + int vdim=(int) vtrgw.size(); + + if (vdim>nph) { + ErrorN("NbestCSTM::MapTarget(): phrase (%d) exceeds length of machine (%d)\n",vdim, nph); + } + + int i; + debug0("NbestCSTM::LookupTarget():"); + for (i=0; iGetSlistLen(); + } + else + wid[i] = (WordID) wi; + debug2(" %s->%d", vtrgw[i].c_str(), wid[i]); + } + debug0("\n"); + + // fill up + for (; i &, int, int); + virtual void LookupTarget(vector &v, WordID *); + virtual void Stats() 
{trainer->BlockStats();} + friend class NBest; +}; + +#endif diff --git a/Ptable.h b/Ptable.h new file mode 100644 index 0000000..6b518f7 --- /dev/null +++ b/Ptable.h @@ -0,0 +1,49 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#ifndef _Ptable_h +#define _Ptable_h + +using namespace std; + +#include +#include +#include "Tools.h" // for type REAL +//#include "DataNgramBin.h" // for type WordID + +// interface class to classical phrase tables +// +// + +#define NULL_LN_PROB (1.0) // this value must not be possible as a normal return value of ln Prob + +class Ptable { + private: + public: + Ptable(const int, const int=2, const bool=false) {}; // initialize (empty body, the arguments are ignored here) + virtual ~Ptable() {}; + virtual void Read(const string &) {}; // read from file (empty in this base class) + virtual REAL GetProb(vector&, vector&) {return 0;} // probability lookup for two word sequences; this base class stub always returns 0 + //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} +}; + +#endif diff --git a/PtableMosesPtree.cpp b/PtableMosesPtree.cpp new file mode 100644 index 0000000..40efd99 --- /dev/null +++ b/PtableMosesPtree.cpp @@ -0,0 +1,194 @@ +/* 
+ * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#include "PtableMosesPtree.h" + + +PtableMosesPtree::~PtableMosesPtree () +{ + for (vector::iterator p=ptree.begin(); p!=ptree.end(); p++) + (*p)->FreeMemory(); +} + +// +// read a new phrase table +// +void PtableMosesPtree::Read(const string &fname, const int p_nscores, const char *scores_specif) +{ + if (strlen(scores_specif)<2 || scores_specif[1]!=':') + Error("format error in the specification of the TM scores"); + if (scores_specif[0]<'1' || scores_specif[0]>'4') + Error("wrong value for the number of TM scores"); + + if (ptree.size()==0) + nscores=scores_specif[0]-'0'; + else { + if (nscores!=scores_specif[0]-'0') + Error("PtableMosesPtree::Read(): inconsistent number of scores to be returned from multiple phrase tables"); + } + if (nscores > p_nscores) + Error("PtableMosesPtree::Read(): the number of scores to be returned exceeds the number of available ones"); + + ptree.push_back(new Moses::PhraseDictionaryTree); + pos_scores.push_back(scores_specif[2]-'0'); + + ptree.back()->NeedAlignmentInfo(false); + cout << " - loading 
Moses binary phrase table from file " << fname << " with " << p_nscores << " scores" << endl; + ptree.back()->Read(fname); + cout << " using " << nscores << " scores starting at position " << pos_scores.back() << endl; + tgtcands.clear(); +}; + + +// +// Get probabilities from the phrase-tables +// - scores=NULL: return either one value as a function result +// - scores!=NULL: return a sequence of values in that vector (as many as the vector has space) +// - pair found in no phrase table: PROBA_COPY_UNK for a single source word copied to the target, otherwise PROBA_NOT_IN_PTABLE + +REAL PtableMosesPtree::GetProb(vector &src, vector &tgt, vector *scores) +{ + uint w; + +#ifdef DEBUG + cout << "Ptable prob:"; + for (w=0; wsize() == 0) + Error("PtableMosesPtree::GetProb() parameter scores has zero dimension"); + + if (scores && (int) scores->size() > nscores) + Error("PtableMosesPtree::GetProb() requesting too much scores form the phrase table"); + + + for (uint p=0; pGetTargetCandidates(src, tgtcands); + debug2(" - phrase table %u has %d candidates:\n", p, (int) tgtcands.size()); + size_t pos=pos_scores[p]; + + // search for our target phrase + for (uint tph=0; tph %d\n",w, tgt[w].c_str(), tgtcands[tph].tokens[w]->c_str(), match); + } + if (match) { + debug5(" found phrase of length %u/%u at pos %d out of %d, p=%f\n", (uint) src.size(), (uint) tgt.size(), tph, (int) tgtcands.size(), tgtcands[tph].scores[pos]); + if (scores) { + for (uint s=0; ssize(); s++) { + (*scores)[s]=tgtcands[tph].scores[pos+s]; // return sequence of scores + debug2(" score[%u]: %f\n",s, (*scores)[s]); + } + } + return tgtcands[tph].scores[pos]; + } + } + + } + + // phrase pair wasn't found in any phrase table + // do we have an unknown word which was copied to the target ? 
+ if (src.size()==1 && tgt.size()==1 && src[0]==tgt[0]) { + debug0(" UNK: source copied to target\n"); + if (scores) { + for (uint s=0; ssize(); s++) (*scores)[s]=PROBA_COPY_UNK; // return sequence of scores + } + return PROBA_COPY_UNK; + } + +#ifdef DEBUG + cout << "ERROR: can't find the following phrase pair in the external phrase tables: SETTING PROBA TO " << PROBA_NOT_IN_PTABLE << endl; + for (w=0; wsize(); s++) (*scores)[s]=PROBA_NOT_IN_PTABLE; // return sequence of scores + } + return PROBA_NOT_IN_PTABLE; +} + +/* +void PtableMosesPtree::BlockEval (Hypo &hyp, vector &srcw, const int pos) +{ +} +*/ +/* Rescore one hypothesis with the phrase table(s): hyp.trg is tokenized and, for every alignment in hyp.a, the source phrase (from srcw) and the target phrase are built and looked up with GetProb(). With a single score the log probabilities are summed and stored with hyp.SetFeature(logP,pos). The several-scores branch uses the vectors res and logP, presumably in the same way per score - TODO confirm. Error() is called when the last source or target word of an alignment is out of bounds. */ +void PtableMosesPtree::RescoreHyp (Hypo &hyp, vector &srcw, const int pos) +{ + debug1("TGT: %s\n", hyp.trg.c_str()); + vector trgw = Moses::Tokenize(hyp.trg); + + int nws=srcw.size(), nwt=trgw.size(); + debug3("Ptable rescoring with %d source and %d target words, %d phrases\n", nws, nwt, (int) hyp.a.size()); + vector srcph, trgph; // needed to build up current phrase pair + + + if (nscores>1) { + vector res(nscores,0.0); // we request more than one score form the phrase table + vector logP(nscores,0.0); // we request more than one score form the phrase table + + for (vector::iterator al=hyp.a.begin(); al!=hyp.a.end(); al++) { + if ((*al).se>=nws) Error("phrase table rescoring: last source word in phrase out of bounds\n"); + if ((*al).te>=nwt) Error("phrase table rescoring: last target word in phrase out of bounds\n"); + + debug4("ALIGN %d-%d = %d-%d\n", (*al).sb, (*al).se, (*al).tb, (*al).te); + srcph.clear(); + for (int w=(*al).sb; w<=(*al).se; w++) srcph.push_back(srcw[w]); + trgph.clear(); + for (int w=(*al).tb; w<=(*al).te; w++) trgph.push_back(trgw[w]); + + GetProb(srcph,trgph,&res); // TODO: this is very inefficient, we should group together request for the same source phrase + for (int i=0; i::iterator al=hyp.a.begin(); al!=hyp.a.end(); al++) { + if ((*al).se>=nws) Error("phrase table rescoring: last source word in phrase out of bounds\n"); + if 
((*al).te>=nwt) Error("phrase table rescoring: last target word in phrase out of bounds\n"); + + debug4("ALIGN %d-%d = %d-%d\n", (*al).sb, (*al).se, (*al).tb, (*al).te); + srcph.clear(); + for (int w=(*al).sb; w<=(*al).se; w++) srcph.push_back(srcw[w]); + trgph.clear(); + for (int w=(*al).tb; w<=(*al).te; w++) trgph.push_back(trgw[w]); + + logP+=log(GetProb(srcph,trgph)); // TODO: this is very inefficient, we should group together requests for the same source phrase + } + hyp.SetFeature(logP,pos); /* store the summed log probability as feature pos of the hypothesis */ + } +} diff --git a/PtableMosesPtree.h b/PtableMosesPtree.h new file mode 100644 index 0000000..53f0632 --- /dev/null +++ b/PtableMosesPtree.h @@ -0,0 +1,77 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#ifndef _PtableMosesPtree_h +#define _PtableMosesPtree_h + +using namespace std; + +#include "Ptable.h" +#include "Hypo.h" + +#include +#include +#include +#include + +// from Moses: +#include +#include + + +// interface class to Moses binary on-disk phrase tables +// (implementation with a prefix tree) + +const REAL PROBA_COPY_UNK (1); // translation probability when an unknown word is copied from source to target +const REAL PROBA_NOT_IN_PTABLE (1e-20); // translation probability when a phrase pair is not found in the Moses phrase table + // this can happen when some words are mapped to because of limited source or target vocabularies + +// +// helper class to store and compare Phrase requests +// ugly C-style structure, but this seems to be more efficient + +/* +struct PhraseReq { + Align a; + vector &trgw; + int cnt; + REAL *res_ptr; +}; +*/ + +class PtableMosesPtree { + private: + vector ptree; // main and eventually secondary phrase tables + vector pos_scores; // starting position of the scores to be returned from each phrase table + int nscores; // number of scores to be returned (must be same for all phrase-tables) + vector tgtcands; // buffer for the target candidates filled by GetTargetCandidates() in GetProb() + public: + PtableMosesPtree() {}; + virtual ~PtableMosesPtree(); + virtual void Read(const string &, const int, const char*); // read next phrase table from file + virtual REAL GetProb(vector&, vector&, vector * =NULL); // return one proba for a tokenized phrase-pair or vector of scores + //virtual REAL GetProbWid(REAL *src, WordID *tgt) {return 0;} + virtual void RescoreHyp (Hypo&, vector &, const int); + virtual int GetNscores() {return nscores; } +}; + +#endif diff --git a/TrainerPhraseSlist.cpp b/TrainerPhraseSlist.cpp new file mode 100644 index 0000000..88010f9 --- /dev/null +++ 
b/TrainerPhraseSlist.cpp @@ -0,0 +1,1164 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + */ + +using namespace std; +#include +#include +#include +#include + +#include "Tools.h" +#include "Mach.h" +#include "MachTab.h" +#include "MachPar.h" +#include "MachSeq.h" +#include "MachSplit.h" +#include "TrainerPhraseSlist.h" +#include "ErrFctSoftmCrossEntNgram.h" + +#include "NBest.h" +#include "sort.cpp" + +// activate mapping of input +// not really necessary, may only speed up calculations due to cache locality +// if you activate this option, you must do so for all your networks +#undef TRAINER_PHASE_SLIST_MAP_INPUT +/* Initialisation shared by the three constructors: caches idim/odim/bsize of the machine, allocates the target buffers, checks that the machine has the expected architecture, creates one error function per output machine and sets up the source word list, the target word list and the target short list. */ +void TrainerPhraseSlist::DoConstructorWork() +{ + idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize(); + +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer"); + host_output = new REAL[odim*bsize]; +#endif + buf_target_wid = new WordID[odim*bsize]; // TODO: those are actually too big, we need tg_nbphr*bsize ?? 
+ buf_target_ext = new WordID[odim*bsize]; + buf_target_in_blocks = new REAL[odim*bsize]; + + // set up vector to outputs of the target phrases + if (mach->GetMType() != file_header_mtype_mseq) + Error("CSTM: sequential machine needed\n"); + MachSeq *mseq=(MachSeq*) mach; + if (mseq->MachGetNb()<2) + Error("CSTM: the number of machines is suspiciously small"); + + // check input layer + if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar) + Error("CSTM: the input layer has the wrong architecture\n"); + MachPar *mpar = (MachPar*) mseq->MachGet(0); + if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab) + Error("CSTM: the input layer has the wrong architecture\n"); + MachTab *mtab = (MachTab*) mpar->MachGet(0); + max_inp_idx = mtab->GetMaxInpVal(); + + // check output layer + if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit) + Error("CSTM: the output layer has the wrong architecture\n"); + MachSplit *msp = (MachSplit*) mseq->MachGet(mseq->MachGetNb()-1); + tg_nbphr=msp->MachGetNb(); + if (data_train && (data_train->GetOdim() != tg_nbphr)) { + ErrorN("CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim()); + } + + cout << " - using cross entropy for each output vector" << endl; + phrase_mach.clear(); + mach_errfct.clear(); + for (int m=0; mMachGet(m)); + if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim()) + Error("CSTM: the output layer dimension must be identical for all phrases\n"); + //ErrFctSoftmCrossEntNgram *tmp=dynamic_cast(errfct); + //mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*tmp)); // create copy of user specified error function + mach_errfct.push_back(new ErrFctSoftmCrossEntNgram(*phrase_mach[m])); // each machine gets its own error function with local mem for grad +#ifdef BLAS_CUDA + Gpu::SetConfig(mach_errfct[m]->GetGpuConfig()); + gpu_target.push_back(Gpu::Alloc(bsize*sizeof(REAL), "targets in Trainer")); /* NOTE(review): gpu_input is allocated earlier in this function with an element count (idim*bsize) while this call passes bsize*sizeof(REAL) - confirm which unit Gpu::Alloc() expects */ +#endif + } + dim_per_phrase = 
phrase_mach[0]->GetOdim(); + cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl; + tg_slist_len = dim_per_phrase-1; /* short list length is one less than the per-phrase output dimension; presumably the remaining output is reserved for words outside the short list - TODO confirm */ + + + // get source word list: keep the one given to the constructor, otherwise take the first list of the validation data or, failing that, of the training data + if (sr_wlist == NULL) { + vector *vect_wlist = NULL; + if (data_dev != NULL) + vect_wlist = data_dev->GetSrcWList(); + else if (data_train != NULL) + vect_wlist = data_train->GetSrcWList(); + if ((vect_wlist != NULL) && !vect_wlist->empty()) + sr_wlist = &(vect_wlist->front()); + } + if (sr_wlist == NULL) + Error("no source word list available"); + if ((int) sr_wlist->GetSize() > max_inp_idx) + ErrorN("the size of the source word list (%d) exceeds the number of input words the machine was trained for (%d)",(int) sr_wlist->GetSize(),max_inp_idx); + debug1("* using source word list with %d words\n",(int)sr_wlist->GetSize()); + + // get target word list (same selection order as for the source word list) + if (tg_wlist == NULL) { + vector *vect_wlist = NULL; + if (data_dev != NULL) + vect_wlist = data_dev->GetTgtWList(); + else if (data_train != NULL) + vect_wlist = data_train->GetTgtWList(); + if ((vect_wlist != NULL) && !vect_wlist->empty()) + tg_wlist = &(vect_wlist->front()); + } + if (tg_wlist == NULL) + Error("no target word list available"); + if (!tg_wlist->FrequSort()) + Error("the target word list doesn't contain word counts"); + if (tg_wlist->GetSize() <= tg_slist_len) + Error("TrainerPhraseSlist: the output layer is larger than the target word list"); + debug1("* using target word list with %d words\n",(int)tg_wlist->GetSize()); + + ulong sum_sl=0, sum=0; + tg_wlist->SetShortListLength(tg_slist_len); + tg_wlist->CountWords(sum_sl, sum); + printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, 100.0*sum_sl/sum); + +#ifdef DEBUG2 + cout << "Words in slist:" << endl; + WordID ci=tg_slist_len; + WordList::const_iterator iter, end = tg_wlist->End(); + for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--) + printf (" %s 
cnt=%d idx=%d\n", iter->word, iter->n, iter->id); +#endif + +#ifdef DEBUG2 + cout << "Words not in slist:" << endl; + for (; iter!=end; iter++) + printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id); +#endif + +#ifdef DEBUG2 + // just needed for debugging + // NOTE(review): reserve() does not change size(), yet the elements are assigned through operator[] on the next line - confirm whether resize() was intended + words.reserve(tg_wlist->GetSize()); + for (iter=tg_wlist->Begin(); iter!=end; iter++) words[iter->id] = strdup(iter->word); +#endif + + debug0(" + done init TrainerPhraseSlist\n"); +} + +// +// constructor for training +// loads the training data and the optional validation data (output dimension must be 1..32), then calls DoConstructorWork() + +TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, Lrate *lrate, ErrFct *perrfct, + const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores, + REAL p_wd, int p_maxep, int p_ep) + : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(NULL), tg_wlist(NULL), + ptable(NULL), + nb_ex_slist(0), nb_ex_short_tgt(0), + nb_forw(0) +{ + debug2("*** Constructor TrainerPhraseSlist for training idim=%d, odim=%d ***\n",idim,odim); + cout << "Setting up CSTM training with short list" << endl; + + if (train_fname) { + data_train = new Data(train_fname); + if (idim != data_train->GetIdim()) { + ErrorN("TrainerPhraseSlist: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim); + } + if (data_train->GetOdim()<1 || data_train->GetOdim()>32) { + ErrorN("TrainerPhraseSlist: output dimension of the training data should be 1..32, found %d\n", data_train->GetOdim()); + } + auxdim = data_train->GetAuxdim(); + } + else + data_train=NULL; + + if (dev_fname) { + data_dev = new Data(dev_fname); + data_dev_alloc=true; + if (idim != data_dev->GetIdim()) { + ErrorN("TrainerPhraseSlist: input dimension of the validation data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim); + } + if (data_dev->GetOdim()<1 || data_dev->GetOdim()>32) { + ErrorN("TrainerPhraseSlist: output dimension of the validation data should be 1..32, found %d\n", 
data_dev->GetOdim()); + } + int auxdim_dev = data_dev->GetAuxdim(); + if (0 >= auxdim) + auxdim = auxdim_dev; + else if (auxdim != auxdim_dev) + ErrorN("TrainerPhraseSlist: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev); + } + else { + data_dev=NULL; + data_dev_alloc=false; + } + iaux = (idim - auxdim); + + DoConstructorWork(); + + if (data_dev) { + if (pt_fname) { + ptable = new(PtableMosesPtree); + ptable->Read(pt_fname,5,"1:2"); /* phrase table with 5 scores; the specification selects one score starting at position 2 */ + } + else + cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl; + } +} + +// +// constructor for testing +// (data is not freed by this class; p_nscores is not used, the phrase table is read with a fixed score layout) + +TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, ErrFct *perrfct, + Data *data, char *pt_fname, int p_nscores) + : Trainer(pmach,NULL,perrfct,NULL,NULL), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(NULL), tg_wlist(NULL), + ptable(NULL), + nb_ex_slist(0), nb_ex_short_tgt(0), + nb_forw(0) +{ + debug0("*** Constructor TrainerPhraseSlist for testing ***\n"); + cout << "Setting up testing with short list" << endl; + + data_train=NULL; + data_dev=data; + data_dev_alloc=false; // do not free it by this class ! 
+ + if (idim != data_dev->GetIdim()) { + ErrorN("TrainerPhraseSlist: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim); + } + auxdim = data_dev->GetAuxdim(); + iaux = (idim - auxdim); + + DoConstructorWork(); + + if (pt_fname) { + ptable = new(PtableMosesPtree); +#ifdef BACKWARD_TM + ptable->Read(pt_fname,5,"1:0"); // backward TM prob +#else + ptable->Read(pt_fname,5,"1:2"); // forward TM prob +#endif + } + else + cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl; +} + +// +// constructor for nbest rescoring +// (no training or validation data; both word lists are supplied by the caller and all counters start at zero as in the other constructors) + +TrainerPhraseSlist::TrainerPhraseSlist (Mach *pmach, + WordList *p_sr_wlist, WordList *p_tg_wlist, + char *pt_fname, int nscores, char *scores_specif) + : Trainer(pmach,NULL,NULL,NULL,NULL), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist), + ptable(NULL), + nb_ex_slist(0), nb_ex_short_tgt(0), nb_forw(0) +{ + debug0("*** Constructor TrainerPhraseSlist for block operations ***\n"); + cout << "Setting up CSTM with short list" << endl; + // TODO: init with TrainerNgram before + data_train=NULL; + data_dev=NULL; + DoConstructorWork(); + + if (pt_fname) { + ptable = new(PtableMosesPtree); + ptable->Read(pt_fname, nscores, scores_specif); + } + else + cout << " - no external phrase table provided (unhandled phrase pairs receive 0 logproba)" << endl; +} + +//************************************************************************************** + +TrainerPhraseSlist::~TrainerPhraseSlist () +{ + debug0("*** Destructor TrainerPhraseSlist ***\n"); + + if (buf_target_wid) delete [] buf_target_wid; + if (buf_target_ext) delete [] buf_target_ext; + if (buf_target_in_blocks) delete [] buf_target_in_blocks; + // buf_input and buf_target will be deleted by ~Trainer() + +#ifdef BLAS_CUDA + // free local gpu_target buffer on each GPU + for (vector::iterator it=gpu_target.begin(); it!=gpu_target.end(); ++it) + if (*it) cudaFree(*it); + 
gpu_target.clear(); +#endif + + phrase_mach.clear(); + mach_errfct.clear(); + +#ifdef DEBUG2 + vector::const_iterator iter, end = words.end(); + for (iter=words.begin(); iter!=end; iter++) delete *iter; + words.clear(); +#endif +} + + +//************************************************************************************** +// +// We have MachSplit() at the ouput +// this means that each machine has its own error function with its own gradient +// these error functions point to the outputs in the individual machines +// and the gradients stored in this Trainer + +REAL TrainerPhraseSlist::Train() +{ + if (!data_train) return -1; +#ifdef DEBUG + printf("*****************\n"); + printf("TrainerPhraseSlist::Train():\n"); + printf(" - idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr); + printf(" - data_in: %p \n", (void*) buf_input); + printf(" - target: %p \n", (void*) buf_target); + printf(" - target_in_blocks: %p \n", (void*) buf_target_in_blocks); + printf(" - tgt WID: %p \n", (void*) buf_target_wid); +#endif + + Timer ttrain; // total training time + //Timer tload; // total time to select examples + //Timer ttransfer; // total transfer time of data to GPU + //Timer tforw; // total forw time + //Timer tgrad; // total time fr gradient + //Timer tbackw; // total backw time + + ttrain.start(); + data_train->Rewind(); + + REAL log_sum=0; + int i; + nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0; + nb_tg_words=nb_tg_words_slist=0; + + + // set input +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input + debug1(" - gpu_input %p\n", gpu_input); +#else + mach->SetDataIn(buf_input); + debug1(" - buf_input %p\n", buf_input); +#endif + + // connect the error functions for each individual machine + // buf_target does sequentially contain all the targets for block0, than block1 and so on + // buf_target_in_blocks + // targets are arranged by blocks of bsize, i.e. 
first bsize targets for 1st machine, than 2nd and so on + // by these means we don't need to copy or re-arrange data later in the GPU +#ifdef BLAS_CUDA + REAL *tptr; +#else + REAL *tptr=buf_target_in_blocks; +#endif + debug0("Error functions of the individual machines:\n"); + for (i=0; iSetOutput(phrase_mach[i]->GetDataOut()); +#ifdef BLAS_CUDA + tptr=gpu_target[i]; // we copy later from buf_target_in_blocks to gpu_target +#endif + mach_errfct[i]->SetTarget(tptr); + phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad()); + debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad()); +#ifndef BLAS_CUDA + tptr += bsize; // each example provides 1 target for each output machine (the word ID) +#endif + } + + eos_src = eos_tgt = NULL_WORD; + if (sr_wlist->HasEOS()) { + eos_src=sr_wlist->GetEOSIndex(); + printf(" - using a special token for short source sequences (%d)\n", eos_src); + } + if (tg_wlist->HasEOS()) { + eos_tgt=tg_wlist->GetEOSIndex(); + printf(" - using a special token for short target sequences (%d)\n", eos_tgt); + } + + // master loop on all training data + bool data_available; + do { + //tload.start(); + + // get a bunch of data and map all the words + int n=0; + data_available = true; + while (n < mach->GetBsize() && data_available) { + data_available = data_train->Next(); + if (!data_available) break; + debug0("TRAIN DATA: input: "); + bool at_least_one_short=false; + for (i=0; iinput[i]; + debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp); +#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so + if (inp == NULL_WORD) + at_least_one_short=true; + else { + buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::Train(): input"); // map context words IDs + if (inp==eos_src) at_least_one_short=true; + } +#else + buf_input[n*idim + i] = inp; + if (inp == NULL_WORD || inp==eos_src) + at_least_one_short=true; + else if (inp<0 || 
inp>=(int)sr_wlist->GetSize()) + ErrorN("TrainerPhraseSlist::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize()); +#endif + } + for (; i < idim ; i++) // copy auxiliary data + buf_input[n * idim + i] = data_train->input[i]; + if (at_least_one_short) nb_ex_short_inp++; + + debug0("\n - > mapped output: "); + + bool all_in_slist=true; // ALL to be predicted words are in short list + at_least_one_short=false; + int nbtgsl=0; + for (i=0; itarget[i]; + int idx=i+n*tg_nbphr; + buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::Train(): output"); // TODO: not really needed during training, just the current value + if (outp==NULL_WORD + || (at_least_one_short && outp==eos_tgt)) // we only predict the FIRST EOS, the other ones are set to NULL_WORD + { // NULL_WORDS are mapped, they will be detected in gradient calculation + buf_target[idx] = (REAL) NULL_WORD; + at_least_one_short=true; + debug1(" -[%d->NULL]",(int) buf_target[idx]); + } + else { + // map normal word or EOS + nb_tg_words++; // also count EOS since we need to predict them at the output + if (outp==eos_tgt) at_least_one_short=true; + if (tg_wlist->InShortList(buf_target_wid[idx])) { + buf_target[idx] = (REAL) buf_target_wid[idx]; + debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]); + nbtgsl++; + } + else { + buf_target[idx] = (REAL) tg_slist_len; // words that are not in slist are ALL done by the last output neuron + debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]); + all_in_slist=false; + } + } + } + if (all_in_slist) { + nb_ex_slist++; + nb_tg_words_slist += nbtgsl; + } + if (at_least_one_short) nb_ex_short_tgt++; + debug1(" all_slist=%d\n",all_in_slist); + + n++; + } // loop to get a bunch of examples + debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n)); + //tload.stop(); + +#ifdef DEBUG2 + printf("network 
data:\n"); + REAL *iptr=buf_input; + for (int nn=0;nn "); + for (i=0;i factor + if (n>0) { + // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine + + debug0("re-arrange targets\n"); + for (i=0; iForw(n,true); + //tforw.stop(); + + //tgrad.start(); + debug0("call Error functions of the individual machines:\n"); + for (i=0; iGetGpuConfig()))); +#endif + // the returned log_sum is cumulated over a full batch for one specific output word + log_sum += mach_errfct[i]->CalcGradNull(n); + } + //tgrad.stop(); + + debug1(" log_sum=%e\n",log_sum); +#ifdef DEBUG2 + int t=(int) data_train->target[0]; +# ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2"); + cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1); + printf("OUTPUT:"); + for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n"); + cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1); + printf("TARGET:"); + for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n"); + //TODO check if we need odim or idim! 
+ // TODO: cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1); + printf(" GRAD:"); + for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n"); + cublasFree(tmp); +# else + printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n"); + printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n"); + printf(" GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n"); +# endif //BLAS_CUDA +#endif //DEBUG2 + + lrate->UpdateLrateOnForw(mach->GetNbForw()); + //tbackw.start(); + mach->Backw(lrate->GetLrate(), wdecay, n); + //tbackw.stop(); + } + + nb_ex += n; + } while (data_available); +#ifdef BLAS_CUDA + Gpu::StreamSynchronize(); +#endif + + ttrain.stop(); + ttrain.disp(" - training time: "); + //tload.disp(" including load: "); + //ttransfer.disp(" transfer: "); + //tforw.disp(" forw: "); + //tgrad.disp(" grad: "); + //tbackw.disp(" backw: "); + printf("\n"); + + printf(" - CSTM log_sum=%.2f%s, target words=%d, in shortlist=%d, nb_tg_words_slist=%d\n", + log_sum, tg_wlist->HasEOS() ? 
" including EOS" : "", nb_tg_words, nb_ex_slist, nb_tg_words_slist); + if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words); // when normalizing consider that all examples lead to a forward pass + + return -1; +} + +//************************************************************************************** +// + +void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni) +{ + int Nbest=100; + + // get input length + int input_length; + for (input_length=0;input_length > > prepared_scores + = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, Nbest); + std::vector > > best + = sort_ngrams(prepared_scores, input_length, Nbest); + + for(std::size_t i = 0; i < best.size(); ++i) { + // source + for (int j=0; jGetWordInfo(buf_input[ni*idim+j]).word << " "; + } + + // target + fspt << "|||"; + for(std::size_t j = 0; j < best[i].second.size(); ++j) { + fspt << " " << tg_wlist->GetWordInfoMapped(best[i].second[j]).word; + } + + // score + fspt << " ||| " << exp(best[i].first); + fspt << "\n"; + } + +} + +//************************************************************************************** +// +#if 0 +void TrainerPhraseSlist::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni) +{ + int i; + // Find most likely outputs + for (i=0;iGetWordInfo(buf_input[ni*idim+i]).word << " "; + } + fspt << "||| "; + + for (i=0; imax) { max=*sptr; max_idx=s; } + } + fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] "; + } + fspt << endl; +} +#endif + +//************************************************************************************** +// + +REAL TrainerPhraseSlist::TestDev(char *fname) +{ + if (!data_dev) return -1; + + vector src_phrase; // interface with classical phrase tables + vector tgt_phrase; + vector done_by_cstm; + + ofstream fs; + if (fname) { + cout << " - dumping phrase probability stream to file '" << fname << "'" << endl; + fs.open(fname,ios::out); + CHECK_FILE(fs,fname); + } + +#undef DUMP_PHRASE_TABLE 
+#ifdef DUMP_PHRASE_TABLE + char *ptfname = (char*) "alltrans.txt"; + ofstream fspt; + fspt.open(ptfname,ios::out); + CHECK_FILE(fspt,ptfname); + cout << " - dumping new phrase table to file '" << ptfname << "'" << endl; +#endif + + nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short_tgt=0; + nb_tg_words=nb_tg_words_slist=0; + int nb_not_in_ptable=0; // this counts the number of phrase pairs which were not found in the external phrase table + int nb_src_words=0; + REAL log_sum=0; + REAL log_sum_notunk=0; // all known phrase pairs, either CSTM or ptable (count=nb+_ex - nb_not_in_ptable) + REAL log_sum_cstm=0; // only CSLM, i.e. considering phrases done by CSTM + REAL log_sum_cstm_short=0; // like CSTM, limited to short n-grams, i.e. we do not count the prediction of (multiple) EOS + + uint idx; + + // set input +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input + debug1(" - gpu_input %p\n", gpu_input); +#else + mach->SetDataIn(buf_input); + debug1(" - buf_input %p\n", buf_input); +#endif + + // connect the error functions for each individual machine + // buf_target does sequentially contain all the targets for block0, than block1 and so on + // buf_target_in_blocks + // targets are arranged by blocks of bsize, i.e. 
first bsize targets for 1st machine, than 2nd and so on + // by these means we don't need to copy or re-arange data later in the GPU +#ifdef BLAS_CUDA + REAL *tptr; +#else + REAL *tptr=buf_target_in_blocks; +#endif + debug0("Error functions of the individual machines:\n"); + for (int i=0; iSetOutput(phrase_mach[i]->GetDataOut()); +#ifdef BLAS_CUDA + tptr=gpu_target[i]; // we copy later from buf_target_in_blocks to gpu_target +#endif + mach_errfct[i]->SetTarget(tptr); + phrase_mach[i]->SetGradOut(mach_errfct[i]->GetGrad()); + debug5(" %d: fct=%p, output=%p, target=%p, grad=%p\n",i,(void*)mach_errfct[i],(void*)phrase_mach[i]->GetDataOut(),(void*)tptr,(void*)mach_errfct[i]->GetGrad()); +#ifndef BLAS_CUDA + tptr += bsize; // each example provides 1 target for each output machine (the word ID) +#endif + } + + // how do we handle short sequences ? + eos_src = eos_tgt = NULL_WORD; + if (sr_wlist->HasEOS()) { + eos_src=sr_wlist->GetEOSIndex(); + printf(" - using a special token for short source sequences (%d)\n", eos_src); + } + if (tg_wlist->HasEOS()) { + eos_tgt=tg_wlist->GetEOSIndex(); + printf(" - using a special token for short target sequences (%d)\n", eos_tgt); + } + + bool data_available; + data_dev->Rewind(); + do { + // get a bunch of data + int n=0, i; + data_available = true; + debug0("start bunch\n"); + done_by_cstm.clear(); + while (n < mach->GetBsize() && data_available) { + data_available = data_dev->Next(); + if (!data_available) break; + + debug0("DEV DATA: input: "); + bool at_least_one_short=false; + for (i=0; iinput[i]; + idx=n*idim + i; + debug2(" %s[%d]", tg_wlist->GetWordInfo(inp).word,inp); +#if TRAINER_PHASE_SLIST_MAP_INPUT // default is not to do so + if (inp == NULL_WORD) + at_least_one_short=true; + else { + buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist::TesDev(): input"); // map context words IDs + nb_src_words++; + if (inp==eos_src) at_least_one_short=true; + } +#else + buf_input[idx] = inp; + if (inp == NULL_WORD || 
inp==eos_src) + at_least_one_short=true; + else { + if (inp<0 || inp>=(int)sr_wlist->GetSize()) + ErrorN("TrainerPhraseSlist::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize()); + nb_src_words++; + } +#endif + } + for (; i < idim ; i++) // copy auxiliary data + buf_input[n * idim + i] = data_dev->input[i]; + if (at_least_one_short) nb_ex_short_inp++; + + debug0("\n - > mapped output: "); + + bool all_in_slist=true; // ALL to be predicted words are in short list + int nbtgsl=0; + at_least_one_short=false; + for (i=0; itarget[i]; + idx=i+n*tg_nbphr; + buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist::TestDev(): output"); + buf_target_ext[idx] = outp; // keep unmapped target word ID for Moses phrase-table + if (outp==NULL_WORD + || (at_least_one_short && outp==eos_tgt)) // we only predict the FIRST EOS, the other ones are set to NULL_WORD + { // NULL_WORDS are mapped, they will be detected in gradient calculation + buf_target_wid[idx] = NULL_WORD; + buf_target[idx] = (REAL) NULL_WORD; + at_least_one_short=true; + debug1(" -[%d->NULL]",(int) buf_target_wid[idx]); + } + else { + // map normal word or EOS + nb_tg_words++; // also count EOS since we need to predict them at the output + if (outp==eos_tgt) at_least_one_short=true; + if (tg_wlist->InShortList(buf_target_wid[idx])) { + buf_target[idx] = (REAL) buf_target_wid[idx]; + debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]); + nbtgsl++; + } + else { + // TODO: we actually don't need a forward pass for words in the short lists or short n-grams + // this could be used to save some time (5-10%) + buf_target_wid[idx] = tg_slist_len; + buf_target[idx] = (REAL) tg_slist_len; // words that are not in slist are ALL done by the last output neuron + debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,outp,(int) buf_target_wid[idx]); + all_in_slist=false; + } + } + } + done_by_cstm.push_back(all_in_slist); + if (all_in_slist) { + 
nb_ex_slist++; + nb_tg_words_slist += nbtgsl; + } + if (!at_least_one_short) nb_ex_short_tgt++; + debug1(" all_slist=%d\n",all_in_slist); + + n++; + } // loop to get a bunch ef examples + debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n)); + +#ifdef DEBUG2 +printf("network data:\n"); +REAL *iptr=buf_input; +REAL *tptr=buf_target; +for (int nn=0;nn "); + for (i=0;i0) { + // copy targets from buf_target to buf_target_in_blocks by re-arranging them into blocks per machine + + debug0("re-arrange targets\n"); + for (i=0; iForw(n,false); + for (i=0; iCalcValueNull(n); + log_sum += mach_errfct[i]->CalcGradNull(n); // TODO: should use CalcValueNull() + } + } + +#if DIRECT_PROBA_CALCULATION + // get probas from CSLM or back-off LM +#ifdef BLAS_CUDA + // host output is of dim bsize*odim - bsize*tg_nphr*dim_per_phrase + // it contains the whole bunch of the 1st output, then whole bunch of 2nd output, etc + for (int i=0; iGetDataOut(), n*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost); + // TODO: we actually copy too much data, for each output vector we only need one value ! 
+ } + Gpu::StreamSynchronize(); +#endif + + debug1("Collect n=%d\n", n); + if (n!=(int) done_by_cstm.size()) + Error("TrainerPhraseSlist::TestDev(): internal error, number of phrases done by CSTM does not match"); + + REAL *ptr_input = buf_input; // n times idim values + for (int ni=0; niGetDataOut() + ni*dim_per_phrase; +#endif + logP += safelog(optr[cur_tg]); // no error check on indices necessary here + if (buf_target_ext[i+ni*tg_nbphr] != eos_tgt) { // exclude the (easy) prediction of EOS from stats + logP_short += safelog(optr[cur_tg]); // no error check on indices necessary here + } + debug5("n=%3d, pos=%d, tg_w=%d (unmapped %d), P=%f\n",ni,i,cur_tg,buf_target_ext[i+ni*tg_nbphr],optr[cur_tg]); + } + debug4(" - -> logP=%f/%d, logP_short=%f/%d\n",logP,logP_short); + +#ifdef DUMP_PHRASE_TABLE + // create output phrase table + for (i=0;iGetWordInfo(buf_input[ni*idim+i]).word << " "; + } + fspt << "||| "; + for (i=0;iGetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " "; + } + fspt << "||| " << logP << endl; +#endif + +#ifdef DUMP_PHRASE_TABLE_NBEST + Error("GetMostLikelyTranslations() change to work with multiple output vectors"); + GetMostLikelyTranslations(fspt,optr,ni); +#endif + + debug1(" CSLM: logP=%e\n", logP); + log_sum_cstm += logP; + log_sum_cstm_short += logP_short; + log_sum_notunk += logP; + log_sum += logP; + } + else { +Error("not done by CSTM"); + + if (ptable) { + // request proba from Moses phrase-table + debug0("create textual phrase pair for external phrase table (word + index)\n"); + src_phrase.clear(); + debug0(" source:"); + for (i=0; iGetWordInfo((uint) ptr_input[i]).word); // TODO: char* to string + debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]); +#ifdef DUMP_PHRASE_TABLE + fspt << src_phrase.back() << " "; +#endif + } + +#ifdef DUMP_PHRASE_TABLE + fspt << "|P| "; +#endif + tgt_phrase.clear(); + debug0(" target:"); + for (i=0; iGetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word); // TODO: char* to string + 
debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]); +#ifdef DUMP_PHRASE_TABLE + fspt << tgt_phrase.back() << " "; +#endif + } +# ifdef BACKWARD_TM + logP = ptable->GetProb(tgt_phrase, src_phrase); +# else + logP = ptable->GetProb(src_phrase, tgt_phrase); +# endif + if (logP == PROBA_NOT_IN_PTABLE) nb_not_in_ptable++; + else log_sum_notunk += logP; + logP = safelog(logP); // take log now + debug1(" => logP=%e\n",logP); + log_sum += logP; + } + else { // no ptable was specified + logP=0; // flag output that it wasn't done by CSTM + } +#ifdef DUMP_PHRASE_TABLE + fspt << "||| " << logP << endl; +#endif + } // not done by CSTM + + ptr_input += idim; // next example in bunch at input + if (fname) { + fs << logP << endl; + } + } +#endif // old proba calculation + + nb_ex += n; + debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex)); + } while (data_available); + + printf(" - %d phrases, %d target words, avr length src=%.1f tgt=%.1f, CSTM: %d phrases (%.2f), %d target words (%.2f)\n", + nb_ex, nb_tg_words, (REAL) nb_src_words/nb_ex, (REAL) nb_tg_words/nb_ex, + nb_ex_slist, 100.0*nb_ex_slist/nb_ex, nb_tg_words_slist, 100.0 * nb_tg_words_slist/nb_tg_words); + if (ptable) { + printf(" - %d words were looked up in external phrase table, %d (%.2f%% were not found)\n", + nb_ex-nb_ex_slist, nb_not_in_ptable, 100.0*nb_not_in_ptable/(nb_ex-nb_ex_slist)); + } + +#ifdef DIRECT_PROBA_CALCULATION + REAL px = (nb_ex>0) ? exp(-log_sum / (REAL) nb_ex) : -1; + printf(" cstm px=%.2f, ln_sum=%.2f, cstm_short_px=%.2f, ln_sum=%.2f, overall px=%.2f, with unk=%.2f\n", + (nb_ex_slist>0) ? exp(-log_sum_cstm / (REAL) nb_ex_slist) : -1, log_sum_cstm, + (nb_ex_slist>0) ? exp(-log_sum_cstm_short / (REAL) nb_ex_slist) : -1, log_sum_cstm_short, + (nb_ex-nb_not_in_ptable>0) ? exp(-log_sum_notunk / (REAL) (nb_ex-nb_not_in_ptable)) : -1, + px); +#else + REAL px = (nb_ex>0) ? 
exp(-log_sum / (REAL) nb_tg_words_slist) : -1; + printf(" px=%.2f, ln_sum=%.2f\n", px, log_sum); +#endif + + if (fname) fs.close(); +#ifdef DUMP_PHRASE_TABLE + fspt.close(); +#endif + + return px; +} + + +//************************************************************************************** +// information after finishing an epoch + +void TrainerPhraseSlist::InfoPost () +{ + // if EOS is predicted by the NN, we don't count it as short + printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n", + nb_tg_words, nb_ex, + 100.0*nb_ex_short_inp/nb_ex, 100.0*nb_ex_short_tgt/nb_ex); + printf(" CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n", + nb_tg_words_slist, nb_ex_slist, 100.0*nb_ex_slist/nb_ex, + err_train); +} + +//************************************************************************************** +// request one n-gram probability, usually the called will be delayed +// and processes later + + +//************************************************************************************** +// collect all delayed probability requests + + +void TrainerPhraseSlist::ForwAndCollect(vector< vector > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos) +{ + if (bs<=0) return; + debug3("TrainerPhraseSlist::ForwAndCollect(): collecting outputs %d .. 
%d from bunch of size %d\n", req_beg, req_end, bs); + debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim); + + if (bs != (int) src_phrases.size()) + ErrorN("TrainerPhraseSlist::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs); + +#ifdef DEBUG + printf("bunch of %d\n",bs); + for (int b=0; bGetGpuConfig()); + mach->SetDataIn(gpu_input); + Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice); +#else + mach->SetDataIn(buf_input); +#endif + mach->Forw(bs,false); + +#ifdef BLAS_CUDA + for (int tw=0; twGetDataOut(), bs*dim_per_phrase*sizeof(REAL), cudaMemcpyDeviceToHost); + Gpu::StreamSynchronize(); +#endif + + // stats + int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0; + + for (int n=req_beg; n<=req_end; n++) { + REAL logP=0; + int b=areq[n].bs; + + if ((int) areq[n].tgph.size() > tg_nbphr) + ErrorN("TrainerPhraseSlist::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr); + +#ifdef DEBUG + printf("collect b=%3d \n input:", b); + for (int ii=0; iiMapIndex(outp, "TrainerPhraseSlist::ForwAndCollect() output"); + debug1("->%d",buf_target_wid[tw]); + all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]); + } + // fill up + for (; twGetProb(src_phrases[areq[n].bs], areq[n].tgph)); + debug1(" ptable: logP=%f\n", logP); + } + else { + // get proba from CSLM + debug0(" - in slist CSLM:"); + logP=0; int cnt=0; + for (int tw=0; twGetDataOut() + b*odim; + //test: REAL *optr=phrase_mach[i]->GetDataOut() + ni*dim_per_phrase; + //TODO: it would be much more efficient to do all the examples of one machine and then switch to the next one + REAL *optr=phrase_mach[tw]->GetDataOut() + b*dim_per_phrase; +#endif + debug1(" %e", optr[buf_target_wid[tw]]); + logP += safelog(optr[buf_target_wid[tw]]); + cnt++; + } + if (cnt==0) Error("no target phrases when collecting output"); + logP /= cnt; // TODO: is 
this normalization correct ? + debug1(" -> log avr=%f\n",logP); + + cnt_ex_slist++; + cnt_tg_words_slist += cnt; + } + + // store LM proba + areq[n].hyp->AddFeature(logP,tm_pos); + } // for (ni=...) + + printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n", + req_end-req_beg+1, cnt_tg_words, cnt_ex_slist, (float) 100.0* cnt_ex_slist / (req_end-req_beg+1), (float) cnt_tg_words_slist/cnt_ex_slist); + nb_ex += (req_end-req_beg+1); + nb_ex_slist += cnt_ex_slist; + nb_tg_words_slist += cnt_tg_words_slist; + nb_tg_words += cnt_tg_words; +} + + +void TrainerPhraseSlist::BlockStats() { + //printf(" - %d phrase probability requests, %d=%5.2f short phrase %d forward passes (avrg of %d probas), %d=%5.2f%% predicted by CSTM\n", + //nb_ngram, nb_ex_short_tgt, 100.0*nb_ex_short_tgt/nb_ngram, nb_forw, nb_ngram/nb_forw, nb_ex_slist, 100.0*nb_ex_slist/nb_ngram); + printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n", + nb_forw, nb_ex_slist, 100.0 * nb_ex_slist/nb_ex); +} diff --git a/TrainerPhraseSlist.h b/TrainerPhraseSlist.h new file mode 100644 index 0000000..81a83f8 --- /dev/null +++ b/TrainerPhraseSlist.h @@ -0,0 +1,114 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#ifndef _TrainerPhraseSlist_h +#define _TrainerPhraseSlist_h + +#include +#include "Tools.h" +#include "Mach.h" +#include "ErrFct.h" +#include "DataPhraseBin.h" +#include "Trainer.h" +#include "WordList.h" + +#include "PtableMosesPtree.h" +#include "AlignReq.h" + +// +// Class to train neural networks to predict phrase probabilities +// - we use a short list of target words for which the NN predicts the proba +// - the proba of the other target words are obtained by a classical Moses phrase table +// - the NN also predicts the proba mass of ALL the words not in the short slist +// for this we use the last output neuron of the network + + +class TrainerPhraseSlist : public Trainer +{ +private: + int max_inp_idx; // largest index -1 of a word at the input (# of entries in projection table) + int tg_nbphr; // number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr + int dim_per_phrase; // output dimension of each phrase prediction layer (must be equal size) + WordID tg_slist_len; // length of slist (this is set to dim_per_phrase MINUS ONE) + WordList *sr_wlist; + WordList *tg_wlist; + vector phrase_mach; // pointer to the output machine for each phrase + vector mach_errfct; // each individual machine has its own error function with local memory + // in this version of the Trainer the error function is identical to all machines + // (we use the one in the local variable of the mother class Trainer) + + PtableMosesPtree *ptable; // classical phrase table + + // handling of short sequences + // input output + // NULL_WORD set proj=0 set grad=0 + // EOS as normal word as normal word + // + WordID eos_src, eos_tgt; // defaults to NULL_WORD if no special symbol in word list + + // various stats + int nb_ex_slist; // 
total number of examples processed in slist + int nb_ex_short_inp; // total number of incomplete input phrases + int nb_ex_short_tgt; // total number of incomplete target phrases + int nb_tg_words; // total number of target words (there can be several target words for a phrase pair) + int nb_tg_words_slist; // total number of target words which are in short list +// TODO: use WordID vector for targets in order to make less casts + WordID *buf_target_wid; // used instead of buf_target to avoid casts between REAL and WordID + // size is odim x bsize + WordID *buf_target_ext; // similar to buf_target_wid[], but keep even word id out side of short list + // needed to request probas from external phrase table + REAL *buf_target_in_blocks; // same data than in buf_target of Trainer class, but re-arranged in blocks for individual machines +#ifdef BLAS_CUDA + vector gpu_target; // copied from trainer to GPU +#endif +#ifdef DEBUG + vector words; // give UTF8 word for a given CSLM internal index +#endif + REAL DoTestDev(char*, bool); // internal helper function + void DoConstructorWork(); // internal helper function for the various constructors + // data and functions for block processing + int nb_forw; // stats on total number of forward passes + void GetMostLikelyTranslations(ofstream&,REAL*,int); +protected: + virtual void InfoPost(); // dump information after finishing a training epoch +public: + TrainerPhraseSlist(Mach*, Lrate*, ErrFct*, // mach, lrate, errfct + const char*, const char*, const char*, int, // train, dev, external phrase table, number of scores + REAL =0, int =10, int =0); // wdecay, max epochs, current epoch + TrainerPhraseSlist(Mach*, ErrFct*, Data*, // for testing only: mach, errfct, binary data + char*, int); // external phrase table, number of scores + TrainerPhraseSlist(Mach*, WordList*, WordList*, // for general proba calculation: mach, src word list, tgt word list + char*, int , char*); // external phrase table, number of scores, score specif + 
virtual ~TrainerPhraseSlist(); + virtual REAL Train(); // train for one epoch + virtual REAL TestDev(char* =NULL); // test current network on dev data and save outputs into file + // fast block evaluation functions + virtual void StoreInput(int b, int d, REAL val) {buf_input[b*idim+d]=val;} // store value d of example b; one row of idim values per example, same layout as Train()/TestDev() and the bs*idim copy in ForwAndCollect() + virtual void ForwAndCollect(vector< vector > &, AlignReq*, int,int,int,int); // for nbest rescoring + virtual void BlockStats(); // display some stats on Block mode + // interface functions + virtual int GetTgtNbPhr() {return tg_nbphr; } + virtual int GetSlistLen() {return tg_slist_len; } + virtual REAL *GetBufInput() {return buf_input; } +}; + +#endif diff --git a/TrainerPhraseSlist1.cpp b/TrainerPhraseSlist1.cpp new file mode 100644 index 0000000..5f369d4 --- /dev/null +++ b/TrainerPhraseSlist1.cpp @@ -0,0 +1,951 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + */ + +using namespace std; +#include +#include +#include +#include + +#include "Tools.h" +#include "Mach.h" +#include "MachTab.h" +#include "MachPar.h" +#include "MachSeq.h" +#include "MachSplit1.h" +#include "TrainerPhraseSlist1.h" + +#include "NBest.h" +#include "sort.cpp" + +void TrainerPhraseSlist1::DoConstructorWork() +{ + char msg[1024]; + + idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize(); + +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + gpu_input = Gpu::Alloc(idim*bsize, "inputs in Trainer"); + gpu_target = Gpu::Alloc(odim*bsize, "targets in Trainer"); + host_output = new REAL[odim*bsize]; +#endif + buf_target_wid = new WordID[odim*bsize]; + buf_target_ext = new WordID[odim*bsize]; + + // set up vector to outputs of the target phrases + if (mach->GetMType() != file_header_mtype_mseq) + Error("CSTM: sequential machine needed\n"); + MachSeq *mseq=(MachSeq*) mach; + if (mseq->MachGetNb()<2) + Error("CSTM: the number of machines is suspeciously small"); + + // check input layer + if (mseq->MachGet(0)->GetMType() != file_header_mtype_mpar) + Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n"); + MachPar *mpar = (MachPar*) mseq->MachGet(0); + if (mpar->MachGet(0)->GetMType() != file_header_mtype_tab) + Error("TrainerPhraseSlist1::DoConstructorWork: CSTM: the input layer has the wrong architecture\n"); + MachTab *mtab = (MachTab*) mpar->MachGet(0); + max_inp_idx = mtab->GetMaxInpVal(); + + // check output layer + if (mseq->MachGet(mseq->MachGetNb()-1)->GetMType() != file_header_mtype_msplit1) + Error("CSTM: the output layer has the wrong architecture\n"); + MachSplit1 *msp = (MachSplit1*) mseq->MachGet(mseq->MachGetNb()-1); + tg_nbphr=msp->MachGetNb(); + if 
(data_train && (data_train->GetOdim() != tg_nbphr)) { + sprintf(msg,"CSTM: output dimension of the training data should be %d, found %d\n", tg_nbphr, data_train->GetOdim()); + Error(msg); + } + + phrase_mach.clear(); + for (int m=0; mMachGet(m)); + if (m>0 && phrase_mach[m-1]->GetOdim() != phrase_mach[m]->GetOdim()) + Error("CSTM: the output layer dimension must be identical for all phrases\n"); + } + dim_per_phrase = phrase_mach[0]->GetOdim(); + cout << " - this machine can predict up to " << phrase_mach.size() << " phrases, each with an output layer of dimension " << dim_per_phrase << endl; + tg_slist_len = dim_per_phrase-1; + + + // get source word list + if (sr_wlist == NULL) { + vector *vect_wlist = NULL; + if (data_dev != NULL) + vect_wlist = data_dev->GetSrcWList(); + else if (data_train != NULL) + vect_wlist = data_train->GetSrcWList(); + if ((vect_wlist != NULL) && !vect_wlist->empty()) + sr_wlist = &(vect_wlist->front()); + } + if (sr_wlist == NULL) + Error("no source word list available"); + if ((int) sr_wlist->GetSize() > max_inp_idx) + Error("the size of the source word list exceeds the number of input words the machine was trained for"); + + // get target word list + if (tg_wlist == NULL) { + vector *vect_wlist = NULL; + if (data_dev != NULL) + vect_wlist = data_dev->GetTgtWList(); + else if (data_train != NULL) + vect_wlist = data_train->GetTgtWList(); + if ((vect_wlist != NULL) && !vect_wlist->empty()) + tg_wlist = &(vect_wlist->front()); + } + if (tg_wlist == NULL) + Error("no target word list available"); + if (!tg_wlist->FrequSort()) + Error("the target word list don't contain word count"); + if (tg_wlist->GetSize() <= tg_slist_len) + Error("TrainerPhraseSlist1: the output layer is larger than the target word list"); + + ulong sum_sl=0, sum=0; + tg_wlist->SetShortListLength(tg_slist_len); + tg_wlist->CountWords(sum_sl, sum); + printf (" - setting up target short list of %d words, coverage of %5.2f%%\n", tg_slist_len, 100.0*sum_sl/sum); + +#ifdef 
DEBUG2 + cout << "Words in slist:" << endl; + WordID ci=tg_slist_len; + WordList::const_iterator iter, end = tg_wlist->End(); + for (iter=tg_wlist->Begin(); (iter!=end) && (ci > 0); iter++, ci--) + printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id); +#endif + +#ifdef DEBUG2 + cout << "Words not in slist:" << endl; + for (; iter!=end; iter++) + printf (" %s cnt=%d idx=%d\n", iter->word, iter->n, iter->id); +#endif + +#ifdef DEBUG2 + // just needed for debugging + words.reserve(tg_wlist->GetSize()); + for (iter=tg_wlist->Begin(); iter!=end; iter++) words[iter->id] = strdup(iter->word); +#endif + + debug0(" + done init TrainerPhraseSlist1\n"); +} + +// +// constructor for training +// + +TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, Lrate *lrate, ErrFct *perrfct, + const char *train_fname, const char *dev_fname, const char *pt_fname, int p_nscores, + REAL p_wd, int p_maxep, int p_ep) + : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(NULL), tg_wlist(NULL), + nb_ex_slist(0), nb_ex_short(0), + nb_forw(0) +{ + debug2("*** Constructor TrainerPhraseSlist1 for training idim=%d, odim=%d ***\n",idim,odim); + cout << "Setting up CSTM training with short list" << endl; + char msg[1024]; + + if (train_fname) { + data_train = new Data(train_fname); + if (idim != data_train->GetIdim()) { + sprintf(msg,"TrainerPhraseSlist1: input dimension of the training data (%d) does not match the one of the machine (%d)\n", data_train->GetIdim(), idim); + Error(msg); + } + if (data_train->GetOdim()<1 || data_train->GetOdim()>10) { + sprintf(msg,"TrainerPhraseSlist1: output dimension of the training data should be 1..10, found %d\n", data_train->GetOdim()); + Error(msg); + } + auxdim = data_train->GetAuxdim(); + } + else + data_train=NULL; + + if (dev_fname) { + data_dev = new Data(dev_fname); + data_dev_alloc=true; + if (idim != data_dev->GetIdim()) { + sprintf(msg,"TrainerPhraseSlist1: input dimension of the validation 
data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim); + Error(msg); + } + if (data_dev->GetOdim()<1 || data_dev->GetOdim()>10) { + sprintf(msg,"TrainerPhraseSlist1: output dimension of the validation data should be 1..10, found %d\n", data_dev->GetOdim()); + Error(msg); + } + int auxdim_dev = data_dev->GetAuxdim(); + if (0 >= auxdim) + auxdim = auxdim_dev; + else if (auxdim != auxdim_dev) + ErrorN("TrainerPhraseSlist1: auxiliary data dimension of the validation data should be %d, found %d", auxdim, auxdim_dev); + } + else { + data_dev=NULL; + data_dev_alloc=false; + } + iaux = (idim - auxdim); + + DoConstructorWork(); + + if (data_dev) { + if (pt_fname) { + cout << " - loading external phrase table from " << pt_fname << endl; + ptable.Read(pt_fname,5,"1:2"); + } + else + cout << " - no external phrase table provided" << endl; + } +} + +// +// constructor for testing +// + +TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, ErrFct *perrfct, + Data *data, char *pt_fname, int p_nscores) + : Trainer(pmach,NULL,perrfct,NULL,NULL), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(NULL), tg_wlist(NULL), + nb_ex_slist(0), nb_ex_short(0), + nb_forw(0) +{ + debug0("*** Constructor TrainerPhraseSlist1 for testing ***\n"); + cout << "Setting up testing with short list" << endl; + char msg[1024]; + + data_train=NULL; + data_dev=data; + data_dev_alloc=false; // do not free it by this class ! 
+ + if (idim != data_dev->GetIdim()) { + sprintf(msg,"TrainerPhraseSlist1: input dimension of the test data (%d) does not match the one of the machine (%d)\n", data_dev->GetIdim(), idim); + Error(msg); + } + auxdim = data_dev->GetAuxdim(); + iaux = (idim - auxdim); + + DoConstructorWork(); + + cout << " - loading external phrase table from " << pt_fname << endl; +#ifdef BACKWRAD_TM + ptable.Read(pt_fname,5,"1:0"); // backward TM prob +#else + ptable.Read(pt_fname,5,"1:2"); // forward TM prob +#endif +} + +// +// constructor for nbest rescoring +// + +TrainerPhraseSlist1::TrainerPhraseSlist1 (Mach *pmach, + WordList *p_sr_wlist, WordList *p_tg_wlist, + char *pt_fname, int nscores, char *scores_specif) + : Trainer(pmach,NULL,NULL,NULL,NULL), // TODO; should I call: TrainerNgram(pmach,NULL,NULL,NULL), + tg_nbphr(0), tg_slist_len(0), + sr_wlist(p_sr_wlist), tg_wlist(p_tg_wlist), + nb_ex_short(0), nb_forw(0) +{ + debug0("*** Constructor TrainerPhraseSlist1 for block operations ***\n"); + cout << "Setting up CSTM with short list" << endl; + // TODO: init with TrainerNgram before + DoConstructorWork(); + + cout << " - loading external phrase table from " << pt_fname << endl; + ptable.Read(pt_fname, nscores, scores_specif); +} + +//************************************************************************************** + +TrainerPhraseSlist1::~TrainerPhraseSlist1 () +{ + debug0("*** Destructor TrainerPhraseSlist1 ***\n"); + + if (buf_target_wid) delete [] buf_target_wid; + if (buf_target_ext) delete [] buf_target_ext; + // buf_input and buf_target will be deleted by ~Trainer() + + phrase_mach.clear(); + +#ifdef DEBUG2 + vector::const_iterator iter, end = words.end(); + for (iter=words.begin(); iter!=end; iter++) delete *iter; + words.clear(); +#endif +} + + +//************************************************************************************** + +REAL TrainerPhraseSlist1::Train() +{ + if (!data_train) return -1; +#ifdef DEBUG + printf("*****************\n"); + 
printf("TrainerPhraseSlist1::Train():\n"); + printf(" - idim=%d, odim=%d, tg_nbphr=%d\n", idim, odim, tg_nbphr); + printf(" - data_in: %p \n", (void*) buf_input); + printf(" - target: %p \n", (void*) buf_target); + printf(" - tgt WID: %p \n", (void*) buf_target_wid); + printf(" - grad_out: %p \n", (void*) errfct->GetGrad()); +#endif + + Timer ttrain; // total training time + Timer tload; + Timer ttransfer; // total transfer time of data to GPU + Timer tforw; // total forw time + Timer tgrad; // total gradient time + Timer tbackw; // total backw time + ttrain.start(); + + data_train->Rewind(); + + REAL log_sum=0; + int i; + nb_ex=nb_ex_slist=nb_ex_short_inp=nb_ex_short=0; + nb_tg_words=nb_tg_words_slist=0; + + +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input + errfct->SetTarget(gpu_target); // we copy from buf_target to gpu_target + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); +#else + mach->SetDataIn(buf_input); + errfct->SetTarget(buf_target); + debug1(" - buf_input %p\n", buf_input); + debug1(" - buf_target %p\n", buf_target); +#endif + errfct->SetOutput(mach->GetDataOut()); + mach->SetGradOut(errfct->GetGrad()); + bool data_available; + do { + tload.start(); + // get a bunch of data and map all the words + int n=0, nbtgsl=0; + data_available = true; + while (n < mach->GetBsize() && data_available) { + data_available = data_train->Next(); + if (!data_available) break; + debug0("TRAIN DATA: input: "); + bool at_least_one_short=false; + for (i=0; iinput[i]; + debug2(" %s[%d]", sr_wlist->GetWordInfo(inp).word,inp); +#if TODO // should we map input data ? 
+ buf_input[n*idim + i] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::Train(): input"); // map context words IDs + if (inp == NULL_WORD) + at_least_one_short=true; +#else + buf_input[n*idim + i] = inp; + if (inp == NULL_WORD) + at_least_one_short=true; + else if (inp<0 || inp>=(int)sr_wlist->GetSize()) + ErrorN("TrainerPhraseSlist1::Train(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize()); +#endif + } + for (; i < idim ; i++) // copy auxiliary data + buf_input[n * idim + i] = data_train->input[i]; + if (at_least_one_short) nb_ex_short_inp++; + + debug0(" - > mapped: "); + + bool all_in_slist=true; // ALL to be predicted words are in short list + at_least_one_short=false; + nbtgsl=0; + for (i=0; itarget[i]; + int idx=i+n*tg_nbphr; + buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::Train(): output"); // TODO: not really needed during training, just the current value + if (outp==NULL_WORD) { + buf_target[idx] = (REAL) NULL_WORD; + at_least_one_short=true; + debug1(" -[%d->NULL]",(int) buf_target[idx]); + } + else { + nb_tg_words++; + if (tg_wlist->InShortList(buf_target_wid[idx])) { + buf_target[idx] = (REAL) buf_target_wid[idx]; + debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp); + nbtgsl++; + } + else { + buf_target[idx] = (REAL) tg_slist_len; // words that are not in slist are ALL done by the last output neuron + debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp); + all_in_slist=false; + } + } + } + if (all_in_slist) { + nb_ex_slist++; + nb_tg_words_slist += nbtgsl; + } + if (at_least_one_short) nb_ex_short++; + debug1(" all_slist=%d\n",all_in_slist); + + n++; + } // loop to get a bunch of examples + debug4("train bunch of %d words, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n)); + tload.stop(); + +#ifdef DEBUG2 +printf("network data:\n"); +REAL *iptr=buf_input; +REAL *tptr=buf_target; 
+for (int nn=0;nn "); + for (i=0;i0) { +#ifdef BLAS_CUDA + ttransfer.start(); + Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice); + Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice); + Gpu::StreamSynchronize(); + ttransfer.stop(); +#endif + tforw.start(); + mach->Forw(n,true); + tforw.stop(); + + tgrad.start(); + log_sum += errfct->CalcGrad(n); + tgrad.stop(); + + debug1(" log_sum=%e\n",log_sum); +#ifdef DEBUG2 + int t=(int) data_train->target[0]; +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + REAL * tmp = Gpu::Alloc(5, "tmp buffer for DEBUG2"); + cublasGetVector(odim,CUDA_SIZE,mach->GetDataOut(),1,tmp,1); + printf("OUTPUT:"); + for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n"); + cublasGetVector(3, CUDA_SIZE, data_train->target, 1, tmp, 1); + printf("TARGET:"); + for (int i=0;i<1; i++) printf(" %f", tmp[i]); printf("\n"); + //TODO check if we need odim or idim! + cublasGetVector(odim*bsize, CUDA_SIZE, errfct->GetGrad(), 1, tmp, 1); + printf(" GRAD:"); + for (int i=t-2;i<=t+2; i++) printf(" %f",tmp[i]); printf("\n"); + cublasFree(tmp); +#else +printf("OUTPUT:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",mach->GetDataOut()[i]); printf("\n"); +printf("TARGET:") ; for (int i=0;i<1; i++) printf(" %f",data_train->target[i]); printf("\n"); +printf(" GRAD:") ; for (int i=t-2;i<=t+2; i++) printf(" %f",errfct->GetGrad()[i]); printf("\n"); +#endif //BLAS_CUDA +#endif //DEBUG2 + lrate->UpdateLrateOnForw(mach->GetNbForw()); + tbackw.start(); + mach->Backw(lrate->GetLrate(), wdecay, n); + tbackw.stop(); + } + + nb_ex += n; + } while (data_available); +#ifdef BLAS_CUDA + Gpu::StreamSynchronize(); +#endif + + ttrain.stop(); + ttrain.disp(" - training time: "); + tload.disp(" including load: "); +#ifdef BLAS_CUDA + ttransfer.disp(" transfer: "); +#endif + tforw.disp(" forw: "); + tgrad.disp(" grad: "); + tbackw.disp(" backw: "); + printf("\n"); + + printf(" = log_sum=%.2f, 
nb_tg_words=%d, nb_ex_slist=%d, nb_tg_words_slist=%d\n", log_sum, nb_tg_words, nb_ex_slist, nb_tg_words_slist); + if (nb_tg_words>0) return exp(-log_sum / (REAL) nb_tg_words); // when normalizing consider that all examples lead to a forward pass + + return -1; +} + +//************************************************************************************** +// + +void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni) +{ + int Nbest=100; + + // get input length + int input_length; + for (input_length=0;input_length > > prepared_scores + = prepare_hypotheses(optr, tg_nbphr, dim_per_phrase, Nbest); + std::vector > > best + = sort_ngrams(prepared_scores, input_length, Nbest); + + for(std::size_t i = 0; i < best.size(); ++i) { + // source + for (int j=0; jGetWordInfo(buf_input[ni*idim+j]).word << " "; + } + + // target + fspt << "|||"; + for(std::size_t j = 0; j < best[i].second.size(); ++j) { + fspt << " " << tg_wlist->GetWordInfoMapped(best[i].second[j]).word; + } + + // score + fspt << " ||| " << exp(best[i].first); + fspt << "\n"; + } + +} + +//************************************************************************************** +// +#if 0 +void TrainerPhraseSlist1::GetMostLikelyTranslations (ofstream &fspt, REAL *optr, int ni) +{ + int i; + // Find most likely outputs + for (i=0;iGetWordInfo(buf_input[ni*idim+i]).word << " "; + } + fspt << "||| "; + + for (i=0; imax) { max=*sptr; max_idx=s; } + } + fspt << tg_wlist->GetWordInfoMapped(max_idx).word << "[" << max << "] "; + } + fspt << endl; +} +#endif + +//************************************************************************************** +// + +REAL TrainerPhraseSlist1::TestDev(char *fname) +{ + if (!data_dev) return -1; + + vector src_phrase; // interface with classical phrase tables + vector tgt_phrase; + vector done_by_cstm; + + ofstream fs; + if (fname) { + cout << " - dumping phrase probability stream to file '" << fname << "'" << endl; + fs.open(fname,ios::out); + 
CHECK_FILE(fs,fname); + } + + char *ptfname = (char*) "alltrans.txt"; + ofstream fspt; + cout << " - dumping new phrase table to file '" << ptfname << "'" << endl; + fspt.open(ptfname,ios::out); + CHECK_FILE(fspt,ptfname); + + nb_ex=nb_ex_slist=nb_ex_short=0; + nb_tg_words=nb_tg_words_slist=0; + int nb_probs=0; // this counts the number of cumulated log probs. + // This increments by only one for external phrase tables, independently of the target phrase length + REAL logP, log_sum=0; + REAL log_sum_cstm=0; // only CSLM, i.e. considering phrases done by CSTM + + uint idx; + +#ifdef BLAS_CUDA + Gpu::SetConfig(mach->GetGpuConfig()); + mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input + errfct->SetTarget(gpu_target); // we copy from buf_target to gpu_target + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); +#else + mach->SetDataIn(buf_input); + errfct->SetTarget(buf_target); +#endif + errfct->SetOutput(mach->GetDataOut()); + + bool data_available; + data_dev->Rewind(); + do { + // get a bunch of data + int n=0, i; + data_available = true; + debug0("start bunch\n"); + done_by_cstm.clear(); + while (n < mach->GetBsize() && data_available) { + data_available = data_dev->Next(); + if (!data_available) break; + + debug0("DEV DATA: input: "); + bool at_least_one_short=false; + for (i=0; iinput[i]; + idx=n*idim + i; + debug1(" %d", inp); +#if TODO // should we map input data ? 
+ buf_input[idx] = (REAL) sr_wlist->MapIndex(inp, "TrainerPhraseSlist1::TestDev(): input"); // map context words IDs + if (inp == NULL_WORD) + at_least_one_short=true; +#else + buf_input[idx] = inp; + if (inp == NULL_WORD) + at_least_one_short=true; + else if (inp<0 || inp>=(int)sr_wlist->GetSize()) + ErrorN("TrainerPhraseSlist1::TestDev(): input out of bounds (%d), must be in [0,%d[", inp, (int) sr_wlist->GetSize()); +#endif + } + for (; i < idim ; i++) // copy auxiliary data + buf_input[n * idim + i] = data_dev->input[i]; + if (at_least_one_short) nb_ex_short_inp++; + + debug0(" - > mapped: "); + + bool all_in_slist=true; // ALL to be predicted words are in short list + int nb_words_not_null=0; + at_least_one_short=false; + for (i=0; itarget[i]; + idx=n*tg_nbphr + i; + buf_target_wid[idx] = tg_wlist->MapIndex(outp, "TrainerPhraseSlist1::TestDev(): output"); + buf_target_ext[idx] = buf_target_wid[idx]; // keep target word ID for Moses phrase-table + if (outp==NULL_WORD) { + buf_target[idx] = (REAL) NULL_WORD; + at_least_one_short=true; // TODO: optimize: we should be able to stop the loop on "i" + debug1(" %d[NULL]",(int) buf_target_wid[idx]); + } + else { + nb_tg_words++; + nb_words_not_null++; + if (tg_wlist->InShortList(buf_target_wid[idx])) { + buf_target[idx] = (REAL) buf_target_wid[idx]; + debug3(" %s[%d->%d]", tg_wlist->GetWordInfo(outp).word, (int) buf_target_wid[idx], outp); + //nbtgsl++; + } + else { + // TODO: we actually don't need a forward pass for words in the short lists or short n-grams + // this could be used to save some time (5-10%) + buf_target_wid[idx] = tg_slist_len; + buf_target[idx] = (REAL) tg_slist_len; // words that are not in slist are ALL done by the last output neuron + debug3(" %s[%d->%d]*", tg_wlist->GetWordInfo(outp).word,(int) buf_target_wid[idx], outp); + all_in_slist=false; + } + } + } + done_by_cstm.push_back(all_in_slist); + if (all_in_slist) { + nb_ex_slist++; + nb_tg_words_slist += nb_words_not_null; + //nb_tg_words_slist 
+= nbtgsl; + } + if (!at_least_one_short) nb_ex_short++; + debug1(" all_slist=%d\n",all_in_slist); + + n++; + } // loop to get a bunch ef examples + debug4("dev bunch of %d phrases, totl=%d, totl slist=%d [%.2f%%]\n", n, nb_ex+n, nb_ex_slist, 100.0*nb_ex_slist/(nb_ex+n)); + +#ifdef DEBUG2 +printf("network data:\n"); +REAL *iptr=buf_input; +REAL *tptr=buf_target; +for (int nn=0;nn "); + for (i=0;i0) { +#ifdef BLAS_CUDA + Gpu::MemcpyAsync(gpu_input, buf_input , n*idim*sizeof(REAL), cudaMemcpyHostToDevice); + Gpu::MemcpyAsync(gpu_target, buf_target , n*odim*sizeof(REAL), cudaMemcpyHostToDevice); +#endif + mach->Forw(n,false); + log_sum_cstm += errfct->CalcValue(n); + } + + // get probas from CSLM or back-off LM +#ifdef BLAS_CUDA + cudaMemcpy(host_output, mach->GetDataOut(), n*odim*sizeof(REAL), cudaMemcpyDeviceToHost); + REAL *optr=host_output; + Error("TrainerPhraseSlist1::TestDev TODO CUDA"); +#else + REAL *optr=mach->GetDataOut(); // n times (tg_nbphr*tg_slen) = odim values +#endif + + debug1("Collect n=%d\n", n); + if (n!=(int) done_by_cstm.size()) + Error("TrainerPhraseSlist1::TestDev(): internal error, number of phrases done by CSTM does not match"); + + REAL *ptr_input = buf_input; // n times idim values + for (int ni=0; niGetWordInfo(buf_input[ni*idim+i]).word << " "; + } + fspt << "||| "; + for (i=0;iGetWordInfoMapped(buf_target_wid[ni*tg_nbphr+i]).word << " "; + } + fspt << "||| "; +#endif + + logP=0; + REAL *optr2=optr; + for (i=0; iGetWordInfo((uint) ptr_input[i]).word); // TODO: char* to string + debug2(" %s[%d]", src_phrase.back().c_str(), (uint) ptr_input[i]); + } + tgt_phrase.clear(); + debug0(" target:"); + for (i=0; iGetWordInfoMapped(buf_target_ext[i+ni*tg_nbphr]).word); // TODO: char* to string + debug2(" %s[%d]", tgt_phrase.back().c_str(), buf_target_ext[i+ni*tg_nbphr]); + } +#ifdef BACKWRAD_TM + logP = safelog(ptable.GetProb(tgt_phrase, src_phrase)); +#else + logP = safelog(ptable.GetProb(src_phrase, tgt_phrase)); +#endif + nb_probs++; + debug1(" 
=> logP=%e\n",logP); +#else + logP=1; +#endif + } + + log_sum += logP; + ptr_input += idim; // next example in bunch at input + optr += odim; // next example in bunch at output + if (fname) { + fs << ((nb_tg>0) ? logP/nb_tg : -1) << endl; + } + } + + nb_ex += n; + debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex)); + } while (data_available); + + printf(" %d target words in %d phrases (%d=%.2f%% uncomplete), CSTM: %d target words in %d phrases (%.2f%%)\n", + nb_tg_words, nb_ex, + nb_ex_short, 100.0*nb_ex_short/nb_ex, + nb_tg_words_slist, nb_ex_slist, 100.0*nb_ex_slist/nb_ex); + + + REAL px = (nb_probs>0) ? exp(-log_sum / (REAL) nb_probs) : -1; + printf(" cstm px=%.2f, ln_sum=%.2f, overall px=%.2f (%d values)\n", + (nb_tg_words_slist>0) ? exp(-log_sum_cstm / (REAL) nb_tg_words_slist) : -1, log_sum_cstm, px, nb_probs); + + if (fname) fs.close(); + fspt.close(); + + return px; +} + + +//************************************************************************************** +// information after finishing an epoch + +void TrainerPhraseSlist1::InfoPost () +{ + printf(" - epoch finished, %d target words in %d phrases (%.2f/%.2f%% short source/target)\n", + nb_tg_words, nb_ex, + 100.0*nb_ex_short_inp/nb_ex_slist, 100.0*nb_ex_short/nb_ex_slist); + printf(" CSTM: %d target words in %d phrases (%.2f%%), avrg px=%.2f\n", + nb_tg_words_slist, nb_ex_slist, 100.0*nb_ex_slist/nb_ex, + err_train); +} + +//************************************************************************************** +// request one n-gram probability, usually the called will be delayed +// and processes later + + +//************************************************************************************** +// collect all delayed probability requests + + +void TrainerPhraseSlist1::ForwAndCollect(vector< vector > &src_phrases, AlignReq *areq, int req_beg, int req_end, int bs, int tm_pos) +{ + if (bs<=0) return; + debug3("TrainerPhraseSlist1::ForwAndCollect(): collecting outputs %d .. 
%d from bunch of size %d\n", req_beg, req_end, bs); + debug3("\ttarget machines %d x dim %d = total %d\n", tg_nbphr, dim_per_phrase, odim); + + if (bs != (int) src_phrases.size()) + ErrorN("TrainerPhraseSlist1::ForwAndCollect(): the number of source phrases (%d) does not match block length (%d)", (int) src_phrases.size(), bs); + +#ifdef DEBUG + printf("bunch of %d\n",bs); + for (int b=0; bGetGpuConfig()); + mach->SetDataIn(gpu_input); + Gpu::MemcpyAsync(gpu_input, buf_input , bs*idim*sizeof(REAL), cudaMemcpyHostToDevice); +#else + mach->SetDataIn(buf_input); +#endif + mach->Forw(bs,false); + +#ifdef BLAS_CUDA + Gpu::MemcpyAsync(host_output, mach->GetDataOut(), bs*odim*sizeof(REAL), cudaMemcpyDeviceToHost); + Gpu::StreamSynchronize(); +#endif + + // stats + int cnt_ex_slist=0, cnt_tg_words=0, cnt_tg_words_slist=0; + + for (int n=req_beg; n<=req_end; n++) { + REAL logP=0; + int b=areq[n].bs; + + if ((int) areq[n].tgph.size() > tg_nbphr) + ErrorN("TrainerPhraseSlist1::ForwAndCollect(): target phrase too long (%d) for machine (%d)", (int) areq[n].tgph.size(), tg_nbphr); + +#ifdef DEBUG + printf("collect b=%3d \n input:", b); + for (int ii=0; iiMapIndex(outp, "TrainerPhraseSlist1::ForwAndCollect() output"); + debug1("->%d",buf_target_wid[tw]); + all_in_slist=tg_wlist->InShortList(buf_target_wid[tw]); + } + // fill up + for (; twGetDataOut() + b*odim; +#endif + + if (!all_in_slist) { + // get proba from external phrase table + logP=ptable.GetProb(src_phrases[areq[n].bs], areq[n].tgph); + debug1(" ptable: logP=%f\n", logP); + } + else { + // get proba from CSLM + debug0(" - in slist CSLM:"); + logP=0; int cnt=0; + for (int tw=0; tw log avr=%f\n",logP); + + cnt_ex_slist++; + cnt_tg_words_slist += cnt; + } + + // store LM proba + areq[n].hyp->AddFeature(logP,tm_pos); + } // for (ni=...) 
+ + printf(" nb of phrases: %d with %d target words, by CSTM %d (%5.2f%%), avrg length %1.2f words\n", + req_end-req_beg+1, cnt_tg_words, cnt_ex_slist, (float) 100.0* cnt_ex_slist / (req_end-req_beg+1), (float) cnt_tg_words_slist/cnt_ex_slist); + nb_ex += (req_end-req_beg+1); + nb_ex_slist += cnt_ex_slist; + nb_tg_words_slist += cnt_tg_words_slist; + nb_tg_words += cnt_tg_words; +} + + +void TrainerPhraseSlist1::BlockStats() { + //printf(" - %d phrase probability requests, %d=%5.2f short phrase %d forward passes (avrg of %d probas), %d=%5.2f%% predicted by CSTM\n", + //nb_ngram, nb_ex_short, 100.0*nb_ex_short/nb_ngram, nb_forw, nb_ngram/nb_forw, nb_ex_slist, 100.0*nb_ex_slist/nb_ngram); + printf(" - CSTM: %d forward passes, %d=%5.2f%% phrases were predicted by CSTM\n", + nb_forw, nb_ex_slist, 100.0 * nb_ex_slist/nb_ex); +} diff --git a/TrainerPhraseSlist1.h b/TrainerPhraseSlist1.h new file mode 100644 index 0000000..dad0a95 --- /dev/null +++ b/TrainerPhraseSlist1.h @@ -0,0 +1,105 @@ +/* + * This file is part of the continuous space language and translation model toolkit + * for statistical machine translation and large vocabulary speech recognition. + * + * Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France + * + * The CSLM toolkit is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License version 3 as + * published by the Free Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License + * for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + * + * + */ + +#ifndef _TrainerPhraseSlist1_h +#define _TrainerPhraseSlist1_h + +#include +#include "Tools.h" +#include "Mach.h" +#include "ErrFct.h" +#include "DataPhraseBin.h" +#include "Trainer.h" +#include "WordList.h" + +#include "PtableMosesPtree.h" +#include "AlignReq.h" + +// +// Class to train neural networks to predict phrase probabilities +// - we use a short list of target words for which the NN predicts the proba +// - the proba of the other target words are obtained by a classical Moses phrase table +// - the NN also predicts the proba mass of ALL the words not in the short slist +// for this we use the last output neuron of the network + + +// +// helper class to store and compare one ngram LM request +// ugly C-style structure, but this seems to be more efficient + +class TrainerPhraseSlist1 : public Trainer +{ +private: + int max_inp_idx; // largest index -1 of a word at the input (# of entries in projection table) + int tg_nbphr; // number of phrases at output, odim should be (tg_slist_len+1) * tg_nbphr + int dim_per_phrase; // output dimension of each phrase prediction layer (must be equal size) + WordID tg_slist_len; // length of slist (this is set to dim_per_phrase MINUS ONE) + WordList *sr_wlist; + WordList *tg_wlist; + vector phrase_mach; // pointer to the output machine for each phrase + + // classical phrase table + PtableMosesPtree ptable; + + // various stats + int nb_ex_slist; // total number of examples processed in slist + int nb_ex_short_inp; // total number of incomplete input phrases + int nb_ex_short; // total number of incomplete target phrases + int nb_tg_words; // total number of target words (there can be several target words for a phrase pair) + int nb_tg_words_slist; // total number of target words 
which are in short list +// TODO: use WordID vector for targets in order to make less casts + WordID *buf_target_wid; // used instead of buf_target to evitate casts between REAL and WordID + // size is odim x bsize + WordID *buf_target_ext; // similar to buf_target_wid[], but keep even word id out side of short list + // needed to request probas from external phrase table +#ifdef DEBUG + vector words; // give UTF8 word for a given CSLM internal index +#endif + REAL DoTestDev(char*, bool); // internal helper function + void DoConstructorWork(); // internal helper function for the various constructors + // data and functions for block processing + int nb_forw; // stats on total number of forward passes + void GetMostLikelyTranslations(ofstream&,REAL*,int); +protected: + virtual void InfoPost(); // dump information after finishing a training epoch +public: + TrainerPhraseSlist1(Mach*, Lrate*, ErrFct*, // mach, lrate, errfct + const char*, const char*, const char*, int, // train, dev, external phrase table, number of scores + REAL =0, int =10, int =0); // wdecay, max epochs, current epoch + TrainerPhraseSlist1(Mach*, ErrFct*, Data*, // for testing only: mach, errfct, binary data + char*, int); // external phrase table, number of scores + TrainerPhraseSlist1(Mach*, WordList*, WordList*, // for general proba calculation: mach, src word list, tgt word list + char*, int , char*); // external phrase table, number of scores, score specif + virtual ~TrainerPhraseSlist1(); + virtual REAL Train(); // train for one epoch + virtual REAL TestDev(char* =NULL); // test current network on dev data and save outputs into file + // fast block evaluation functions + virtual void StoreInput(int b, int d, REAL val) {buf_input[b*bsize+d]=val;} + virtual void ForwAndCollect(vector< vector > &, AlignReq*, int,int,int,int); // for nbest rescoring + virtual void BlockStats(); // display some stats on Block mode + // interface functions + virtual int GetTgtNbPhr() {return tg_nbphr; } + virtual 
int GetSlistLen() {return tg_slist_len; } + virtual REAL *GetBufInput() {return buf_input; } +}; + +#endif diff --git a/docs/Descritpion-of-features.txt b/docs/Description-of-features.txt similarity index 100% rename from docs/Descritpion-of-features.txt rename to docs/Description-of-features.txt diff --git a/sort.cpp b/sort.cpp new file mode 100644 index 0000000..2965593 --- /dev/null +++ b/sort.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include + +//simple exponential decay as length penalty (input length = output length: no penalty) +REAL weight_lengths(std::size_t input_length, std::size_t output_length) { + return log(0.8)*abs(input_length-output_length); +} + + +//change data structure (vector of vectors of pairs) and prune number of hypotheses per length to N +std::vector > > prepare_hypotheses(REAL* scores, std::size_t maxLength, std::size_t vocab_size, std::size_t Nbest) { + + // outermost vector: one item per length + std::vector > > ret; + + // for each length + for(std::size_t i = 0; i < maxLength; ++i){ + std::vector > vec (vocab_size); + + // for each word in the vocabulary + for(std::size_t j = (i*vocab_size); j < ((i+1)*vocab_size); ++j){ + std::size_t idx = j-(i*vocab_size); + vec[idx] = std::make_pair(scores[j],idx); //store probability and index + } + + // prune to N most probable members + std::nth_element(vec.begin(), min(vec.end(),vec.begin()+Nbest), vec.end(), std::greater >()); + vec.resize(std::min(Nbest,vec.size())); + + ret.push_back(vec); + } + return ret; +} + +std::vector > > sort_ngrams(std::vector > > scores, std::size_t input_length, std::size_t Nbest) { + + //stack of hypotheses for building next greater length + std::vector > > seed; + std::vector tmp; + seed.push_back(std::make_pair(0,tmp)); + + std::vector > > ret; + + // for each n-gram length + for(std::size_t i = 0; i < scores.size(); ++i){ + + std::vector > > scores_current; + + //for each word in vocab (already pruned in prepare_hypotheses) + for(std::size_t j = 
0; j < scores[i].size(); ++j){ + + //for each hypothesis we kept from (n-gram-length-1) + for(std::size_t k = 0; k < seed.size(); ++k){ + + std::vector tempvect (seed[k].second); + tempvect.push_back(scores[i][j].second); + + scores_current.push_back(std::make_pair(seed[k].first + log(scores[i][j].first), tempvect)); + } + } + + //we only need Nbest hypotheses + std::nth_element(scores_current.begin(), min(scores_current.end(),scores_current.begin()+Nbest), scores_current.end(), std::greater > >()); + seed.resize(std::min(Nbest,scores_current.size())); + + REAL length_penalty = weight_lengths(input_length,i+1); + for(std::size_t j = 0; j < std::min(Nbest,scores_current.size()); ++j) { + ret.push_back(std::make_pair((scores_current[j].first+length_penalty)/(i+1), scores_current[j].second)); // normalized by length + seed[j] = scores_current[j]; // unnormalized; used to generate longer hypotheses + } + + } + + // compare n-grams of different lengths and return Nbest + std::sort(ret.begin(), ret.end(), std::greater > >()); + ret.resize(std::min(ret.size(),Nbest)); + + return ret; +} +