diff --git a/BackoffLmKen.cpp b/BackoffLmKen.cpp index b6d08cc..e899f88 100644 --- a/BackoffLmKen.cpp +++ b/BackoffLmKen.cpp @@ -103,6 +103,7 @@ int BackoffLmKen::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, nw++; } } + debug1(" parsing found %d words\n", nw); // end sentence with EOS ? if (eos) { @@ -111,6 +112,7 @@ int BackoffLmKen::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, } wid = &(wid_vect.front()); + debug4("* split sent with %d words into %d-grams (bos=%d, eos=%d):\n", nw, ken_ngram->Order(), map_ken2wid[ken_vocab->BeginSentence()], map_ken2wid[ken_vocab->EndSentence()]); return nw; } @@ -138,7 +140,9 @@ REAL BackoffLmKen::BoffLnPw(char **ctxt, char *w, int req_order) for (int i = 0; i < (req_order - 1); i++) { ken_ngram->Score(state, ken_vocab->Index(ctxt[i]), out_state); state = out_state; + debug2(" - context position ken=%d, ken_idx=%d\n", i, ken_vocab->Index(ctxt[i])); } + debug2(" - predict ken_idx=%d, log10P=%e\n", ken_vocab->Index(w), ken_ngram->Score(state, ken_vocab->Index(w), out_state)); // we need to convert from log_10 to ln return M_LN10 * ken_ngram->Score(state, ken_vocab->Index(w), out_state); @@ -168,7 +172,9 @@ REAL BackoffLmKen::BoffLnPid(REAL *ctxt, WordID predw, int req_order) for (int i = 0; i < (req_order - 1); i++) { ken_ngram->Score(state, map_cslm2ken[(WordID) ctxt[i]], out_state); state = out_state; + debug2(" - context position ken=%d, ken_idx=%d\n", i, map_cslm2ken[(WordID) ctxt[i]]); } + debug3(" - predict cslm_id=%d, ken_idx=%d, log10P=%e\n", predw, map_cslm2ken[predw], ken_ngram->Score(state, map_cslm2ken[predw], out_state)); // we need to convert from log_10 to ln return M_LN10 * ken_ngram->Score(state, map_cslm2ken[predw], out_state); @@ -198,7 +204,9 @@ REAL BackoffLmKen::BoffLnStd(WordID *ctxt, WordID predw, int req_order) for (int i = 0; i < (req_order - 1); i++) { ken_ngram->Score(state, ctxt[i], out_state); state = out_state; + debug2(" - context position ken=%d, ken_idx=%d\n", i, ctxt[i]); } + debug3(" - predict cslm_id=%d, ken_idx=%d, log10P=%e\n", predw, predw, ken_ngram->Score(state, predw, out_state)); // we need to convert from log_10 to ln return M_LN10 * ken_ngram->Score(state, predw, out_state); diff --git a/BackoffLmSri.cpp b/BackoffLmSri.cpp index ef810cb..813481d 100644 --- a/BackoffLmSri.cpp +++ b/BackoffLmSri.cpp @@ -121,6 +121,7 @@ BackoffLmSri::BackoffLmSri(char *p_fname, int p_max_order, const WordList &wlist WordList::const_iterator iter = wlist.Begin(), end = wlist.End(); for (size_t ci=0; iter!=end; iter++, ci++) { VocabIndex vi = sri_vocab->getIndex(iter->word); + //debug3("'%s' bin=%d -> sri=%d\n", iter->word, ci, vi); if (vi == Vocab_None) { fprintf(stderr,"word %s not found at pos %zu\n", iter->word, ci ); } @@ -156,6 +157,7 @@ int BackoffLmSri::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, strcpy(str,sentence.c_str()); // we need to copy since parseWords() modifies the string nw = sri_vocab->parseWords(str, vstr, max_words - 1); if (nw >= max_words-1) Error("too many words in one hypothesis\n"); + debug1(" parsing found %d words\n", nw); int b=0; // start sentence with BOS ? 
@@ -171,5 +173,6 @@ int BackoffLmSri::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, if (eos) wid_table[nw++]=sri_vocab->seIndex(); wid = wid_table; + debug4("* split sent with %d words into %d-grams (bos=%d, eos=%d):\n", nw, sri_order, sri_vocab->ssIndex(), sri_vocab->seIndex()); return nw; } diff --git a/BackoffLmSri.h b/BackoffLmSri.h index 37de379..a55542a 100644 --- a/BackoffLmSri.h +++ b/BackoffLmSri.h @@ -119,8 +119,10 @@ class BackoffLmSri : public BackoffLm { for (i=0; i sri=%d, sri_idx=%d word=%s\n", j, i, sri_context_idxs[i], sri_vocab->getWord(sri_context_idxs[i]) ); } sri_context_idxs[i]=Vocab_None; // terminate, this is needed to specify the length of the context + //debug3(" - predict cslm_id=%d, sri_idx=%d word=%s\n", predw, predw, sri_vocab->getWord(predw) ); #ifdef DEBUG printf(" - SRI %d-gram context: ",req_order); diff --git a/Blas.h b/Blas.h index ac4e404..a03dfdb 100644 --- a/Blas.h +++ b/Blas.h @@ -151,6 +151,7 @@ inline void call_gemv (REAL *dest, REAL *matrix, REAL *source, REAL *bias, // m x n + debug0("-mkl- call gemv\n"); #ifdef BLAS_CUDA COPY(dim_dest,bias,inc,dest,inc); // TODO: verify GEMV(trans, dim_dest, dim_src, fact, matrix, dim_dest, source, inc, fact, dest, inc); diff --git a/Data.cpp b/Data.cpp index 9ff85d7..cf71273 100644 --- a/Data.cpp +++ b/Data.cpp @@ -78,6 +78,7 @@ Data::Data(const char *p_fname, Data *other_data, bool use_class) sr_wl_idx(-1), tg_wl_idx(-1), current_df(0), idx(-1), mem_cdf(NULL), mem_inp(NULL), mem_trg(NULL), input(NULL), target(NULL), aux(NULL) { + debug0("* constructor Data\n"); ReadFile(other_data, use_class); } @@ -354,6 +355,7 @@ void Data::ReadFile(Data *other_data, bool use_class) Data::~Data() { + debug0("* destructor Data\n"); if (preload) { delete [] mem_cdf; delete [] mem_inp; @@ -454,12 +456,14 @@ void Data::Preload() int odim1=(*itf)[0]->GetOdim(); while (++n < maxn) { + debug1("getting example %d\n",idx); mem_cdf[idx] = cdf; bool ok=false; while (!ok) { // advance synchronously all factors until ok for (vector::iterator it = (*itf).begin(); it!=(*itf).end(); ++it) { + debug1(" next factor %ld\n", it-(*itf).begin()); if (! (*it)->Next()) (*it)->Rewind(); // TODO: deadlock if file empty } @@ -469,12 +473,14 @@ void Data::Preload() ok = (drand48() < (*itf)[0]->GetResamplCoef()); } + debug1(" %s\n", ok ? 
"keep" : "skip"); } // copy all factors sequentially in memory REAL *adr_inp=mem_inp+idx*idim; REAL *adr_trg=mem_trg+idx*odim; for (vector::iterator it = (*itf).begin(); it!=(*itf).end(); ++it) { + debug2(" load factor %ld to address %p\n", it-(*itf).begin(), adr_inp); memcpy(adr_inp, (*it)->input, idim1*sizeof(REAL)); adr_inp+=idim1; if (odim1 > 0) { @@ -494,6 +500,7 @@ void Data::Preload() REAL m=0, *mptr; for (e=0, mptr=mem_inp+i; e0) for (e=0, mptr=mem_inp+i; eRewind(); } idx = -1; + debug0("** Data::Rewind() done\n"); } /************************** diff --git a/DataAscii.cpp b/DataAscii.cpp index 1d4f530..36d965e 100644 --- a/DataAscii.cpp +++ b/DataAscii.cpp @@ -31,6 +31,7 @@ const char* DATA_FILE_ASCII="DataAscii"; DataAscii::DataAscii(char *p_prefix, ifstream &ifs, int p_aux_dim, const string& p_aux_ext, int p_nb_SentSc, const string& p_SentSc_ext,int p_betweenSentCtxt , DataAscii *prev_df) : DataFile::DataFile(p_prefix, ifs, p_aux_dim, p_aux_ext, p_nb_SentSc, p_SentSc_ext, p_betweenSentCtxt, prev_df) { + debug0("** constructor DataAscii\n"); char full_fname[max_word_len]=""; @@ -70,6 +71,7 @@ DataAscii::DataAscii(char *p_prefix, ifstream &ifs, int p_aux_dim, const string& DataAscii::~DataAscii() { + debug0("** destructor DataAscii\n"); dfs.close(); if (idim>0) delete [] input; if (odim>0) delete [] target_vect; @@ -82,6 +84,7 @@ DataAscii::~DataAscii() void DataAscii::Rewind() { + debug0("*** DataAscii::Rewind()\n"); dfs.seekg(0, dfs.beg); char buf[DATA_LINE_LEN]; dfs.getline(buf,DATA_LINE_LEN); @@ -95,6 +98,7 @@ void DataAscii::Rewind() bool DataAscii::Next() { +// debug0("*** DataAscii::Next() "); cout<tgt0; @@ -50,6 +51,7 @@ DataAsciiClass::DataAsciiClass(char *p_prefix, ifstream &ifs, int p_aux_dim, con bool DataAsciiClass::Next() { +// debug0("*** DataAsciiClass::Next() "); cout<> p_fname; @@ -69,6 +70,7 @@ DataFile::DataFile(char *p_path_prefix, char *p_fname, const float p_rcoeff) : idim(0), odim(0), auxdim(0), nbex(0), resampl_coeff(p_rcoeff), path_prefix(p_path_prefix), fname(NULL), idx(-1), input(NULL), target_vect(NULL), aux(NULL), target_id(0) { + debug0("** constructor DataFile with fname\n"); if (NULL != p_fname) fname = strdup(p_fname); @@ -78,6 +80,7 @@ DataFile::DataFile(char *p_path_prefix, char *p_fname, const float p_rcoeff) DataFile::~DataFile() { + debug0("** destructor DataFile\n"); if (fname) free(fname); if (aux_fs.is_open()) aux_fs.close(); @@ -153,6 +156,7 @@ int DataFile::Info(const char *txt) void DataFile::Rewind() { + debug0("*** DataFile::Rewind()\n"); Error("DataFile::Rewind() should be overriden"); } @@ -174,6 +178,7 @@ bool DataFile::Next() int DataFile::Resampl() { + //debug0("*** DataFile::Resampl()\n"); bool ok=false; while (!ok) { @@ -184,6 +189,7 @@ int DataFile::Resampl() //cout << " ok=" << ok << endl; } + //debug0("*** DataFile::Resampl() end\n"); return idx; } diff --git a/DataMnist.cpp b/DataMnist.cpp index 9e524fa..9f96eea 100644 --- a/DataMnist.cpp +++ b/DataMnist.cpp @@ -48,6 +48,7 @@ uint DataMnist::read_iswap(int fd) { // swap integer Big Endian -> little Endian ps[0]=pi[3]; ps[1]=pi[2]; ps[2]=pi[1]; ps[3]=pi[0]; + debug2("read=%4x, swap=%4x\n", i, s); return s; } @@ -59,6 +60,7 @@ uint DataMnist::read_iswap(int fd) { DataMnist::DataMnist(char *p_prefix, ifstream &ifs, int p_aux_dim, const string& p_aux_ext, int p_nb_SentSc, string& p_SentSc_ext,int p_betweenSentCtxt, DataMnist *prev_df) : DataFile::DataFile(p_prefix, ifs, p_aux_dim, p_aux_ext, p_nb_SentSc, p_SentSc_ext, p_betweenSentCtxt, prev_df) { + debug0("** constructor 
DataMnist\n"); char full_fname[max_word_len]=""; printf(" - %s: MNIST data ", fname); fflush(stdout); @@ -125,6 +127,7 @@ DataMnist::DataMnist(char *p_prefix, ifstream &ifs, int p_aux_dim, const string& DataMnist::~DataMnist() { + debug0("** destructor DataMnist\n"); close(dfd); close(lfd); if (idim>0) { delete [] input; delete [] ubuf; } @@ -139,6 +142,7 @@ DataMnist::~DataMnist() void DataMnist::Rewind() { + debug0("*** DataMnist::Rewind()\n"); lseek(dfd, 16, SEEK_SET); lseek(lfd, 8, SEEK_SET); if (aux_fs.is_open()) @@ -151,6 +155,7 @@ void DataMnist::Rewind() bool DataMnist::Next() { +// debug0("*** DataMnist::Next() "); cout<=odim) { ErrorN("example %lu has a target of %d, but we have only %d classes\n", idx+1, target_id, odim); diff --git a/DataNgramBin.cpp b/DataNgramBin.cpp index 4a31d09..d27a1c9 100644 --- a/DataNgramBin.cpp +++ b/DataNgramBin.cpp @@ -166,6 +166,7 @@ DataNgramBin::DataNgramBin(char *p_prefix, ifstream &ifs, int p_aux_dim, const s : DataFile::DataFile(p_prefix, ifs, p_aux_dim, p_aux_ext, p_nb_SentSc, p_SentSc_ext,p_betweenSentCtxt, prev_df), order(0), tgpos(0), eospos(0), mode(0), nbw(0), nbs(0), nbu(0), nbi(0) { + debug0("*** constructor DataNgramBin\n"); // DataNgramBin [] // parse addtl params -> @@ -197,6 +198,7 @@ DataNgramBin::DataNgramBin(char *p_fname, float p_rcoeff, int p_order) : DataFile::DataFile(NULL, p_fname, p_rcoeff), order(p_order), tgpos(p_order - 1), eospos(0), mode(3), nbw(0), nbs(0), nbu(0), nbi(0) { + debug0("*** constructor DataNgramBin with fname\n"); do_constructor_work(); // skip counting for efficieny reasons @@ -209,6 +211,7 @@ DataNgramBin::DataNgramBin(char *p_fname, float p_rcoeff, int p_order, int p_tgp : DataFile::DataFile(NULL, p_fname, p_rcoeff), order(p_order), tgpos(p_tgpos), mode(p_mode), nbw(0), nbs(0), nbu(0), nbi(0) { + debug0("*** constructor DataNgramBin with fname\n"); if (tgpos<0 || tgpos>=order) ErrorN("wrong value of target position: %d not in [0,%d]\n",tgpos,order-1); @@ -221,6 +224,7 @@ DataNgramBin::DataNgramBin(char *p_fname, float p_rcoeff, int p_order, int p_tgp DataNgramBin::~DataNgramBin() { + debug0("*** destructor DataNgramBin\n"); close(fd); if (idim>0) { @@ -238,6 +242,7 @@ DataNgramBin::~DataNgramBin() * */ bool DataNgramBin::Next() { + //debug0("*** DataNgramBin::Next() \n"); bool ok=false; string line_sc; int i; @@ -353,12 +358,14 @@ bool DataNgramBin::Next() void DataNgramBin::Rewind() { + debug0("*** DataNgramBin::Rewind()\n"); lseek(fd, header_len, SEEK_SET); if (aux_fs.is_open()) aux_fs.seekg(0, aux_fs.beg); if(SentSc_fs.is_open()) SentSc_fs.seekg(0, aux_fs.beg); idx=-1; + debug0("*** DataNgramBin::Rewind() done\n"); // initialize read buffer buf_n=0; buf_pos=-1; eospos = 0; diff --git a/DataPhraseBin.cpp b/DataPhraseBin.cpp index 6c9567d..a338c68 100644 --- a/DataPhraseBin.cpp +++ b/DataPhraseBin.cpp @@ -172,6 +172,7 @@ DataPhraseBin::DataPhraseBin(char *p_prefix, ifstream &ifs, int p_aux_dim, const owlist(NULL), onbphw(NULL), ocnbphw(NULL), nbi(0) { + debug0("*** constructor DataPhraseBin\n"); // DataPhraseBin [flags] // parse addtl params if (prev_df) { @@ -200,6 +201,7 @@ DataPhraseBin::DataPhraseBin(char *p_fname, float p_rcoeff, int p_src_phlen, int iwlist(NULL), inbphw(NULL), icnbphw(NULL), owlist(NULL), onbphw(NULL), ocnbphw(NULL) { + debug0("*** constructor DataPhraseBin with fname\n"); do_constructor_work(); // TODO: counting ? 
@@ -209,6 +211,7 @@ DataPhraseBin::DataPhraseBin(char *p_fname, float p_rcoeff, int p_src_phlen, int DataPhraseBin::~DataPhraseBin() { + debug0("*** destructor DataPhraseBin\n"); close(fd); if (idim>0) delete [] input; @@ -239,6 +242,7 @@ void DataPhraseBin::SetWordLists(WordList *p_iwlist, WordList *p_owlist) bool DataPhraseBin::Next() { + //debug0("*** DataPhraseBin::Next() \n"); bool ok=false; WordID buf[max_buf_len]; @@ -250,6 +254,7 @@ bool DataPhraseBin::Next() // read source phrase if (!ReadBuffered(&src_len, sizeof(src_len))) return false; + debug1("source read %d words:", src_len); if ((int) src_len>max_buf_len) Error("The source phrase is too long, you need to recompile the program\n"); if (!ReadBuffered((uchar*)buf, src_len*sizeof(WordID))) Error("DataPhraseBin::Next(): no source phrase left\n"); #ifdef DEBUG @@ -257,6 +262,7 @@ bool DataPhraseBin::Next() printf("\n"); #endif if ((int) src_len>src_phlen) { + debug0(" src too long -> flag to ignore\n"); nbi++; // ignore: too many source words ok=false; // won't be used, but we still need to read the target phrase to keep it in sync } @@ -269,6 +275,7 @@ bool DataPhraseBin::Next() // read target phrase if (!ReadBuffered(&tgt_len, sizeof(tgt_len))) return false; + debug1("target read %d words:", tgt_len); if ((int)tgt_len>max_buf_len) Error("The target phrase is too long, you need to recompile the program\n"); if (!ReadBuffered((uchar*)buf, tgt_len*sizeof(WordID))) Error("DataPhraseBin::Next(): no target phrase left\n"); #ifdef DEBUG @@ -276,6 +283,7 @@ bool DataPhraseBin::Next() printf("\n"); #endif if ((int)tgt_len > tgt_phlen) { + debug0(" tgt too long -> ignore\n"); nbi++; ok=false; continue; // ignore: too many target words } else { @@ -286,6 +294,7 @@ bool DataPhraseBin::Next() // decide wether the current phrase pair is valid in function of the flags if (!ok) { + debug0(" -> late ignore\n"); continue; } @@ -332,6 +341,7 @@ bool DataPhraseBin::Next() void DataPhraseBin::Rewind() { + debug0("*** DataPhraseBin::Rewind()\n"); lseek(fd, sizeof(int), SEEK_SET); // position on field max_phrase_len int mlen; read(fd, &mlen, sizeof(int)); // get max_phrase_len @@ -339,8 +349,10 @@ void DataPhraseBin::Rewind() lseek(fd, pos , SEEK_CUR); if (aux_fs.is_open()) aux_fs.seekg(0, aux_fs.beg); + debug2("DataPhraseBin::Rewind(): max_phase_len=%d, advance by %u bytes\n", mlen, pos); idx=-1; // initialize read buffer buf_n=0; buf_pos=-1; + debug0("*** DataPhraseBin::Rewind() done\n"); } diff --git a/DataPhraseBin.h b/DataPhraseBin.h index 02f04d3..9ca6b89 100644 --- a/DataPhraseBin.h +++ b/DataPhraseBin.h @@ -87,13 +87,16 @@ class DataPhraseBin : public DataFile #if 0 read(fd, data, cnt); #else + debug2("DataPhraseBin::ReadBuffered(%p,%lu)\n",data,cnt); for (size_t i=0; i=buf_n) { // read new block of data, we can get less than requested buf_n = read(fd, buf_bytes, DATA_FILE_PHRASE_BUF_SIZE); + debug1(" -put %d bytes into buffer\n", buf_n); if (buf_n<=0) return false; // no data left buf_pos=0; } + debug2(" - copy bytes from buf[%d] to target[%lu]\n", buf_pos,i); data[i]=buf_bytes[buf_pos]; } #endif diff --git a/ErrFct.cpp b/ErrFct.cpp index 70ffef1..eebcb3c 100644 --- a/ErrFct.cpp +++ b/ErrFct.cpp @@ -40,6 +40,7 @@ ErrFct::ErrFct (Mach &mach) #else grad = new REAL[dim*bsize]; #endif + debug4("*** ErrFct() constructor, mach=%p, allocated %dx%d for gradient at %p\n",(void*)&mach,bsize,dim,(void*)grad); } ErrFct::ErrFct (const ErrFct &efct) @@ -53,6 +54,7 @@ ErrFct::ErrFct (const ErrFct &efct) #else grad = new REAL[dim*bsize]; #endif + 
debug3("*** ErrFct() copy constructor, allocated %dx%d for gradient at %p\n",bsize,dim,(void*)grad); } //************************************************************************************** diff --git a/ErrFctSoftmClassCrossEntNgram.cpp b/ErrFctSoftmClassCrossEntNgram.cpp index 33213f4..e19cf78 100644 --- a/ErrFctSoftmClassCrossEntNgram.cpp +++ b/ErrFctSoftmClassCrossEntNgram.cpp @@ -29,6 +29,7 @@ ErrFctSoftmClassCrossEntNgram::ErrFctSoftmClassCrossEntNgram(Mach &mach) : ErrFct(mach), grad_class(NULL) { + debug0("*** ErrFctSoftmClassCrossEntNgram() constructor\n"); #ifdef BLAS_CUDA err = NULL; host_err = NULL; @@ -38,6 +39,7 @@ ErrFctSoftmClassCrossEntNgram::ErrFctSoftmClassCrossEntNgram(Mach &mach) ErrFctSoftmClassCrossEntNgram::ErrFctSoftmClassCrossEntNgram(const ErrFctSoftmClassCrossEntNgram &efct) : ErrFct(efct), grad_class(NULL) { + debug0("*** ErrFctSoftmCrossEntNgram() copy constructor\n"); #ifdef BLAS_CUDA err = NULL; host_err = NULL; @@ -277,10 +279,12 @@ REAL ErrFctSoftmClassCrossEntNgram::CalcWordClassError(int eff_bsize) if ((int) *tcptr != argmax) err_value++; + debug2("%d/%d, ", (int) *tcptr, argmax); ocptr += n_classes; tcptr++; } #endif + debug1("%d\n", err_value); return (REAL) err_value; } diff --git a/ErrFctSoftmCrossEntNgram.cpp b/ErrFctSoftmCrossEntNgram.cpp index 552edbe..83bc938 100644 --- a/ErrFctSoftmCrossEntNgram.cpp +++ b/ErrFctSoftmCrossEntNgram.cpp @@ -31,6 +31,7 @@ ErrFctSoftmCrossEntNgram::ErrFctSoftmCrossEntNgram(Mach &mach) : ErrFct(mach) { #ifdef BLAS_CUDA + debug0("*** ErrFctSoftmCrossEntNgram() constructor, allocate CUDA err variable\n"); Gpu::SetConfig(gpu_conf); err = Gpu::Alloc(1, "ErrFctSoftmCrossEntNgram: err variable"); #endif @@ -39,6 +40,7 @@ ErrFctSoftmCrossEntNgram::ErrFctSoftmCrossEntNgram(Mach &mach) ErrFctSoftmCrossEntNgram::ErrFctSoftmCrossEntNgram(const ErrFctSoftmCrossEntNgram &efct) : ErrFct(efct) { + debug0("*** ErrFctSoftmCrossEntNgram() copy constructor, allocate error\n"); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); err = Gpu::Alloc(1, "ErrFctSoftmCrossEntNgram: err variable"); @@ -189,6 +191,7 @@ REAL ErrFctSoftmCrossEntNgram::CalcValueNth(int idx) REAL ErrFctSoftmCrossEntNgram::CalcGrad(int eff_bsize) { + debug5("ErrFctSoftmCrossEntNgram::CalcGrad() eff_bsize=%d, dim=%d, output=%p, target=%p, grad=%p\n",eff_bsize,dim,(void*)output,(void*)target,(void*)grad); if (eff_bsize<=0) eff_bsize=bsize; #ifdef BLAS_CUDA @@ -212,6 +215,7 @@ REAL ErrFctSoftmCrossEntNgram::CalcGrad(int eff_bsize) for (int b=0; b output at %p is %f, update grad at %p\n",tidx,(void*)(optr+tidx),optr[tidx],(void*)(gptr+tidx)); gptr[tidx] += 1.0; err += safelog(optr[tidx]); gptr+=dim; optr+=dim; @@ -224,6 +228,7 @@ REAL ErrFctSoftmCrossEntNgram::CalcGrad(int eff_bsize) REAL ErrFctSoftmCrossEntNgram::CalcGradNull(int eff_bsize) { + debug5("ErrFctSoftmCrossEntNgram::CalcGradNULL() eff_bsize=%d, dim=%d, output=%p, target=%p, grad=%p\n",eff_bsize,dim,(void*)output,(void*)target,(void*)grad); if (eff_bsize<=0) eff_bsize=bsize; #ifdef BLAS_CUDA @@ -246,6 +251,7 @@ REAL ErrFctSoftmCrossEntNgram::CalcGradNull(int eff_bsize) SCAL(&n,&f1,grad,&inc1); for (int b=0; b output at %p is %f, update grad at %p\n",b,tidx,(void*)(optr+tidx),optr[tidx],(void*)(gptr+tidx)); if (tidx==NULL_WORD) { memset(gptr, 0, dim*sizeof(REAL)); } diff --git a/ErrFctSoftmCrossEntNgramMulti.cpp b/ErrFctSoftmCrossEntNgramMulti.cpp index 8047242..82ac5a0 100644 --- a/ErrFctSoftmCrossEntNgramMulti.cpp +++ b/ErrFctSoftmCrossEntNgramMulti.cpp @@ -39,6 +39,7 @@ 
ErrFctSoftmCrossEntNgramMulti::ErrFctSoftmCrossEntNgramMulti(Mach &mach, int n) if (mach.GetOdim()%nb != 0) Error("ErrFctSoftmCrossEntNgramMulti: output layer size is not an integer multiple"); dim = mach.GetOdim() / nb; + debug2("ErrFctSoftmCrossEntNgramMulti: %d n-grams of size %d\n", nb, dim); } @@ -67,12 +68,15 @@ REAL ErrFctSoftmCrossEntNgramMulti::CalcValue(int eff_bsize) for (int n=0; n err=%e\n", b, n, optr[tidx], err); memset(gptr, 0, dim*sizeof(REAL)); } else { gptr[tidx] += 1.0; err += log(optr[tidx]); + debug6("grad ngram-multi: b=%d, n=%d, tidx=%u, out=%f -> err=%e, grad@target=%e\n", b, n, tidx, optr[tidx], err, gptr[tidx]); } gptr+=dim; optr+=dim; } } + debug1("ErrFctSoftmCrossEntNgramMulti::CalcGrad err=%f\n", err); return err; #endif diff --git a/Hypo.cpp b/Hypo.cpp index 4ce23b6..76a75e0 100644 --- a/Hypo.cpp +++ b/Hypo.cpp @@ -58,6 +58,7 @@ void Hypo::Write(outputfilestream &outf) float Hypo::CalcGlobal(Weights &w) { + debug0("HYP: calc global\n"); uint sz=w.val.size(); if (sz global score %e\n", s); return s; } diff --git a/Mach.cpp b/Mach.cpp index 0e561d2..a12e7de 100644 --- a/Mach.cpp +++ b/Mach.cpp @@ -70,10 +70,13 @@ void Mach::do_alloc() { Gpu::Init(); + debug3("*** do_alloc CUDA Mach type %d: %dx%d\n",GetMType(),idim,odim); data_out = Gpu::Alloc(odim*bsize, "output data for a machine"); + debug1("*** - data_out=%p\n",(void*)data_out); data_in=NULL; // should be set later by SetDataIn() drop_out_rand = NULL; // will be allocated when calling SetDropOut() grad_in = Gpu::Alloc(idim*bsize, "input gradient for a machine"); + debug1("*** - grad_in=%p\n",(void*)grad_in); grad_out=NULL; // should be set later by SetGradOut() } @@ -84,6 +87,7 @@ void Mach::SetDropOut(const REAL v) { drop_out_rand = Gpu::Alloc(odim*bsize, "buffer for random values for drop-out"); } drop_out=v; + debug4("drop_out: %f in %p for %dx%d\n",drop_out,drop_out_rand,idim,odim); } #endif @@ -92,18 +96,21 @@ void Mach::SetDropOut(const REAL v) { #ifndef BLAS_CUDA void Mach::do_alloc() { + debug3("*** do_alloc Mach type %d: %dx%d\n",GetMType(),idim,odim); if (odim*bsize>0) { data_out=::new REAL[odim*bsize]; if (!data_out) Error ("can't allocate memory for data_out"); drop_out_rand = NULL; // will be allocated when calling SetDropOut() } else { data_out=drop_out_rand=NULL; } + debug1("*** - data_out=%p\n",(void*)data_out); data_in=NULL; // should be set later by SetDataIn() if (idim*bsize>0) { grad_in=::new REAL[idim*bsize]; if (!grad_in) Error ("can't allocate memory for grad_in"); } else grad_in=NULL; + debug1("*** - grad_in=%p\n",(void*)grad_in); grad_out=NULL; // (luint) this) should be set later by SetGradOut() } @@ -115,6 +122,7 @@ void Mach::SetDropOut(const REAL v) { if (!drop_out_rand) Error ("can't allocate memory for drop_out"); } drop_out=v; + debug4("drop_out: %f in %p for %dx%d\n",drop_out,drop_out_rand,idim,odim); } #endif @@ -122,6 +130,7 @@ void Mach::SetDropOut(const REAL v) { Mach::Mach(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw) : idim(p_idim), odim(p_odim), bsize(p_bsize), nb_forw(p_nbfw), nb_backw(p_nbbw), update(true), lr_coeff(1.0), drop_out(0.0), drop_out_rand(NULL) { + debug0("*** constructor Mach\n"); do_alloc(); #ifdef BLAS_CUDA gpu_conf = Gpu::GetConfig(); @@ -135,6 +144,7 @@ Mach::Mach(const int p_idim, const int p_odim, const int p_bsize, const ulong p_ Mach::Mach(const Mach &m, const int p_idim) { + debug0("*** copy constructor Mach\n"); if (p_idim > 0) idim = p_idim; else @@ -167,6 +177,7 @@ Mach::Mach(const Mach &m, 
const int p_idim) Mach::~Mach() { + debug1("*** destructor Mach %lx\n", (luint) this); #ifdef BLAS_CUDA if (data_out) cublasFree(data_out); if (drop_out_rand) cublasFree(drop_out_rand); @@ -184,12 +195,14 @@ Mach::~Mach() //----------------------------------------------- void Mach::WriteParams(ostream &of) { + debug0("*** write params of Mach\n"); // write machine specific params of.write((char*) &nb_forw, sizeof(ulong)); of.write((char*) &nb_backw, sizeof(ulong)); } void Mach::WriteData(ostream &of) { + debug0("*** writing data of general machine to file\n"); const int i=0, s=sizeof(REAL); of.write((char*) &i, sizeof(int)); of.write((char*) &s, sizeof(int)); @@ -197,6 +210,7 @@ void Mach::WriteData(ostream &of) { void Mach::Write(ostream &of) { + debug0("*** writing data of general machine to file\n"); char header[file_header_size]; for (int i=0; ibExternal==0) m->ReadData(inpf, s, bs); //read the data for the first MachTab + else{ + Error("The first MachTab should have its own data but is set to have external data\n"); + } + debug2("Storing address (%p) of machine %d\n",mt->GetTabAdr(),m); prSharedMachines[-1]=m; } else { m = prSharedMachines[-1]->Clone(); + debug1(" cloning MachTab, address = %p\n", mt->GetTabAdr()); //fprintf(stderr, " cloning MachTab, address = %p\n", mt->GetTabAdr()); } } @@ -389,6 +416,7 @@ void Mach::Info(bool detailed, char *txt) #endif tm.disp(", "); printf("\n"); + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } diff --git a/MachAvr.cpp b/MachAvr.cpp index 4af0dfe..5b17467 100644 --- a/MachAvr.cpp +++ b/MachAvr.cpp @@ -30,6 +30,7 @@ using namespace std; void MachAvr::do_alloc() { + debug2("do_alloc MachAvr %d x %d\n",idim,odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); if (data_out) cublasFree(data_out); @@ -40,6 +41,8 @@ void MachAvr::do_alloc() winner = Gpu::Alloc(odim*bsize, "winner of multi-average machine"); grad_in = Gpu::Alloc(idim*bsize, "input gradient of multi-average machine"); + debug2(" - CUDA data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - CUDA grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #else if (data_out) delete [] data_out; if (winner) delete [] winner; @@ -47,6 +50,8 @@ void MachAvr::do_alloc() data_out = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; winner = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; grad_in = (idim*bsize>0) ? 
new REAL[idim*bsize] : NULL; + debug2(" - data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #endif } @@ -58,6 +63,7 @@ void MachAvr::do_alloc() MachAvr::MachAvr() : MachCombined() { + debug0("*** constructor MachAvr\n"); } /* @@ -68,6 +74,7 @@ MachAvr::MachAvr() MachAvr::MachAvr(const MachAvr &m) : MachCombined(m) { + debug0("*** copy constructor MachAvr\n"); } /* @@ -76,6 +83,7 @@ MachAvr::MachAvr(const MachAvr &m) MachAvr::~MachAvr() { + debug1("*** destructor MachAvr %lx\n", (luint) this); // data_out and grad_in will be deleted by the desctuctor of Mach } @@ -117,6 +125,7 @@ void MachAvr::SetGradOut(REAL *data) void MachAvr::MachAdd(Mach *new_mach) { if (machs.empty()) { + debug0("*** add first element to MachAvr\n"); machs.push_back(new_mach); // think about freeing memory idim=new_mach->GetIdim(); @@ -126,6 +135,7 @@ void MachAvr::MachAdd(Mach *new_mach) do_alloc(); } else { + debug0("*** add new element to MachAvr\n"); if (new_mach->GetIdim() != idim) ErrorN("input dimension of new average machine does not match (%d), should be %d",new_mach->GetIdim(),idim); if (new_mach->GetOdim() != idim) @@ -176,6 +186,7 @@ Mach *MachAvr::MachDel() void MachAvr::ReadData(istream &inpf, size_t s, int bs) { + debug0("*** read data of MachAvr\n"); MachCombined::ReadData(inpf, s, bs); idim = machs[0]->GetIdim(); @@ -207,6 +218,7 @@ void MachAvr::Info(bool detailed, char *txt) tm.disp(", "); tbackw.disp(" + back: "); printf("\n"); + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); char ntxt[512]; sprintf(ntxt,"%s ", txt); for (unsigned int i=0; iInfo(detailed, ntxt); @@ -220,6 +232,7 @@ void MachAvr::Info(bool detailed, char *txt) void MachAvr::Forw(int eff_bsize, bool in_train) { + debug2("* MachAvr::Forw: %p -> %p\n", (void*) data_in, (void*) data_out); if (machs.empty()) Error("called Forw() for an empty multiple average machine"); @@ -255,6 +268,7 @@ void MachAvr::Forw(int eff_bsize, bool in_train) void MachAvr::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("* MachAvr::Backw: %p <- %p\n", (void*) grad_in, (void*) grad_out); if (machs.empty()) Error("called Backw() for an empty average machine"); diff --git a/MachCombined.cpp b/MachCombined.cpp index 338481f..6091b56 100644 --- a/MachCombined.cpp +++ b/MachCombined.cpp @@ -30,6 +30,7 @@ using namespace std; void MachCombined::do_alloc() { + debug2("do_alloc MachCombined %d x %d\n",idim,odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); if (data_out) cublasFree(data_out); @@ -40,6 +41,8 @@ void MachCombined::do_alloc() winner = Gpu::Alloc(odim*bsize, "winner of a combined machine"); grad_in = Gpu::Alloc(idim*bsize, "input gradient of a combined machine"); + debug2(" - CUDA data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - CUDA grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #else if (data_out) delete [] data_out; if (winner) delete [] winner; @@ -47,6 +50,8 @@ void MachCombined::do_alloc() data_out = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; winner = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; grad_in = (idim*bsize>0) ? 
new REAL[idim*bsize] : NULL; + debug2(" - data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #endif } @@ -58,6 +63,7 @@ void MachCombined::do_alloc() MachCombined::MachCombined() : MachMulti(), winner(NULL) { + debug0("*** constructor MachCombined\n"); } /* @@ -68,6 +74,7 @@ MachCombined::MachCombined() MachCombined::MachCombined(const MachCombined &m) : MachMulti(m), winner(NULL) { + debug0("*** copy constructor MachCombined\n"); } /* @@ -76,6 +83,7 @@ MachCombined::MachCombined(const MachCombined &m) MachCombined::~MachCombined() { + debug1("*** destructor MachCombined %lx\n", (luint) this); // data_out and grad_in will be deleted by the desctuctor of Mach } @@ -117,6 +125,7 @@ void MachCombined::SetGradOut(REAL *data) void MachCombined::MachAdd(Mach *new_mach) { if (machs.empty()) { + debug0("*** add first element to MachCombined\n"); machs.push_back(new_mach); // think about freeing memory idim=new_mach->GetIdim(); @@ -126,6 +135,7 @@ void MachCombined::MachAdd(Mach *new_mach) do_alloc(); } else { + debug0("*** add new element to MachCombined\n"); if (new_mach->GetIdim() != idim) ErrorN("input dimension of new combined machine does not match (%d), should be %d",new_mach->GetIdim(),idim); if (new_mach->GetOdim() != idim) @@ -176,6 +186,7 @@ Mach *MachCombined::MachDel() void MachCombined::ReadData(istream &inpf, size_t s, int bs) { + debug0("*** read data of MachCombined\n"); MachMulti::ReadData(inpf, s, bs); idim = machs[0]->GetIdim(); @@ -207,6 +218,7 @@ void MachCombined::Info(bool detailed, char *txt) tm.disp(", "); tbackw.disp(" + back: "); printf("\n"); + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); char ntxt[512]; sprintf(ntxt,"%s ", txt); for (unsigned int i=0; iInfo(detailed, ntxt); @@ -220,6 +232,7 @@ void MachCombined::Info(bool detailed, char *txt) void MachCombined::Forw(int eff_bsize, bool in_train) { + debug2("* MachCombined::Forw: %p -> %p\n", (void*) data_in, (void*) data_out); if (machs.empty()) Error("called Forw() for an empty multiple combined machine"); @@ -237,6 +250,7 @@ void MachCombined::Forw(int eff_bsize, bool in_train) void MachCombined::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("* MachCombined::Backw: %p <- %p\n", (void*) grad_in, (void*) grad_out); if (machs.empty()) Error("called Backw() for an empty combined machine"); diff --git a/MachConfig.cpp b/MachConfig.cpp index 2083f1f..83b9bab 100644 --- a/MachConfig.cpp +++ b/MachConfig.cpp @@ -180,7 +180,6 @@ MachConfig::MachConfig (bool bNeedConfFile, REAL rInitBias) : ("nb-backward" , bpo::value ()->default_value(0), "backward number") ("update" , bpo::value(), "update parameters during backward (default true)") ("lrate-coeff" , bpo::value(), "layer specific coefficient of the learning rate (default 1.0)") - ("share-id" , bpo::value ()->default_value(-1), "All machines sharing the same share-id will share their weights (default is all machines share their weights)") ; // machine options for all machine types (including multiple machines) @@ -210,6 +209,7 @@ MachConfig::MachConfig (bool bNeedConfFile, REAL rInitBias) : ("clip-weights" , bpo::value(), "value for clipping weights (used by default with general value)") ("clip-gradients-weights",bpo::value(), "value for clipping gradients on weights (used by default with general value)") ("clip-gradients-bias" , bpo::value(), "value for clipping 
gradients on biases (used by default with general value)") + ("share-id" , bpo::value ()->default_value(-1), "All machines sharing the same share-id will share their weights (default is all machines share their weights)") ; this->odMachLinConf.add(this->odMachineConf); @@ -217,6 +217,7 @@ MachConfig::MachConfig (bool bNeedConfFile, REAL rInitBias) : this->odMachTabConf.add_options() ("const-init-project" , bpo::value(), "constant value for initialization of the projection layer") ("random-init-project" , bpo::value(), "value for random initialization of the projection layer (method used by default with general value)") + ("share-id" , bpo::value ()->default_value(-1), "All machines sharing the same share-id will share their weights (default is all machines share their weights)") ; this->odMachTabConf.add(this->odMachineConf); @@ -927,7 +928,7 @@ Mach *MachConfig::read_simple_machine (int iMachType, int iBlockSize, bool bMach if(iShareId != -1 && prSharedMachines[iShareId] != NULL) { //TODO: should we check the machine type also? if(prSharedMachines[iShareId]->GetMType() != iMachType){ - cerr << "WARNING: machines sharing weights have not the same type, check the config file!" << endl; + Error("WARNING: machines sharing weights have not the same type, check the config file!"); } if(iMachType == file_header_mtype_tab){ if (prSharedMachines[iShareId]->GetIdim()!=1 || iOutputDim != prSharedMachines[iShareId]->GetOdim()){ @@ -939,7 +940,7 @@ Mach *MachConfig::read_simple_machine (int iMachType, int iBlockSize, bool bMach cerr << "mach[" << iShareId << "]->odim=" << prSharedMachines[iShareId]->GetOdim() << " odim=" << iOutputDim << endl; Error("Machines sharing weights have not the same input/output size, check the config file!"); } - cout << "Cloning previous machine with share-id " << iShareId << endl; + //cout << "Cloning previous machine with share-id " << iShareId << endl; pNewMach = prSharedMachines[iShareId]->Clone(); if(iMachType == file_header_mtype_lin) pMachLin = (MachLin*) pNewMach; else if(iMachType == file_header_mtype_tab) pMachTab = (MachTab*) pNewMach; @@ -952,8 +953,8 @@ Mach *MachConfig::read_simple_machine (int iMachType, int iBlockSize, bool bMach } pNewMach = pMachTab = ((MachTab*)prSharedMachines[iShareId])->Clone(); } else { - if(iShareId==-1) cout << "Creating new machine with no share-id" << endl; - else cout << "Creating new machine with share-id " << iShareId << endl; + //if(iShareId==-1) cout << "Creating new machine with no share-id" << endl; + //else cout << "Creating new machine with share-id " << iShareId << endl; switch (iMachType) { case file_header_mtype_base: pNewMach = new Mach(iInputDim, iOutputDim, iCurBlockSize, iNbForward, iNbBackward); diff --git a/MachCopy.cpp b/MachCopy.cpp index 54c3675..cb2bb16 100644 --- a/MachCopy.cpp +++ b/MachCopy.cpp @@ -31,7 +31,9 @@ MachCopy::MachCopy(const int p_idim, const int p_odim, const int p_bsize, const : Mach(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw) { #ifdef BLAS_CUDA + debug3("*** CUDA constructor MachCopy %d x %d on GPU %d\n", idim,odim,Gpu::GetCudaDevice(Gpu::GetDevice(gpu_conf))); #else + debug2("*** constructor MachCopy %d x %d\n", idim,odim); if (odim != idim) { Error ("The input size should be equal the output size for copy machine"); @@ -42,6 +44,7 @@ MachCopy::MachCopy(const int p_idim, const int p_odim, const int p_bsize, const MachCopy::MachCopy(const MachCopy &m) : Mach(m) { + debug0("*** copy constructor MachCopy\n"); } /******************************************* @@ -63,7 +66,9 @@ void 
MachCopy::Info(bool detailed, char *txt) tm.newline(); #ifdef BLAS_CUDA + debug5("*** %s cuda data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); #else + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); #endif } } @@ -75,6 +80,7 @@ void MachCopy::Info(bool detailed, char *txt) void MachCopy::ReadData(istream &inpf, size_t s, int bs) { + debug0("*** read data of MachCopy\n"); if (0 != s) ErrorN("data block of copy machine has %zu elements (0 were expected)", s); Mach::ReadData(inpf, 0, bs); @@ -87,6 +93,7 @@ void MachCopy::ReadData(istream &inpf, size_t s, int bs) void MachCopy::Forw(int eff_bsize, bool in_train) { + debug1("*** MachCopy Forw %p\n", (void*)this); tm.start(); @@ -98,6 +105,7 @@ void MachCopy::Forw(int eff_bsize, bool in_train) #ifdef BLAS_CUDA Gpu::MemcpyAsync(data_out, data_in, eff_bsize * odim * sizeof(REAL), cudaMemcpyDeviceToDevice); + debug4("*** CUDA: MachCopy::Forw %p[%d] -> %p[%d] \n",data_in,idim,data_out,odim); #else memcpy(data_out, data_in, eff_bsize * odim * sizeof(REAL)); #endif @@ -110,6 +118,7 @@ void MachCopy::Forw(int eff_bsize, bool in_train) void MachCopy::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("*** MachCopy Backw %p <- %p\n",(void*)grad_in,(void*)grad_out); if (eff_bsize<=0) eff_bsize=bsize; if (!grad_out) Error("MachCopy::Backw(): output gradient is not set"); diff --git a/MachJoin.cpp b/MachJoin.cpp index 3490b44..5e4562c 100644 --- a/MachJoin.cpp +++ b/MachJoin.cpp @@ -35,14 +35,17 @@ using namespace std; */ void MachJoin::do_alloc(bool alloc_data_out) { + debug2("do_alloc MachJoin %d x %d\n",idim,odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); if (alloc_data_out) { if (data_out) cublasFree(data_out); data_out = Gpu::Alloc(odim*bsize, "output data of join machine"); + debug1("ALLOCATE output data [%d] of first machine in MachJoin\n",odim); } if (grad_in) cublasFree(grad_in); grad_in = Gpu::Alloc(idim*bsize, "input gradient of join machine"); + debug2(" - CUDA grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); if (NULL == gpu_dev_data_out) gpu_dev_data_out = Gpu::Alloc(odim*bsize*sizeof(REAL), "MachJoin::Forw tmp for AXPY"); @@ -73,14 +76,17 @@ void MachJoin::do_alloc(bool alloc_data_out) if (alloc_data_out) { if (data_out) delete [] data_out; data_out = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; + debug1("ALLOCATE output data [%d] of first machine in MachJoin\n",odim); // Allocate a buffer that will contain the output gradient passed to // each sub-machine. This is needed because the sub-machine's call // to Backw() can destroy the content of their grad_out buffer, // so we have to pass a copy. grad_out_copy = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; + debug1("ALLOCATE buffer for a copy of output grad [%d] in MachJoin\n",odim); } if (grad_in) delete [] grad_in; grad_in = (idim*bsize>0) ? 
new REAL[idim*bsize] : NULL; + debug2(" - grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #endif } @@ -124,6 +130,7 @@ void MachJoin::do_delete() MachJoin::MachJoin() : MachMulti() { + debug0("** constructor MachJoin\n"); #ifdef BLAS_CUDA gpu_dev_data_out = NULL; sub_input_tmp = NULL; @@ -133,6 +140,7 @@ MachJoin::MachJoin() MachJoin::MachJoin(const MachJoin &m) : MachMulti(m) { + debug0("** copy constructor MachJoin\n"); #ifdef BLAS_CUDA gpu_dev_data_out = NULL; sub_input_tmp = NULL; @@ -141,6 +149,7 @@ MachJoin::MachJoin(const MachJoin &m) MachJoin::~MachJoin() { + debug0("** destructor MachJoin\n"); do_delete(); } @@ -155,6 +164,7 @@ MachJoin *MachJoin::Clone() void MachJoin::MachAdd(Mach *new_mach) { if (machs.empty()) { + debug0("** add first element to join machine\n"); machs.push_back(new_mach); // think about freeing memory idim=new_mach->GetIdim(); @@ -165,6 +175,7 @@ void MachJoin::MachAdd(Mach *new_mach) grad_out = NULL; } else { + debug0("** add new element to join machine\n"); if (bsize!=new_mach->GetBsize()) Error("bunch size of new join machine does not match"); if (odim!=new_mach->GetOdim()) @@ -268,6 +279,7 @@ void MachJoin::SetGradOut(REAL *data) void MachJoin::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data of MachJoin\n"); #ifdef BLAS_CUDA if (s!=machs.size()) ErrorN("data block of join machine has %zu machines (%zu were expected)", s, machs.size()); @@ -321,6 +333,7 @@ void MachJoin::Info(bool detailed, char *txt) printf("%sJoin machine %d-%d, bs=%d, passes=%lu/%lu", txt, idim, odim, bsize, nb_forw, nb_backw); tm.disp(", "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); char ntxt[512]; sprintf(ntxt,"%s ", txt); for (unsigned int i=0; iInfo(detailed, ntxt); @@ -331,6 +344,7 @@ void MachJoin::Info(bool detailed, char *txt) // forward pass for all machines and average output into cumulated output void MachJoin::Forw(int eff_bsize, bool in_train) { + debug4("** MachJoin::Forw: %p[%d] -> %p[%d]\n",(void*)data_in,idim,(void*)data_out,odim); if (machs.empty()) Error("called Forw() for an empty join machine"); @@ -362,11 +376,13 @@ void MachJoin::Forw(int eff_bsize, bool in_train) // sub-machines, which is contiguous and already allocated (see MachAdd). REAL *iptr=data_in; + debug2("MachJoin::Forw: copying input into individual machines input buffers - iptr=%p, idim=%d\n", iptr, idim); #ifdef BLAS_CUDA Gpu::StreamSynchronize(); #endif for (unsigned int m=0; mGetIdim(); + debug3(" machine: %d, ptr=%p, m_idim=%d\n", m, machs[m]->GetDataIn(), m_idim); if (activ_forw[m]) { #ifdef BLAS_CUDA // Use Gpu::Memcpy2DAsync, which does strided copies in just one call @@ -398,6 +414,7 @@ void MachJoin::Forw(int eff_bsize, bool in_train) Gpu::CheckError("MachJoin::Forw after sub-mach->Forw()"); } else { + debug1(" MachJoin[%d]: forw deactivated\n",m); } } // Transfer everything to master GPU and accumulate in data_out @@ -439,6 +456,7 @@ void MachJoin::Forw(int eff_bsize, bool in_train) AXPY(&size, &normf, machs[m]->GetDataOut(), &inc1, data_out, &inc1); } else { + debug1(" MachJoin[%d]: forw deactivated\n",m); } } #endif @@ -455,6 +473,7 @@ void MachJoin::Forw(int eff_bsize, bool in_train) } nb_forw += eff_bsize; + debug0("MachJoin::Forw: done\n"); tm.stop(); debugMachOutp("MachJoin",data_out,idim,odim,eff_bsize); @@ -468,18 +487,21 @@ void MachJoin::Forw(int eff_bsize, bool in_train) // above the input. 
void MachJoin::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug4("** MachJoin::Backw: %p[%d] <- %p[%d]\n",(void*)grad_in,idim,(void*)grad_out,odim); if (machs.empty()) Error("called Backw() for an empty join machine"); if (eff_bsize<=0) eff_bsize=bsize; tm.start(); + debug4("** MachJoin::Backw: %p[%d] <- %p[%d]\n", (void*) grad_in, idim, (void*) grad_out, odim); #ifdef BLAS_CUDA // copy grad_out to each submachine's local buffer first Gpu::StreamSynchronize(); for (unsigned int m=0; mGetGradOut()); Gpu::SetConfig(machs[m]->GetGpuConfig()); Gpu::MemcpyAsync(machs[m]->GetGradOut(), grad_out, odim*eff_bsize*sizeof(REAL), cudaMemcpyDeviceToDevice); @@ -512,6 +534,7 @@ void MachJoin::Backw(const float lrate, const float wdecay, int eff_bsize) #endif } else { + debug1(" MachJoin[%d]: backw deactivated\n",m); } } #ifdef BLAS_CUDA diff --git a/MachLin.cpp b/MachLin.cpp index d8e6883..04d0466 100644 --- a/MachLin.cpp +++ b/MachLin.cpp @@ -33,21 +33,28 @@ using namespace std; void MachLin::do_alloc() { + debug0("do_alloc MachLin\n"); if(!bExternal){ #ifdef BLAS_CUDA + debug3("*** CUDA do_alloc MachLin %d x %d on GPU %d\n", idim,odim,Gpu::GetCudaDevice(Gpu::GetDevice(gpu_conf))); b = Gpu::Alloc(odim, "bias of linear machine"); w = Gpu::Alloc(idim*odim, "weights of linear machine"); + debug1("*** bias=%p\n",b); + debug1("*** weights=%p\n",w); #else + debug2("*** constructor MachLin %d x %d\n", idim,odim); if (odim>0) { b = new REAL[odim]; if (!b) Error ("can't allocate memory for bias of linear machine"); } else b=NULL; + debug1("*** bias=%p\n",b); if (idim*odim>0) { w = new REAL[idim*odim]; if (!w) Error ("can't allocate memory for weights of linear machine"); } else w=NULL; + debug1("*** weights=%p\n",w); #endif } } @@ -56,6 +63,7 @@ MachLin::MachLin(const int p_idim, const int p_odim, const int p_bsize, const ul : Mach(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw), Shareable(xdata, shareid), bw_shared(NULL), bw_mutex(NULL) { #ifdef BLAS_CUDA + debug3("*** CUDA constructor MachLin %d x %d on GPU %d\n", idim,odim,Gpu::GetCudaDevice(Gpu::GetDevice(gpu_conf))); #endif do_alloc(); // initialize clipping @@ -76,6 +84,7 @@ MachLin::MachLin(const int p_idim, const int p_odim, const int p_bsize, const ul MachLin::MachLin(const MachLin &m) : Mach(m), Shareable(true, -1), b(NULL), w(NULL), bw_shared(NULL), bw_mutex(NULL) { + debug0("*** copy constructor MachLin\n"); iShareId = m.iShareId; int inc_bw_shared = 0; if (m.bw_mutex != NULL) { @@ -102,6 +111,7 @@ MachLin::MachLin(const MachLin &m) MachLin::~MachLin() { + debug1("*** destructor MachLin %lx\n", (luint) this); #ifdef BLAS_CUDA #else @@ -122,6 +132,7 @@ MachLin::~MachLin() pthread_mutex_lock(bw_mutex); if (bw_shared != NULL) { if ((*bw_shared) > 0) { + debug2("*** cloned -> not freeing w %p and b %p\n", w, b); (*bw_shared)--; pthread_mutex_unlock(bw_mutex); return; @@ -154,6 +165,7 @@ MachLin::~MachLin() void MachLin::BiasConst(const REAL val) { + debug2("MachLin::BiasRandom: %d =%f\n",odim,val); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); nppsSet_32f(val, b, odim); @@ -165,6 +177,7 @@ void MachLin::BiasConst(const REAL val) void MachLin::BiasRandom(const REAL range) { REAL c=range*2.0; + debug3("MachLin::BiasRandom: %d r=%f -> +- %f\n",odim,range,c/2.0); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); #ifdef CURAND @@ -185,6 +198,7 @@ void MachLin::BiasRandom(const REAL range) void MachLin::WeightsConst(const REAL val) { + debug3("MachLin::WeightsConst: %dx%d =%f\n",idim,odim,val); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); nppsSet_32f(val, 
w, idim*odim); @@ -195,6 +209,7 @@ void MachLin::WeightsConst(const REAL val) void MachLin::WeightsID(const REAL scale) { + debug3("MachLin::WeightsID: %dx%d =%f\n",idim,odim,scale); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); REAL * tmp = new REAL[idim * odim]; @@ -217,6 +232,7 @@ void MachLin::WeightsID(const REAL scale) void MachLin::WeightsRandom(const REAL range) { REAL c=range*2.0; + debug4("MachLin::WeightsRandom: %dx%d r=%f -> +- %f\n",idim,odim,range,c/2.0); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); #ifdef CURAND @@ -238,6 +254,7 @@ void MachLin::WeightsRandom(const REAL range) void MachLin::WeightsRandomFanI(const REAL range) { REAL c=2.0*range/sqrt((REAL) idim); + debug4("MachLin::WeightsRandomFanI: %dx%d r=%f -> +- %f\n",idim,odim,range,c/2.0); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); #ifdef CURAND @@ -260,6 +277,7 @@ void MachLin::WeightsRandomFanI(const REAL range) void MachLin::WeightsRandomFanIO(const REAL range) { REAL c=2.0*range/sqrt((REAL) (idim+odim)); + debug4("MachLin::WeightsRandomFanIO: %dx%d r=%f -> +- %f\n",idim,odim,range,c/2.0); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); #ifdef CURAND @@ -299,7 +317,9 @@ void MachLin::Info(bool detailed, char *txt) printf(", weights=%p, bias=%p", w, b); //DEBUG tm.newline(); #ifdef BLAS_CUDA + debug5("*** %s cuda data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); #else + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); #endif } } @@ -335,6 +355,7 @@ bool MachLin::CopyParams(Mach* mach) void MachLin::WriteParams(ostream &of) { + debug0("* write params of type MachLin\n"); Mach::WriteParams(of); if(Mach::fileid >= file_header_version4) { //fprintf(stderr, "MachLin::WriteParams - bExternal=%d iShareId=%d\n", (int) bExternal, iShareId); @@ -346,6 +367,7 @@ void MachLin::WriteParams(ostream &of) void MachLin::WriteData(ostream &outf) { int i=0, s=sizeof(REAL); if (bExternal) { + debug0("* MachLin with external address to file\n"); //fprintf(stderr, " MachLin with external address to file share-id=%d\n", iShareId); outf.write((char*) &i, sizeof(int)); outf.write((char*) &s, sizeof(int)); @@ -371,6 +393,7 @@ void MachLin::WriteData(ostream &outf) { outf.write((char*)local_mem,odim*sizeof(REAL)); delete [] local_mem; #else + debug0("*** writing data of linear machine to file\n"); outf.write((char*) w,odim*idim*sizeof(REAL)); outf.write((char*) b,odim*sizeof(REAL)); #endif @@ -383,14 +406,17 @@ void MachLin::WriteData(ostream &outf) { void MachLin::ReadParams(istream &inpf, bool with_alloc) { + debug0("* read params of type MachLin\n"); Mach::ReadParams(inpf, false); //This should be done for file_version 3 or greater ! 
if(Mach::fileid >= file_header_version4){ inpf.read((char*) &bExternal, sizeof(int)); + debug1(" - bExternal=%d\n", (int) bExternal); // fprintf(stderr, " - bExternal=%d", (int) bExternal); inpf.read((char*) &iShareId, sizeof(int)); + debug1(" - share-id=%d\n", (int) iShareId); // fprintf(stderr, " - share-id=%d\n", (int) iShareId); } //fprintf(stderr, "\n"); @@ -400,6 +426,7 @@ void MachLin::ReadParams(istream &inpf, bool with_alloc) void MachLin::ReadData(istream &inpf, size_t s, int bs) { size_t se=odim*idim + odim; + debug0("*** read data of MachLin\n"); if (bExternal) { if (s>0) { @@ -421,6 +448,7 @@ void MachLin::ReadData(istream &inpf, size_t s, int bs) inpf.read((char*)local_mem,odim*idim*sizeof(REAL)); for (int i=0;i times into result matrix + debug5("*** CUDA: MachLin::Forw %p[%d] -> %p[%d] on GPU %d\n",data_in,idim,data_out,odim,Gpu::GetCudaDevice(Gpu::GetDevice(gpu_conf))); Gpu::CopyVectorToMatrix(data_out, b, eff_bsize, odim); call_gemm(data_out, w, data_in, 1.0, odim, eff_bsize, idim); #else @@ -495,6 +526,7 @@ void MachLin::Forw(int eff_bsize, bool in_train) void MachLin::ForwDropout(int eff_bsize, bool in_train) { + debug0("*** MachLin ForwDropout"); if (drop_out<=0) return; @@ -551,6 +583,7 @@ void MachLin::ForwDropout(int eff_bsize, bool in_train) void MachLin::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("*** MachLin Backw %p <- %p\n",(void*)grad_in,(void*)grad_out); static REAL real1=1.0, real0=0.0; static char transN='N', transT='T'; REAL lrate_bs = lr_coeff * lrate / sqrt(GetBsize()); // scale by block size ! diff --git a/MachLinRectif.cpp b/MachLinRectif.cpp index cb7258c..d376277 100644 --- a/MachLinRectif.cpp +++ b/MachLinRectif.cpp @@ -35,15 +35,18 @@ using namespace std; MachLinRectif::MachLinRectif(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw, const int shareid, const bool xdata) : MachLin(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw, shareid, xdata) { + debug0("** constructor MachLinRectif\n"); } MachLinRectif::MachLinRectif(const MachLinRectif &m) : MachLin(m) { + debug0("** copy constructor MachLinRectif\n"); } MachLinRectif::~MachLinRectif() { + debug1("** destructor MachLinRectif %lx\n",(luint) this); } @@ -69,6 +72,7 @@ void MachLinRectif::Info(bool detailed, char *txt) tm.disp(", "); tmh.disp(" + recif: "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -78,6 +82,7 @@ void MachLinRectif::Info(bool detailed, char *txt) void MachLinRectif::Forw(int eff_bsize, bool in_train) { + debug2("*** MachLinRectif Forw %p -> %p\n",(void*)data_in,(void*)data_out); if (eff_bsize<=0) eff_bsize=bsize; MachLin::Forw(eff_bsize,in_train); @@ -102,6 +107,7 @@ void MachLinRectif::Forw(int eff_bsize, bool in_train) void MachLinRectif::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("*** MachLinRectif Backw %p <- %p\n",(void*)grad_in,(void*)grad_out); // derivate tanh activation function // multiply grad_hidden by derivatives of hidden layer activities (tanh) // grad_out = grad_out .* f'(data_out) diff --git a/MachMulti.cpp b/MachMulti.cpp index 794a509..730dfd6 100644 --- a/MachMulti.cpp +++ b/MachMulti.cpp @@ -33,12 +33,14 @@ using namespace std; MachMulti::MachMulti() : Mach(0,0,0) { + debug0("** constructor MachMulti\n"); machs.clear(); } MachMulti::MachMulti(const MachMulti &m) : Mach(m) { + debug0("** copy constructor MachMulti\n"); machs.clear(); } @@ -48,6 +50,7 @@ 
MachMulti::MachMulti(const MachMulti &m) MachMulti *MachMulti::Clone() { + debug1("** MachMulti::Clone %p\n", this); MachMulti *m = new MachMulti(*this); if (m != NULL) m->CloneSubmachs(*this); @@ -56,7 +59,9 @@ MachMulti *MachMulti::Clone() void MachMulti::CloneSubmachs(const MachMulti &mm) { + debug1("** MachMulti::CloneSubmachs %p\n", &mm); for (unsigned int m=0; mMachAdd( mm.machs[m]->Clone() ); if (!activ_forw.empty()) activ_forw.back() = mm.activ_forw[m]; @@ -71,6 +76,7 @@ void MachMulti::CloneSubmachs(const MachMulti &mm) MachMulti::~MachMulti() { + debug1("** destructor MachMulti %lx\n", (luint) this); MachMulti::Delete(); machs.clear(); } @@ -113,12 +119,14 @@ ulong MachMulti::GetNbParams() { void MachMulti::WriteParams(ostream &of) { + debug0("* write params of MachMulti\n"); Mach::WriteParams(of); int nbm=machs.size(); of.write((char*) &nbm, sizeof(int)); } void MachMulti::WriteData(ostream &outf) { + debug0("* writing data of multiple machine to file\n"); int nbm=machs.size(), s=sizeof(REAL); outf.write((char*) &nbm, sizeof(int)); outf.write((char*) &s, sizeof(int)); @@ -133,6 +141,7 @@ void MachMulti::WriteData(ostream &outf) { void MachMulti::ReadParams(istream &inpf, bool with_alloc) { + debug0("* read params of type MachMulti\n"); if (machs.size() > 0) Error("Trying to read multiple machine into non empty data structures\n"); @@ -150,6 +159,7 @@ void MachMulti::ReadParams(istream &inpf, bool with_alloc) void MachMulti::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data of MachMulti\n"); if (s!=machs.size()) ErrorN("data block of multiple machine has %zu machines (%zu were expected)", s, machs.size()); diff --git a/MachPar.cpp b/MachPar.cpp index 10e8259..d0c4871 100644 --- a/MachPar.cpp +++ b/MachPar.cpp @@ -29,6 +29,7 @@ using namespace std; void MachPar::do_alloc() { + debug2("do_alloc MachPar %d x %d\n",idim,odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); if (data_out) cublasFree(data_out); @@ -37,11 +38,15 @@ void MachPar::do_alloc() data_out = Gpu::Alloc(odim*bsize, "output data of parallel machine"); grad_in = Gpu::Alloc(idim*bsize, "input gradient of parallel machine"); + debug2(" - CUDA data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - CUDA grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #else if (data_out) delete [] data_out; if (grad_in) delete [] grad_in; data_out = (odim*bsize>0) ? new REAL[odim*bsize] : NULL; grad_in = (idim*bsize>0) ? 
new REAL[idim*bsize] : NULL; + debug2(" - data_out alloc %lu bytes at %p\n",sizeof(REAL)*odim*bsize,(void*) data_out); + debug2(" - grad_in alloc %lu bytes at %p\n",sizeof(REAL)*idim*bsize,(void*) grad_in); #endif } @@ -49,15 +54,18 @@ void MachPar::do_alloc() MachPar::MachPar() : MachMulti() { + debug0("** constructor MachPar\n"); } MachPar::MachPar(const MachPar &m) : MachMulti(m) { + debug0("** copy constructor MachPar\n"); } MachPar::~MachPar() { + debug0("** destructor MachPar\n"); // data_out and grad_in will be freed by Mach::~Mach() for (unsigned int m=0; mGetIdim(); @@ -94,6 +103,7 @@ void MachPar::MachAdd(Mach *new_mach) do_alloc(); } else { + debug0("** add new element to parallel machine\n"); if (bsize!=new_mach->GetBsize()) Error("bunch size of new parallel machine does not match"); machs.push_back(new_mach); @@ -133,6 +143,7 @@ Mach *MachPar::MachDel() void MachPar::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data of MachPar\n"); MachMulti::ReadData(inpf, s, bs); // calculate idim and odim and allocate data_out and grad_in @@ -162,6 +173,7 @@ void MachPar::ReadData(istream &inpf, size_t s, int bs) if (mt->GetMType()==file_header_mtype_tab) { if(Mach::fileid >= file_header_version3){ if (tadr[mt->GetShareId()] == NULL) { + debug3("Storing address (%p) of machine %d with share-id %d\n",mt->GetTabAdr(),m, mt->GetShareId()); tadr[mt->GetShareId()] = mt->GetTabAdr(); if(mt->GetTabAdr() == NULL) { std::stringstream oss ("In MachPar: machine "); @@ -169,9 +181,11 @@ void MachPar::ReadData(istream &inpf, size_t s, int bs) Error(oss.str().c_str()); } } else { + debug3("Setting address (%p) of machine %d with share-id %d\n",mt->GetTabAdr(),m, mt->GetShareId()); mt->SetTabAdr(tadr[mt->GetShareId()]); } *//*else { + debug3("Machine %d with share-id '%s' already has its own weights at address (%p)\n",m, mt->GetShareId(), mt->GetTabAdr()); if(mt->GetTabAdr() == NULL) { //std::ostringstream oss("In MachPar: machine "); std::stringstream oss ("In MachPar: machine "); @@ -181,8 +195,11 @@ void MachPar::ReadData(istream &inpf, size_t s, int bs) }*/ /*} else { // before file_header_version3, all MachTab in a MachPar share the weights if(tadr[-1] == NULL ){ + if(tadr[-1]) { debug2("Storing further address (%p) of machine %d\n",tadr[-1],m); } // cout << "set NEW tadr" << endl; } + else { debug2("Storing address (%p) of machine %d\n",mt->GetTabAdr(),m); } //cout << "set tadr" << endl; } tadr[-1]=mt->GetTabAdr(); } else { + debug2("setting address of machine %d to %p\n",m,tadr[-1]); //cout << "set address of machine " << m << " to " << tadr[-1] << endl; //mt->FreeTabAdr(); mt->SetTabAdr(tadr[-1]); @@ -207,6 +224,7 @@ void MachPar::Info(bool detailed, char *txt) printf("%sParallel machine %d-%d, bs=%d, passes=%lu/%lu", txt, idim, odim, bsize, nb_forw, nb_backw); tm.disp(", "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); char ntxt[512]; sprintf(ntxt,"%s ", txt); for (unsigned int i=0; iInfo(detailed, ntxt); @@ -222,6 +240,7 @@ void MachPar::Info(bool detailed, char *txt) // forward pass for all machines and copy output into cumulated output void MachPar::Forw(int eff_bsize, bool in_train) { + debug4("** MachPar::Forw: %p[%d] -> %p[%d]\n",(void*)data_in,idim,(void*)data_out,odim); if (machs.empty()) Error("called Forw() for an empty parallel machine"); @@ -259,6 +278,7 @@ void MachPar::Forw(int eff_bsize, bool in_train) // forward all machines for (unsigned int m=0; mForw(eff_bsize,in_train); } 
else { @@ -288,6 +308,7 @@ void MachPar::Forw(int eff_bsize, bool in_train) } nb_forw += eff_bsize; + debug0("MachPar::Forw: done\n"); tm.stop(); debugMachOutp("MachPar",data_out,idim,odim,eff_bsize); @@ -296,6 +317,7 @@ void MachPar::Forw(int eff_bsize, bool in_train) // backward pass for all machines and copy input gradient into cumulated gradient void MachPar::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug4("** MachPar::Backw: %p[%d] <- %p[%d]\n",(void*)grad_in,idim,(void*)grad_out,odim); if (machs.empty()) Error("called Backw() for an empty parallel machine"); if (eff_bsize<=0) eff_bsize=bsize; diff --git a/MachSeq.cpp b/MachSeq.cpp index 234fb09..a099fd8 100644 --- a/MachSeq.cpp +++ b/MachSeq.cpp @@ -29,15 +29,18 @@ using namespace std; MachSeq::MachSeq() : MachMulti() { + debug0("*** constructor MachSeq\n"); } MachSeq::MachSeq(const MachSeq &m) : MachMulti(m) { + debug0("*** copy constructor MachSeq\n"); } MachSeq::~MachSeq() { + debug1("*** destructor MachSeq %lx\n", (luint) this); data_out=grad_in=NULL; // prevent delete[] by ~Mach() } @@ -66,6 +69,7 @@ void MachSeq::SetGradOut(REAL *data) void MachSeq::MachAdd(Mach *new_mach) { if (machs.empty()) { + debug0("*** add first element to sequential machine\n"); machs.push_back(new_mach); // think about freeing memory idim=new_mach->GetIdim(); @@ -74,6 +78,7 @@ void MachSeq::MachAdd(Mach *new_mach) grad_in=new_mach->GetGradIn(); } else { + debug0("*** add new element to sequential machine\n"); Mach *last_mach=machs.back(); if (last_mach->GetOdim()!=new_mach->GetIdim()) { cout << "Current sequential machine:" << endl; Info(false); @@ -99,6 +104,7 @@ void MachSeq::MachAdd(Mach *new_mach) odim=new_mach->GetOdim(); data_out=new_mach->GetDataOut(); grad_out=new_mach->GetGradOut(); + debug4("*** data_in=%p, grad_in=%p, data_out=%p, grad_out=%p\n", data_in, grad_in, data_out, grad_out); } Mach *MachSeq::MachDel() @@ -141,8 +147,10 @@ void MachSeq::MachInsert(Mach *new_mach, size_t pos) if (pos<1 || pos>=machs.size()) ErrorN("MachSeq::MachInsert() position must be in [%d,%zu], %zu was requested\n",1,machs.size(),pos); + debug2("*** add new element at pos %lu to sequential machine with %lu\n",pos,machs.size()); Mach *prev_mach=machs[pos-1]; Mach *next_mach=machs[pos]; + debug2("prev=%p, next=%p\n",prev_mach,next_mach); if (prev_mach->GetOdim()!=new_mach->GetIdim()) { cout << "Current sequential machine:" << endl; Info(false); @@ -180,8 +188,10 @@ void MachSeq::MachInsert(Mach *new_mach, size_t pos) void MachSeq::ReadData(istream &inpf, size_t s, int bs) { + debug0("*** read data of MachSeq\n"); MachMulti::ReadData(inpf, s, bs); + debug0("*** rebuild data structures\n"); int nbm=machs.size(); idim = machs[0]->GetIdim(); @@ -218,6 +228,7 @@ void MachSeq::Info(bool detailed, char *txt) tm.disp(", "); tbackw.disp(" + back: "); printf("\n"); + debug5("*** %s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); char ntxt[512]; sprintf(ntxt,"%s ", txt); for (unsigned int i=0; iInfo(detailed, ntxt); @@ -227,6 +238,7 @@ void MachSeq::Info(bool detailed, char *txt) void MachSeq::Forw(int eff_bsize, bool in_train) { + debug2("* MachSeq::Forw: %p -> %p\n", (void*) data_in, (void*) data_out); if (machs.empty()) Error("called Forw() for an empty sequential machine"); @@ -242,6 +254,7 @@ void MachSeq::Forw(int eff_bsize, bool in_train) void MachSeq::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("* MachSeq::Backw: %p <- %p\n", (void*) grad_in, (void*) grad_out); 
if (machs.empty()) Error("called Backw() for an empty sequential machine"); diff --git a/MachSig.cpp b/MachSig.cpp index c7eab5f..33f3feb 100644 --- a/MachSig.cpp +++ b/MachSig.cpp @@ -30,11 +30,13 @@ using namespace std; MachSig::MachSig(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw, const int shareid, const bool xdata) : MachLin(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw, shareid, xdata) { + debug0("** constructor MachSig\n"); } MachSig::MachSig(const MachSig &m) : MachLin(m) { + debug0("** copy constructor MachSig\n"); } MachSig::~MachSig() @@ -63,6 +65,7 @@ void MachSig::Info(bool detailed, char *txt) #endif tm.disp(", "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -72,6 +75,7 @@ void MachSig::Info(bool detailed, char *txt) void MachSig::Forw(int eff_bsize, bool in_train) { + debug0("** MachSig Forw\n"); tm.start(); @@ -86,6 +90,7 @@ void MachSig::Forw(int eff_bsize, bool in_train) void MachSig::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug0("** MachSig Backw\n"); // derivate sigmoidal activation function // = grad_hidden .* ( 1 - a_hidden^2 ) diff --git a/MachSoftmax.cpp b/MachSoftmax.cpp index c94c44d..c5373a4 100644 --- a/MachSoftmax.cpp +++ b/MachSoftmax.cpp @@ -35,10 +35,12 @@ using namespace std; MachSoftmax::MachSoftmax(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw, const int shareid, const bool xdata) : MachLin(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw, shareid, xdata) { + debug0("** constructor MachSoftmax\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) int nbytes=0; Gpu::SetConfig(gpu_conf); nppsSumGetBufferSize_32f(odim, &nbytes); + debug2(" - CUDA MachSoftmax: allocating %d bytes for fast sum of %d-dimensional output layer\n",nbytes,odim); gpu_sum_buf = nppsMalloc_8u(nbytes); #endif #ifdef BLAS_CUDA @@ -51,9 +53,11 @@ MachSoftmax::MachSoftmax(const int p_idim, const int p_odim, const int p_bsize, MachSoftmax::MachSoftmax(const MachSoftmax &m) : MachLin(m) { + debug0("** copy constructor MachSoftmax\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) int nbytes=0; nppsSumGetBufferSize_32f(odim, &nbytes); + debug2(" - CUDA MachSoftmax: allocating %d bytes for fast sum of %d-dimensional output layer\n",nbytes,odim); gpu_sum_buf = nppsMalloc_8u(nbytes); #endif #ifdef BLAS_CUDA @@ -65,6 +69,7 @@ MachSoftmax::MachSoftmax(const MachSoftmax &m) MachSoftmax::~MachSoftmax() { + debug0("** destructor MachSoftmax\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) Gpu::SetConfig(gpu_conf); if (gpu_sum_buf) nppsFree(gpu_sum_buf); @@ -91,6 +96,7 @@ void MachSoftmax::Info(bool detailed, char *txt) tm.disp(", "); tmn.disp(" + norm: "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -100,6 +106,7 @@ void MachSoftmax::Info(bool detailed, char *txt) void MachSoftmax::Forw(int eff_bsize, bool in_train) { + debug3("*** MachSoftmax::Forw: mach=%p data: %p <- %p\n", this, data_in, data_out); if (eff_bsize<=0) eff_bsize=bsize; MachLin::Forw(eff_bsize,in_train); @@ -129,6 +136,7 @@ void MachSoftmax::Forw(int eff_bsize, bool in_train) void MachSoftmax::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug3("*** MachSoftmax::Backw: mach=%p grad: %p <- %p\n", this, grad_in, grad_out); // derivate softmax activation function // do_i / da_k = o_i (kronecker_ik - 
o_k) // we suppose that do_i/da_k vanishes in the error function !! diff --git a/MachSoftmaxClass.cpp b/MachSoftmaxClass.cpp index e930051..6e15be8 100644 --- a/MachSoftmaxClass.cpp +++ b/MachSoftmaxClass.cpp @@ -372,6 +372,7 @@ void MachSoftmaxClass::WriteData(ostream &outf) void MachSoftmaxClass::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data in MachSoftmaxClass"); MachLin::ReadData(inpf, s, bs); inpf.read((char*) &n_classes, sizeof(int)); diff --git a/MachSoftmaxStable.cpp b/MachSoftmaxStable.cpp index c675a2c..79a14ab 100644 --- a/MachSoftmaxStable.cpp +++ b/MachSoftmaxStable.cpp @@ -35,10 +35,12 @@ using namespace std; MachSoftmaxStable::MachSoftmaxStable(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw, const int shareid, const bool xdata) : MachLin(p_idim, p_odim, p_bsize, p_nbfw, p_nbbw, shareid, xdata) { + debug0("** constructor MachSoftmaxStable\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) int nbytes=0; Gpu::SetConfig(gpu_conf); nppsSumGetBufferSize_32f(odim, &nbytes); + debug2(" - CUDA MachSoftmaxStable: allocating %d bytes for fast sum of %d-dimensional output layer\n",nbytes,odim); gpu_sum_buf = nppsMalloc_8u(nbytes); #endif #ifdef BLAS_CUDA @@ -51,9 +53,11 @@ MachSoftmaxStable::MachSoftmaxStable(const int p_idim, const int p_odim, const i MachSoftmaxStable::MachSoftmaxStable(const MachSoftmaxStable &m) : MachLin(m) { + debug0("** copy constructor MachSoftmaxStable\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) int nbytes=0; nppsSumGetBufferSize_32f(odim, &nbytes); + debug2(" - CUDA MachSoftmaxStable: allocating %d bytes for fast sum of %d-dimensional output layer\n",nbytes,odim); gpu_sum_buf = nppsMalloc_8u(nbytes); #endif #ifdef BLAS_CUDA @@ -65,6 +69,7 @@ MachSoftmaxStable::MachSoftmaxStable(const MachSoftmaxStable &m) MachSoftmaxStable::~MachSoftmaxStable() { + debug0("** destructor MachSoftmaxStable\n"); #if defined(BLAS_CUDA) && defined(BLAS_CUDA_NPPS_SUM) Gpu::SetConfig(gpu_conf); if (gpu_sum_buf) nppsFree(gpu_sum_buf); @@ -90,6 +95,7 @@ void MachSoftmaxStable::Info(bool detailed, char *txt) tm.disp(", "); tmn.disp(" + norm: "); printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -99,6 +105,7 @@ void MachSoftmaxStable::Info(bool detailed, char *txt) void MachSoftmaxStable::Forw(int eff_bsize, bool in_train) { + debug2("*** MachSoftmaxStable::Forw: %p -> %p\n",(void*)data_in,(void*)data_out); if (eff_bsize<=0) eff_bsize=bsize; MachLin::Forw(eff_bsize,in_train); @@ -140,6 +147,7 @@ void MachSoftmaxStable::Forw(int eff_bsize, bool in_train) void MachSoftmaxStable::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug2("*** MachSoftmaxStable Backw %p <- %p\n",(void*)grad_in,(void*)grad_out); // derivate softmax activation function // do_i / da_k = o_i (kronecker_ik - o_k) // we suppose that do_i/da_k vanishes in the error function !! diff --git a/MachSplit.cpp b/MachSplit.cpp index d6bee47..3642762 100644 --- a/MachSplit.cpp +++ b/MachSplit.cpp @@ -83,17 +83,20 @@ void MachSplit::do_delete() MachSplit::MachSplit() : MachMulti() { + debug0("** constructor MachSplit\n"); data_out=grad_out=NULL; // important to prevent freeing! } MachSplit::MachSplit(const MachSplit &m) : MachMulti(m) { + debug0("** copy constructor MachSplit\n"); data_out=grad_out=NULL; // important to prevent freeing! 
} MachSplit::~MachSplit() { + debug0("** destructor MachSplit\n"); do_delete(); } @@ -107,6 +110,7 @@ MachSplit *MachSplit::Clone() void MachSplit::MachAdd(Mach *new_mach) { + debug0("*** MachSplit::MachAdd()"); // REMARK: there is no common output layer no output gradient !! // input gradient is cumulated @@ -118,6 +122,7 @@ void MachSplit::MachAdd(Mach *new_mach) idim=new_mach->GetIdim(); odim=new_mach->GetOdim(); bsize=new_mach->GetBsize(); + debug1("*** adding 1st machine: setting output dim to %d\n", odim); data_in=NULL; // will be set by MachSplit::SetDataIn() new_mach->SetDataIn(data_in); new_mach->SetGradOut(NULL); // must be done by Trainer() @@ -125,6 +130,7 @@ void MachSplit::MachAdd(Mach *new_mach) do_alloc(); } else { + debug1("*** add new machine of odim %d to split machine\n",new_mach->GetOdim()); if (bsize!=new_mach->GetBsize()) Error("bunch size of new split machine does not match"); if (idim!=new_mach->GetIdim()) @@ -134,6 +140,7 @@ void MachSplit::MachAdd(Mach *new_mach) // resize output, we just change odim, no allocation is done since outputs are individual // idim does not change ! odim += new_mach->GetOdim(); + debug2("*** adding %dth machines: resize output dim to %d\n", (int) machs.size(), odim); #ifdef BLAS_CUDA size_t dev = Gpu::GetDevice(new_mach->GetGpuConfig()); if (dev == Gpu::GetDevice(gpu_conf)) @@ -148,6 +155,7 @@ void MachSplit::MachAdd(Mach *new_mach) activ_forw.push_back(true); activ_backw.push_back(true); + debug4("*** data_in=%p, grad_in=%p, data_out=%p, grad_out=%p\n", data_in, grad_in, data_out, grad_out); } @@ -182,6 +190,7 @@ void MachSplit::SetDataIn(REAL *data) { data_in=data; // all machines point on the same input + debug1("*** MachSplit::SetDataIn() setting all machine to %p\n", data_in); #ifdef BLAS_CUDA if (Gpu::GetDeviceCount()==1) { // only one GPU device printf("#### CUDA set data_in for one GPU\n"); @@ -231,6 +240,7 @@ REAL* MachSplit::GetDataOut(int mid) void MachSplit::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data of MachSplit\n"); #ifdef BLAS_CUDA if (s!=machs.size()) ErrorN("data block of split machine has %zu machines (%zu were expected)", s, machs.size()); @@ -302,6 +312,7 @@ void MachSplit::Info(bool detailed, char *txt) // void MachSplit::Forw(int eff_bsize, bool in_train) { + debug3("** MachSplit::Forw: mach=%p data: %p <- %p\n", this, data_in, data_out); if (machs.empty()) Error("called Forw() for an empty split machine"); @@ -319,6 +330,7 @@ void MachSplit::Forw(int eff_bsize, bool in_train) if (Gpu::GetDeviceCount() > 1) for (size_t d=0; dForw(eff_bsize,in_train); // its the responsibility of the Trainer to collect the individual outputs } @@ -350,6 +363,7 @@ void MachSplit::Forw(int eff_bsize, bool in_train) // backward pass for all machines and cumulate gradient at input void MachSplit::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug3("** MachSplit::Backw: mach=%p grads: %p <- %p\n", this, grad_in, grad_out); if (machs.empty()) Error("called Backw() for an empty split machine"); @@ -371,6 +385,7 @@ void MachSplit::Backw(const float lrate, const float wdecay, int eff_bsize) // backward 1st machine if (activ_backw[0]) { + debug1("MachSplit: back first mach @%p\n",machs[0]); machs[0]->Backw(lrate,wdecay,eff_bsize); #ifdef BLAS_CUDA last_grad_in = machs[0]->GetGradIn(); @@ -381,6 +396,7 @@ void MachSplit::Backw(const float lrate, const float wdecay, int eff_bsize) } else { // clear the gradient so we can cumulate the following ones + debug1("MachSplit: zero grads of first mach 
@%p\n",machs[0]); #ifdef BLAS_CUDA Gpu::SetConfig(machs[0]->GetGpuConfig()); Gpu::MemsetAsync(grad_in, 0.0, idim*eff_bsize*sizeof(REAL)); @@ -395,12 +411,15 @@ void MachSplit::Backw(const float lrate, const float wdecay, int eff_bsize) #ifdef BLAS_CUDA for (unsigned int m=1; mBackw(lrate,wdecay,eff_bsize); Gpu::CheckError("MachSplit::Backw after following mach"); } else { + debug1(" MachSplit[%d]: GPU backw deactivated\n",m); } } + debug0(" MachSplit: GPU add up gradients\n"); Gpu::SetConfig(machs[0]->GetGpuConfig()); int size = idim*eff_bsize; for (unsigned int m=1; mBackw(lrate,wdecay,eff_bsize); REAL * grad_ptr = machs[m]->GetGradIn(); int size = idim*eff_bsize; @@ -437,6 +458,7 @@ void MachSplit::Backw(const float lrate, const float wdecay, int eff_bsize) AXPY(&size, &onef, grad_ptr, &one, grad_in, &one); } else { + debug1(" MachSplit[%d]: CPU backw deactivated\n",m); } } #endif diff --git a/MachSplit1.cpp b/MachSplit1.cpp index 7840fec..8a24277 100644 --- a/MachSplit1.cpp +++ b/MachSplit1.cpp @@ -33,15 +33,18 @@ using namespace std; MachSplit1::MachSplit1() : MachMulti(), grad_out_split(NULL) { + debug0("** constructor MachSplit1\n"); } MachSplit1::MachSplit1(const MachSplit1 &m) : MachMulti(m), grad_out_split(NULL) { + debug0("** copy constructor MachSplit1\n"); } MachSplit1::~MachSplit1() { + debug0("** destructor MachSplit1\n"); // data_out and grad_in will be freed by Mach::~Mach() #ifdef BLAS_CUDA Error("Check setting CUDA device"); @@ -61,12 +64,14 @@ MachSplit1 *MachSplit1::Clone() void MachSplit1::MachAdd(Mach *new_mach) { + debug0("*** MachSplit1::MachAdd()"); if (machs.empty()) { machs.push_back(new_mach); // think about freeing memory idim=new_mach->GetIdim(); odim=new_mach->GetOdim(); bsize=new_mach->GetBsize(); + debug1("*** adding 1st machine: setting output dim to %d\n", odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); data_in=NULL; // will be set by MachSplit1::SetDataIn() @@ -85,6 +90,7 @@ void MachSplit1::MachAdd(Mach *new_mach) new_mach->SetGradOut(NULL); // will be done in Backw() } else { + debug1("*** add new machine of odim %d to split machine\n",new_mach->GetOdim()); if (bsize!=new_mach->GetBsize()) Error("bunch size of new split machine does not match"); if (idim!=new_mach->GetIdim()) @@ -93,6 +99,7 @@ void MachSplit1::MachAdd(Mach *new_mach) // resize output (idim does not change !) 
odim += new_mach->GetOdim(); + debug2("*** adding %dth machines: resize output dim to %d\n", (int) machs.size(), odim); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); if (data_out) cublasFree(data_out); @@ -111,6 +118,7 @@ void MachSplit1::MachAdd(Mach *new_mach) activ_forw.push_back(true); activ_backw.push_back(true); + debug4("*** data_in=%p, grad_in=%p, data_out=%p, grad_out=%p\n", data_in, grad_in, data_out, grad_out); } Mach *MachSplit1::MachDel() @@ -161,6 +169,7 @@ void MachSplit1::SetDataIn(REAL *data) { data_in=data; // all machines point on the same input + debug1("*** MachSplit1::SetDataIn() setting all machine to %p\n", data_in); for (unsigned int m=0; mSetDataIn(data_in); } @@ -172,6 +181,7 @@ void MachSplit1::SetDataIn(REAL *data) void MachSplit1::ReadData(istream &inpf, size_t s, int bs) { + debug0("* read data of MachSplit1\n"); MachMulti::ReadData(inpf, s, bs); // get dimensions @@ -345,6 +355,7 @@ void MachSplit1::Backw(const float lrate, const float wdecay, int eff_bsize) #endif } else { + debug1(" MachSplit1[%d]: backw deactivated\n",m); } } diff --git a/MachTab.cpp b/MachTab.cpp index dbc0982..b812dfc 100644 --- a/MachTab.cpp +++ b/MachTab.cpp @@ -32,16 +32,20 @@ using namespace std; void MachTab::do_alloc() { + debug0("do_alloc MachTab\n"); if (!bExternal) { #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); t = Gpu::Alloc(idim*odim, "memory for table look-up machine"); + debug3(" CUDA alloc table at %p, size %dx%d\n", (void*)t, idim,odim); #else t = new REAL[idim*odim]; + debug3(" alloc table at %p, size %dx%d\n", (void*)t, idim,odim); if (!t) Error ("can't allocate memory for table look-up machine"); #endif } else { + debug3(" reuse table at %p, size %dx%d\n", (void*)t, idim,odim); } #ifdef BLAS_CUDA tmp_inp = new REAL[idim*bsize]; @@ -51,6 +55,7 @@ void MachTab::do_alloc() MachTab::MachTab(const int p_idim, const int p_odim, const int p_bsize, const ulong p_nbfw, const ulong p_nbbw, const int shareid, const bool xtable) : Mach(1, p_odim, p_bsize, p_nbfw, p_nbbw), Shareable(xtable, shareid), t(NULL), t_shared(NULL), t_mutex(NULL) { + debug1("** constructor MachTab %lx\n", (luint) this); if (p_idim<=0) Error("Table machine: illegal value of input dimension"); if (p_odim<=0) Error("Table machine: illegal value of output dimension"); idim = p_idim; // override 1 in call to Mach() @@ -74,6 +79,7 @@ MachTab::MachTab(const MachTab &m) : Mach(m, 1), Shareable(true, -1), t(NULL), t_shared(NULL), t_mutex(NULL) { + debug1("** copy constructor MachTab with address %lx\n", (luint) this); idim = m.idim; // override 1 in call to Mach() //bExternal = m.bExternal; //Loic: why? 
this should ALWAYS be true (as in initialization) iShareId = m.iShareId; @@ -105,6 +111,7 @@ MachTab::MachTab(const MachTab &m) MachTab::~MachTab() { + debug1("** destructor MachTab %lx\n", (luint) this); #ifdef BLAS_CUDA if (tmp_inp) delete tmp_inp; @@ -115,6 +122,7 @@ MachTab::~MachTab() pthread_mutex_lock(t_mutex); if (t_shared != NULL) { if ((*t_shared) > 0) { + debug1("*** cloned -> not freeing t %p\n", t); (*t_shared)--; pthread_mutex_unlock(t_mutex); return; @@ -194,6 +202,7 @@ void MachTab::Info(bool detailed, char *txt) tm.disp(", "); printf(", LookupTable=%p", t); //DEBUG printf("\n"); + debug5("%s data: %p -> %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -220,6 +229,7 @@ bool MachTab::CopyParams(Mach* mach) void MachTab::WriteParams(ostream &of) { + debug0("* write params of type MachTab\n"); Mach::WriteParams(of); of.write((char*) &bExternal, sizeof(int)); @@ -229,11 +239,13 @@ void MachTab::WriteParams(ostream &of) void MachTab::WriteData(ostream &outf) { int i=0, s=sizeof(REAL); if (bExternal) { + debug0("* table look-up machine with external address to file\n"); //fprintf(stderr, "* table look-up machine with external address to file\n"); outf.write((char*) &i, sizeof(int)); outf.write((char*) &s, sizeof(int)); } else { + debug0("* writing data of table look-up machine to file\n"); //fprintf(stderr, "* writing data of table look-up machine to file\n"); i=idim*odim; outf.write((char*) &i, sizeof(int)); @@ -254,6 +266,7 @@ void MachTab::WriteData(ostream &outf) { REAL *MachTab::WeightTable(int &idm, int &odm) { + debug0("* dump weights under textual form from a MachTab machine\n"); idm = idim; odm = odim; REAL *myTable = (REAL *) malloc (sizeof(REAL)*idim*odim); @@ -273,13 +286,16 @@ REAL *MachTab::WeightTable(int &idm, int &odm) { void MachTab::ReadParams(istream &inpf, bool with_alloc) { + debug0("* read params of type MachTab\n"); Mach::ReadParams(inpf, false); inpf.read((char*) &bExternal, sizeof(int)); + debug1(" - bExternal=%d\n", (int) bExternal); //This should be done for file_version 3 or greater ! 
if(Mach::fileid >= file_header_version3){ inpf.read((char*) &iShareId, sizeof(int)); + debug1(" - share-id=%d\n", (int) iShareId); } do_alloc(); @@ -288,6 +304,7 @@ void MachTab::ReadParams(istream &inpf, bool with_alloc) void MachTab::ReadData(istream &inpf, size_t s, int bs) { size_t se=odim*idim; + debug1("* read data of MachTab of size %u\n", (uint)s); if (bExternal) { if (s>0) { @@ -302,6 +319,7 @@ void MachTab::ReadData(istream &inpf, size_t s, int bs) #ifdef BLAS_CUDA REAL *local_mem=new REAL[odim*idim]; inpf.read((char*)local_mem,odim*idim*sizeof(REAL)); + debug2("CUDA: transfer %d elements for MachTab to GPU %d\n",odim*idim,Gpu::GetCudaDevice(Gpu::GetDevice(gpu_conf))); Gpu::SetConfig(gpu_conf); cublasSetVector(odim*idim,CUDA_SIZE,local_mem,1,t,1); Gpu::CheckError("transfer of table look-up machine to GPU memory"); @@ -336,6 +354,7 @@ void MachTab::Forw(int eff_bsize, bool in_train) tm.start(); if (eff_bsize<=0) eff_bsize=bsize; + debug3("MachTab::Forw: %p -> %p, bs=%d\n",(void*)data_in,(void*)data_out,eff_bsize); #ifdef BLAS_CUDA Gpu::SetConfig(gpu_conf); @@ -347,16 +366,20 @@ void MachTab::Forw(int eff_bsize, bool in_train) int idx= (int) data_in[b]; if (idx==NULL_WORD) { // simulate empty word: set everything to 0 + debug4("MachTab %p: b=%d, empty word to %p, size %d\n", this, b, (void*)optr, odim); for (int i=0; i %p, grad %p <- %p\n", txt, (void*)data_in, (void*)data_out, (void*)grad_in, (void*)grad_out); } } @@ -91,6 +95,7 @@ void MachTanh::Info(bool detailed, char *txt) void MachTanh::Forw(int eff_bsize, bool in_train) { + debug3("*** MachTanh::Forw: mach=%p data: %p <- %p\n", this, data_in, data_out); if (eff_bsize<=0) eff_bsize=bsize; MachLin::Forw(eff_bsize,in_train); @@ -113,6 +118,7 @@ void MachTanh::Forw(int eff_bsize, bool in_train) void MachTanh::Backw(const float lrate, const float wdecay, int eff_bsize) { + debug3("*** MachTanh::Backw: mach=%p grad: %p <- %p\n", this, grad_in, grad_out); // derivate tanh activation function // multiply grad_hidden by derivatives of hidden layer activities (tanh) // grad_out = grad_out .* f'(data_out) @@ -130,7 +136,9 @@ void MachTanh::Backw(const float lrate, const float wdecay, int eff_bsize) # ifdef DEBUG { REAL buf[d]; cublasGetVector(d,sizeof(REAL),data_out,1,buf,1); + debug4(" output : %e %e .. %e %e\n", buf[0],buf[1],buf[d-2],buf[d-1]); cublasGetVector(d,sizeof(REAL),grad_out,1,buf,1); + debug4(" grads_out: %e %e .. %e %e\n", buf[0],buf[1],buf[d-2],buf[d-1]); } # endif // work inplace in grad_out @@ -138,13 +146,17 @@ void MachTanh::Backw(const float lrate, const float wdecay, int eff_bsize) # ifdef DEBUG { REAL buf[d]; cublasGetVector(d,sizeof(REAL),grad_out,1,buf,1); + debug4(" grad deriv %e %e .. %e %e\n", buf[0],buf[1],buf[d-2],buf[d-1]); } # endif #else VSQR(&d,data_out); + debug4(" output^2 : %e %e .. %e %e\n", data_out[0],data_out[1],data_out[d-2],data_out[d-1]); + debug4(" grads_out: %e %e .. %e %e\n", grad_out[0],grad_out[1],grad_out[d-2],grad_out[d-1]); REAL *aptr = data_out; REAL *gptr = grad_out; for (int i=0; iGetSentenceIds(wptr, hyp.GetCstr(), mode & RESCORE_MODE_BOS, mode & RESCORE_MODE_EOS); @@ -114,6 +115,7 @@ void NbestCSLM::RescoreHyp(Hypo &hyp, const int lm_pos) // allocate memory to store the delayed LM probabilities delayed_hyps.push_back(new HypSentProba(hyp, lm_pos, nw)); // (nw-1) would be actually enough + debug2(" - allocate mem for %d words: addr=%p\n", nw, delayed_hyps.back()->GetAddrP()); // request n-grams that are shorter then CSLM order, starting with 2-, 3-, ... 
n-gram int n=2; @@ -126,11 +128,13 @@ void NbestCSLM::RescoreHyp(Hypo &hyp, const int lm_pos) j++; } while (nBlockEval(wptr, n, delayed_hyps.back()->GetAddrP()+n-2, aux_data); n++; } // request all remaining full n-grams while (n<=nw) { // we have n-1 full n-grams in a sentence with n-words + debug2(" - call BlockEval() for %dst %d-gram\n", n-1, lm_order); trainer->BlockEval(wptr, lm_order, delayed_hyps.back()->GetAddrP()+n-2, aux_data); // last address will be base+n-1 n++, wptr++; } @@ -141,6 +145,7 @@ void NbestCSLM::RescoreHyp(Hypo &hyp, const int lm_pos) void NbestCSLM::FinishPending() { + debug1("NbestCSLM::FinishPending(): process %u delayed requests for complete hyps\n", (uint) delayed_hyps.size()); trainer->BlockFinish(); for (vector::iterator i = delayed_hyps.begin(); i != delayed_hyps.end(); i++) { diff --git a/NbestCSLM.h b/NbestCSLM.h index 427dfef..4b8840d 100644 --- a/NbestCSLM.h +++ b/NbestCSLM.h @@ -38,6 +38,7 @@ class HypSentProba { REAL *p; // array to store the n-gram log probabilities public: HypSentProba(Hypo &p_hyp, int p_pos, int p_nw) : hyp(p_hyp), lm_pos(p_pos), nw(p_nw), p(new REAL[nw]) { + debug1("HypSentProba(): alloc addr %p\n", p); }; ~HypSentProba() { if(p) delete [] p; } REAL *GetAddrP() {return p;} @@ -45,8 +46,10 @@ class HypSentProba { { REAL logP=0; for (int i=0; i store sentence logP=%e (log10=%e) at pos %d\n", logP,logP/M_LN10,lm_pos); hyp.SetFeature(logP,lm_pos); } }; diff --git a/NbestLMKEN.cpp b/NbestLMKEN.cpp index 426b786..3e20932 100644 --- a/NbestLMKEN.cpp +++ b/NbestLMKEN.cpp @@ -49,6 +49,7 @@ bool NbestLMKEN::Read (const string &fname, int const) // void NbestLMKEN::RescoreHyp (Hypo &hyp, const int lm_pos, REAL*) { + debug2("NbestLMKEN::RescoreHyp(): lm_pos=%d, mode=%d\n", lm_pos, mode); float logP = 0; if (NULL != ken_ngram) { State state((mode & RESCORE_MODE_BOS) ? ken_ngram->BeginSentenceState() : ken_ngram->NullContextState()), out_state; @@ -64,6 +65,7 @@ void NbestLMKEN::RescoreHyp (Hypo &hyp, const int lm_pos, REAL*) if (mode & RESCORE_MODE_EOS) logP += ken_ngram->Score(state, ken_vocab->EndSentence(), out_state); } + debug1("log10P=%e / 5d\n", logP); hyp.SetFeature(logP, lm_pos); } diff --git a/NbestLMSRI.cpp b/NbestLMSRI.cpp index d51ee49..1a86837 100644 --- a/NbestLMSRI.cpp +++ b/NbestLMSRI.cpp @@ -69,6 +69,7 @@ bool NbestLMSRI::Read (const string &fname, int const order) { // void NbestLMSRI::RescoreHyp (Hypo &hyp, const int lm_pos, REAL*) { + debug2("NbestLMSRI::RescoreHyp(): lm_pos=%d, mode=%d\n", lm_pos, mode); static TextStats tstats; static const int max_words=16384; static const int max_chars=max_words*16; @@ -82,8 +83,10 @@ void NbestLMSRI::RescoreHyp (Hypo &hyp, const int lm_pos, REAL*) strcpy(str,hyp.GetCstr()); // we need to copy since parseWords() modifies the string int nw = sri_vocab->parseWords(str, vstr, max_words + 1); if (nw == max_words+1) Error("too many words in one hypothesis\n"); + debug1(" parsing found %d words\n", nw); float logP = sri_ngram->sentenceProb(vstr, tstats); + debug1("log10P=%e / 5d\n", logP); hyp.SetFeature(logP,lm_pos); return; } diff --git a/README b/README index 3e0f5e6..b5a3fb7 100644 --- a/README +++ b/README @@ -6,7 +6,7 @@ those references. This software includes several tools to process lattices in HTK format. Part of it is based on code from the Spynx project. Please see the source code -for the corresponding licence. +for the corresponding license. 
Build instructions: ------------------- @@ -28,7 +28,7 @@ You can also choose the LM toolkit to link with make BOLM_TOOL=SRILM SRILM toolkit, you need to download and install the toolkit In addition, you can include support for continuous space TRANSLATION models. This is -optionnal since a complete Moses installation is needed. +optional since a complete Moses installation is needed. make CSTM=1 also creates cstm_train and cstm_eval include support for rescoring translation models in the nbest tool (change the variable MOSES_INC for your configuration) @@ -47,7 +47,7 @@ List of all the tools: mach_dump tool to extract individual layers from a large network those can be used to initialize layers of other networks with "init-from-file=layer.mach" - dumpEmbeddings extract the embeddigns from a network + dumpEmbeddings extract the embeddings from a network nn_train generic neural network training nn_info display information on a neural network @@ -67,6 +67,12 @@ uses the FORTRAN BLAS functions, i.e. using column-major matrices. You should not link with C versions of the BLAS library (which probably use row-major matrix storage) ! +The code contains many lines with instructions debugX(...). These are left-overs +from the development phase which were kept just in case we have to debug some +parts again. In a normal compile, they produce no code and have no impact on +the speed. To activate them, type "make DB=-DDEBUG ...". This will produce MANY +messages on the screen. You may consider compiling like this only the parts +of the code you want to debug. Prerequisites: - working C++ compiler (tested with g++ 4.8.3) @@ -86,12 +92,12 @@ Jun 28 2015 V4.0 - bug fixes: - deterministic sorting of the wordlist for short lists - corrected race condition in MachTab on GPU leading to concurrent updates - now the results are indentical to CPU version, but slight slower + now the results are identical to CPU version, but slightly slower - neural network architectures and training: - added classes at the output layer - introduced learning rate schemes - layer-specific learning rates - - support for auxillary data at the network input + - support for auxiliary data at the network input - flexible sharing of parameters - simplified network configuration - refactorisation of GPU code @@ -107,11 +113,11 @@ Jun 28 2015 V4.0 - new tools - dump embeddings - added more documentation - - improved tutoriel + - improved tutorial Mar 25 2014 V3.0 - - LGPL licene + - LGPL license - support of SRILM and KENLM (default) for convenience, KENLM is included in the tar and will be automatically compiled - new command line interface with configuration files for more flexible specification of network architectures @@ -134,13 +140,13 @@ Jan 26 2010 V1.0 - Training and n-best list rescoring is working - Short-lists and interpolation of multiple networks is not yet implemented -It is recommended that you join the Google group "continuos-space-language-model-toolkit" +It is recommended that you join the Google group "continuous-space-language-model-toolkit" to be informed of bug corrections, updates and follow other discussions on the tool. Contributors: ------------- N. Coetmeur configuration files, KENLM support, many improvements and fixes -W. Aransa auxillary data, various fixes +W. Aransa auxiliary data, various fixes L. Barrault network configuration, in particular parameter sharing F. Bastien improved GPU code F.
Bougares sentences scores, data handling diff --git a/Tools.h b/Tools.h index 60bd0c5..b1c7411 100644 --- a/Tools.h +++ b/Tools.h @@ -50,8 +50,26 @@ static const string cslm_version="V3.1"; // #ifdef DEBUG # define TRACE(txt) cout << txt; +# define debug0(F) printf(F) +# define debug1(F,a) printf(F,a) +# define debug2(F,a,b) printf(F,a,b) +# define debug3(F,a,b,c) printf(F,a,b,c) +# define debug4(F,a,b,c,d) printf(F,a,b,c,d) +# define debug5(F,a,b,c,d,e) printf(F,a,b,c,d,e) +# define debug6(F,a,b,c,d,e,f) printf(F,a,b,c,d,e,f) +# define debug7(F,a,b,c,d,e,f,h) printf(F,a,b,c,d,e,f,h) +# define debug8(F,a,b,c,d,e,f,h,i) printf(F,a,b,c,d,e,f,h,i) #else # define TRACE(txt) +# define debug0(F) +# define debug1(F,a) +# define debug2(F,a,b) +# define debug3(F,a,b,c) +# define debug4(F,a,b,c,d) +# define debug5(F,a,b,c,d,e) +# define debug6(F,a,b,c,d,e,f) +# define debug7(F,a,b,c,d,e,f,h) +# define debug8(F,a,b,c,d,e,f,h,i) #endif #ifdef DEBUGEX diff --git a/Toolsgz.cpp b/Toolsgz.cpp index ef1bb35..18d4499 100644 --- a/Toolsgz.cpp +++ b/Toolsgz.cpp @@ -52,6 +52,7 @@ int Weights::ScanLine() { if (wf.eof()) Error("Weights::ScanLine() called without open file"); + debug0("scan: waiting...\n"); string line; getline(wf,line); if (!wf.good()) return 0; diff --git a/Trainer.cpp b/Trainer.cpp index 02878a3..84b3b68 100644 --- a/Trainer.cpp +++ b/Trainer.cpp @@ -37,6 +37,7 @@ Trainer::Trainer (Mach *pmach, Lrate *lrate, ErrFct *perrfct, wdecay(p_wd), nb_ex(0), nb_epoch(p_ep), max_epoch(p_maxep), auxdim(0), err_train(0), err_dev(0) { + debug0("*** Constructor Trainer ***\n"); idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize(); if (train_fname) { @@ -119,8 +120,10 @@ Trainer::Trainer (Mach *pmach, Lrate *lrate, ErrFct *perrfct, Trainer::~Trainer() { + debug0("*** Destructor Trainer ***\n"); if (data_train) delete data_train; if (data_dev && data_dev_alloc) { + debug0("freeing data_dev\n"); delete data_dev; } #ifdef BLAS_CUDA @@ -158,12 +161,16 @@ REAL Trainer::Train() Gpu::SetConfig(mach->GetGpuConfig()); mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input errfct->SetTarget(gpu_target); // we copy from buf_target to gpu_target + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); #else mach->SetDataIn(buf_input); errfct->SetTarget(buf_target); #endif errfct->SetOutput(mach->GetDataOut()); mach->SetGradOut(errfct->GetGrad()); + debug1(" - grad %p\n", errfct->GetGrad()); + debug1(" - output %p\n", mach->GetDataOut()); // TODO: we could copy all the examples on the GPU and then split into bunches locally bool data_available; diff --git a/TrainerNgram.cpp b/TrainerNgram.cpp index c224ce7..d67f378 100644 --- a/TrainerNgram.cpp +++ b/TrainerNgram.cpp @@ -34,6 +34,7 @@ TrainerNgram::TrainerNgram (Mach *pmach, Lrate *lrate, ErrFct *perrfct, : Trainer(pmach,lrate,perrfct,NULL,NULL,p_wd,p_maxep,p_ep,0), order(0) { + debug0("*** Constructor TrainerNgram for training ***\n"); char msg[1024]; idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize(); @@ -105,6 +106,7 @@ TrainerNgram::TrainerNgram (Mach *pmach, ErrFct *perrfct, Data *data, int aux_di : Trainer(pmach,NULL,perrfct,NULL,NULL,0,0,0), order(0) { + debug0("*** Constructor TrainerNgram for testing ***\n"); char msg[1024]; idim=mach->GetIdim(); odim=mach->GetOdim(); bsize=mach->GetBsize(); @@ -275,6 +277,7 @@ REAL TrainerNgram::TestDev(char *fname) } nb_ex_dev += n; + debug2("%d: %f\n",nb_ex_dev,exp(-log_sum/nb_ex_dev)); } while (data_available); if (fname) fs.close(); diff --git 
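A minimal, self-contained sketch of how the debugN macros introduced in Tools.h above behave. This example is not part of the patch: the file name demo.cpp and the messages are invented; only the macro definitions and the DB=-DDEBUG build flag come from the patch. Each debugN(F,...) expands to a plain printf when compiled with -DDEBUG and to nothing otherwise, which is why a normal build is unaffected.

// demo.cpp -- illustration only, reproducing two of the Tools.h definitions
#include <cstdio>

#ifdef DEBUG
# define debug0(F)     printf(F)
# define debug2(F,a,b) printf(F,a,b)
#else
# define debug0(F)
# define debug2(F,a,b)
#endif

int main()
{
  int idim=128, odim=256;                          // invented example dimensions
  debug0("* entering do_alloc\n");                 // printed only when compiled with -DDEBUG
  debug2(" - alloc table of size %d x %d\n", idim, odim);
  printf("done (%d x %d)\n", idim, odim);          // always printed
  return 0;
}

Compiled as "g++ demo.cpp" the two debug calls disappear entirely; "g++ -DDEBUG demo.cpp" (the effect of "make DB=-DDEBUG" for the toolkit) prints both messages before the final line.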
a/TrainerNgramClass.cpp b/TrainerNgramClass.cpp index 674ca14..5232a08 100644 --- a/TrainerNgramClass.cpp +++ b/TrainerNgramClass.cpp @@ -144,6 +144,7 @@ TrainerNgramClass::TrainerNgramClass(Mach* pmach, ErrFct* perrfct, Data* data) TrainerNgramClass::~TrainerNgramClass() { + debug0("*** Destructor TrainerNgramClass ***\n"); #ifdef BLAS_CUDA if (buf_class_target) cudaFreeHost(buf_class_target); @@ -400,6 +401,7 @@ REAL TrainerNgramClass::TestDev(char *fname) } nb_ex_dev += n; + debug2("%d: %f\n",nb_ex_dev,exp(-log_sum/nb_ex_dev)); } while (data_available); if (fname) fs.close(); diff --git a/TrainerNgramSlist.cpp b/TrainerNgramSlist.cpp index b88f939..c7e1c94 100644 --- a/TrainerNgramSlist.cpp +++ b/TrainerNgramSlist.cpp @@ -118,6 +118,7 @@ void TrainerNgramSlist::DoConstructorWork() { BlockSetMax(); // allocate req + debug0(" + done init TrainerNgramSlist\n"); } // @@ -132,6 +133,7 @@ TrainerNgramSlist::TrainerNgramSlist (Mach *pmach, Lrate *lrate, ErrFct *perrfct lm_fname(strdup(p_lm_fname)), lm_buf_target(new WordID[odim*bsize]), slist_len(mach->GetOdim()-1), blm(NULL), wlist(NULL), max_req(0), nreq(0), req(NULL), nb_ngram(0), nb_forw(0) { + debug0("*** Constructor TrainerNgramSlist for training ***\n"); cout << "Setting up training with short list" << endl; DoConstructorWork(); } @@ -147,6 +149,7 @@ TrainerNgramSlist::TrainerNgramSlist (Mach *pmach, ErrFct *perrfct, lm_fname(strdup(p_lm_fname)), lm_buf_target(new WordID[odim*bsize]), slist_len(mach->GetOdim()-1), blm(NULL), wlist(NULL), max_req(0), nreq(0), req(NULL), nb_ngram(0), nb_forw(0) { + debug0("*** Constructor TrainerNgramSlist for testing ***\n"); cout << "Setting up testing with short list" << endl; DoConstructorWork(); } @@ -157,12 +160,14 @@ TrainerNgramSlist::TrainerNgramSlist (Mach *pmach, WordList *wlist, char *p_lm_f lm_fname(strdup(p_lm_fname)), lm_buf_target(new WordID[odim*bsize]), slist_len(mach->GetOdim()-1), blm(NULL), wlist(wlist), max_req(0), nreq(0), req(NULL), nb_ngram(0), nb_forw(0) { + debug0("*** Constructor TrainerNgramSlist for block operations ***\n"); cout << "Setting up CSLM with short list" << endl; DoConstructorWork(); } void TrainerNgramSlist::FreeReq() { + debug3("TrainerNgramSlist::FreeReq(): %p: %d out of %d\n", req, nreq, max_req); if (req) { for (int i=0; iSetTarget(gpu_target); // we copy from buf_target to gpu_target errfct->SetOutput(mach->GetDataOut()); mach->SetGradOut(errfct->GetGrad()); + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); + debug1(" - grad %p\n", grad); + debug1(" - output %p\n", output); data_train->Rewind(); // reserve memory on the GPU for all examples @@ -259,6 +269,7 @@ REAL TrainerNgramSlist::Train() if (at_least_one_short) nb_ex_short++; n++; } + debug2("copy bunch of %d words to GPU, totl slist=%d\n", n, nb_ex_slist); if (nb_ex+n > mem_ex) { ErrorN("trying to load %d examples, but memory was reserved for %d examples only\n", nb_ex, mem_ex); @@ -348,6 +359,8 @@ REAL TrainerNgramSlist::Train() Gpu::SetConfig(mach->GetGpuConfig()); mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input errfct->SetTarget(gpu_target); // we copy from buf_target to gpu_target + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); #else mach->SetDataIn(buf_input); errfct->SetTarget(buf_target); @@ -391,6 +404,7 @@ REAL TrainerNgramSlist::Train() #endif n++; } + debug2("train bunch of %d words, totl slist=%d\n", n, nb_ex_slist); if (n>0) { #ifdef BLAS_CUDA @@ -400,6 +414,7 @@ REAL TrainerNgramSlist::Train() 
mach->Forw(n,true); tgrad.start(); log_sum += errfct->CalcGrad(n); + debug1("TrainerNgramSlist::Train - log_sum: %f\n", log_sum); tgrad.stop(); lrate->UpdateLrateOnForw(mach->GetNbForw()); bprop.start(); @@ -451,6 +466,8 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) Gpu::SetConfig(mach->GetGpuConfig()); mach->SetDataIn(gpu_input); // we copy from buf_input to gpu_input errfct->SetTarget(gpu_target); // we copy from buf_target to gpu_target + debug1(" - gpu_input %p\n", gpu_input); + debug1(" - gpu_target %p\n", gpu_target); #else mach->SetDataIn(buf_input); errfct->SetTarget(buf_target); @@ -463,6 +480,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) // get a bunch of data int n=0, i; data_available = true; + debug0("start bunch\n"); while (n < mach->GetBsize() && data_available) { data_available = data_dev->Next(); if (!data_available) break; @@ -489,6 +507,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) n++; } + debug1("found bunch of %d\n", n); // process the bunch by the neural network if (n>0) { @@ -507,6 +526,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) #else REAL *optr=mach->GetDataOut(); #endif + debug1("Collect n=%d\n", n); REAL *ptr_input = buf_input; for (int ni=0; niBoffLnPid(ptr_input+max(tgpos+1-lm_order, 0), lm_buf_target[ni], tgpos-p+1); logP = blm->BoffLnPid(ptr_input+max(tgpos+1-lm_order, 0), lm_buf_target[ni], tgpos-p); nb_ex_short++; + debug2(" short %d-gram LM: logP=%e\n", idim-p, logP); } else #endif @@ -536,6 +557,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) int p; for (p = 0 ; (p < tgpos) && (NULL_WORD == ptr_input[p]) ; p++); // search for longest n-gram without NULL_WORD in the first place logP = blm->BoffLnPid(ptr_input+max(tgpos+1-lm_order, p), lm_buf_target[ni],min(lm_order, tgpos + 1 - p)); + debug2(" %d-gram LM: logP=%e\n", lm_order, logP); //printf("NN slist output=%e\n", optr[buf_target_wid[ni]]); } else { @@ -552,6 +574,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) } //REAL logP2 = blm->BoffLnPid(ptr_input+max(tgpos+1-lm_order, 0), lm_buf_target[ni],min(lm_order, tgpos + 1)); //printf(" CSLM: logP=%e, ngra,=%e \n", logP, logP2); + debug1(" CSLM: logP=%e\n", logP); nb_ex_slist++; } } @@ -565,6 +588,7 @@ REAL TrainerNgramSlist::DoTestDev(char *fname, bool renorm) } nb_ex += n; + debug2("%d: %f\n",nb_ex,exp(-log_sum/nb_ex)); } while (data_available); printf(" - %d %d-gram requests, %d=%.2f%% short n-grams, %d=%.2f%% by back-off LM, %d=%5.2f%% predicted by CSLM\n", @@ -602,6 +626,7 @@ void TrainerNgramSlist::BlockEval(WordID *wid, int o, REAL*p, REAL *aux_data) int cl=o-1, i; if (cl != iaux) { #ifdef CSLM_DOES_SHORT_NGRAMS + debug7("TrainerNgramSlist::BlockEval(): add %d st short %d-gram %d %d %d .. -> %d, addr=%p\n", nreq, o, wid[0],wid[1],wid[2],wid[cl], p); req[nreq].ctxt_len = iaux; // use full filled-up n-gram req[nreq].ctxt = new WordID[iaux]; // fill up incomplete n-gram with NULL-WORD (at the beginning !) @@ -630,12 +655,15 @@ void TrainerNgramSlist::BlockEval(WordID *wid, int o, REAL*p, REAL *aux_data) if (++nreq >= max_req) BlockFinish(); #else //ErrorN("BlockEval() dim %d differs from CSLM %d\n", cl, iaux); + debug6("TrainerNgramSlist::BlockEval(): process immediately short %d-gram %d %d %d ... 
-> %d, addr=%p\n", o, wid[0],wid[1],wid[2],wid[cl], p); nb_ex_short++; *p = blm->BoffLnStd(wid, wid[cl], o); + debug2(" stored logP=%e, log10=%e\n", *p, *p/M_LN10); #endif return; } + debug7("TrainerNgramSlist::BlockEval(): add %d st %d-gram %d %d %d ... -> %d, addr=%p\n", nreq, o, wid[0],wid[1],wid[2],wid[cl], p); req[nreq].ctxt_len = cl; req[nreq].ctxt = new WordID[cl]; for(i=0;i= tgpos) { req[nreq].ctxt[i]=wid[i+1];} else { req[nreq].ctxt[i]=wid[i]; } @@ -658,6 +686,7 @@ void TrainerNgramSlist::BlockEval(WordID *wid, int o, REAL*p, REAL *aux_data) int NgramReqComp(const void *v1, const void *v2) { NgramReq* n1=(NgramReq*) v1, *n2=(NgramReq*) v2; + //debug6("compare %d %d %d ? %d %d %d\n", n1->ctxt[0],n1->ctxt[1],n1->ctxt[2], n2->ctxt[0],n2->ctxt[1],n2->ctxt[2]); for (int i=0; ictxt_len; i++) { if (n1->ctxt[i] < n2->ctxt[i]) return -1; if (n1->ctxt[i] > n2->ctxt[i]) return 1; @@ -674,6 +703,7 @@ int NgramReqComp(const void *v1, const void *v2) void TrainerNgramSlist::BlockFinish() { + debug1("TrainerNgramSlist::BlockFinish(): processing block of %d n-gram requests\n", nreq); if (nreq == 0) return; nb_ngram+=nreq; @@ -684,8 +714,10 @@ void TrainerNgramSlist::BlockFinish() printf(" -> %d\n", req[i].wpred); } #endif + debug0("START SORT \n"); //sort(req.begin(),req.end()); // use operator < of Ngramreq qsort(req, nreq, sizeof(NgramReq), NgramReqComp); + debug0("\nAFTER SORT\n"); #ifdef DEBUG for (int i=0; i= bsize) { ForwAndCollect(req_beg,n-1,bs,false); bs=0; req_beg=n; @@ -741,6 +776,7 @@ void TrainerNgramSlist::BlockFinish() void TrainerNgramSlist::ForwAndCollect(int req_beg, int req_end, int bs, bool renorm) { if (bs<=0) return; + debug3("TrainerNgramSlist::ForwAndCollect(): collecting outputs %d .. %d from bunch of size %d\n", req_beg, req_end, bs); nb_forw++; #ifdef CUDA Gpu::SetConfig(mach->GetGpuConfig()); @@ -768,6 +804,7 @@ void TrainerNgramSlist::ForwAndCollect(int req_beg, int req_end, int bs, bool re if (tgt==NULL_WORD) Error("TrainerNgramSlist::ForwAndCollect(): internal error: NULL_WORD in target\n"); WordID mapped_tgt = wlist->MapIndex(tgt); int b=req[n].bs; + debug7("request %5d: CSLM block %d, %d %d %d -> %d, mapped=%d\n", n, b, req[n].ctxt[0], req[n].ctxt[1],req[n].ctxt[2],tgt, mapped_tgt); #ifdef BLAS_CUDA REAL *optr=host_output + b*odim; #else @@ -794,6 +831,7 @@ void TrainerNgramSlist::ForwAndCollect(int req_beg, int req_end, int bs, bool re // the order of the back-off LM may be smaller than the one of the CSLM // -> this is resolved internally by the back-off class (the last words are used) logP = blm->BoffLnPid(ptr_input+max(tgpos+1-lm_order, 0), mapped_tgt, min(lm_order, tgpos + 1)); // TODO target mapped forth an back + debug3(" - not slist: %d-gram LM: logP=%e, log10=%e\n", lm_order, logP, logP/M_LN10); } else { // get proba from CSLM @@ -806,6 +844,7 @@ void TrainerNgramSlist::ForwAndCollect(int req_beg, int req_end, int bs, bool re else { logP = safelog(optr[mapped_tgt]); // no error check on indices necessary here } + debug2(" - in slist CSLM: logP=%e, log10=%e\n", logP, logP/M_LN10); nb_ex_slist++; } } @@ -819,12 +858,14 @@ void TrainerNgramSlist::ForwAndCollect(int req_beg, int req_end, int bs, bool re //************************************************************************************** // void TrainerNgramSlist::BlockSetMax(int p_max) { + debug3("TrainerNgramSlist::BlockSetMax(%d): prev=%p[%d]\n", p_max, req, max_req); if (req) { FreeReq(); delete [] req; } max_req=p_max; req = new NgramReq[max_req]; + debug2("allocated req at %p for %d elements\n", req, 
max_req); nreq=0; } diff --git a/docs/RELEASE_NOTES b/docs/RELEASE_NOTES index 3e49b13..1ee4393 100644 --- a/docs/RELEASE_NOTES +++ b/docs/RELEASE_NOTES @@ -1,8 +1,32 @@ - ***************************************************************************************** - RELEASE V4.0, June 28 2015 + - bug fixes: + - deterministic sorting of the wordlist for short lists + - corrected race condition in MachTab on GPU leading to concurrent updates + now the results are identical to CPU version, but slightly slower + - neural network architectures and training: + - added classes at the output layer + - introduced learning rate schemes + - layer-specific learning rates + - support for auxiliary data at the network input + - flexible sharing of parameters + - simplified network configuration + - refactorisation of GPU code + - use of CUDA streams + - better GPU kernels for activation functions + - more options to select CUDA devices, automatic selection with "-N" + - several missing functions are now available on GPU + - data handling: + - sentence scores + - cross-sentence n-grams + - arbitrary target position (still experimental) + - fast loading of phrases + - new tools + - dump embeddings + - added more documentation + - improved tutorial + ***************************************************************************************** RELEASE V3.0, March 25 2014 - change to LGPL license diff --git a/extract2bin.cpp b/extract2bin.cpp index 6f7aedf..f075c8a 100644 --- a/extract2bin.cpp +++ b/extract2bin.cpp @@ -133,6 +133,7 @@ int Phrase::AddPhrase(FILE *binf, char *line) } uchar x= (uchar) nw_in_phr; fwrite(&x, sizeof(uchar), 1, binf); // 1 byte is enough + debug2("%s dump %d words:", msg, x); // loop on all words in line bptr=sptr; while ((*bptr != 0) && (*bptr != '\n')) { @@ -155,6 +156,7 @@ int Phrase::AddPhrase(FILE *binf, char *line) voc->GetWordInfo(idx_unk).n++; fwrite(&idx_unk, sizeof(WordList::WordIndex), 1, binf); } + debug2(" UNK: %s[%d]", bptr,idx); //idx=unk_w->AddWord(bptr); TODO #ifdef COUNT_OOV if (idx<0) ErrorN("illegal OOV idx (%d) for word %s\n",idx, bptr); @@ -166,6 +168,7 @@ int Phrase::AddPhrase(FILE *binf, char *line) if (idx<1 || idx>nvoc) ErrorN("illegal word index (%d) for %s word %s\n", idx, msg, bptr); voc->GetWordInfo(idx).n++; fwrite(&idx, sizeof(WordList::WordIndex), 1, binf); + debug2(" %s[%d]", bptr,idx); } bptr = eptr + 1; @@ -173,6 +176,7 @@ } // TODO for (i=0; i
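As a side note on the TrainerNgramSlist::BlockFinish() path instrumented earlier in this patch (the "START SORT" / "AFTER SORT" messages): the pending n-gram requests are ordered with qsort() and the NgramReqComp comparator so that requests with identical contexts become adjacent before they are forwarded through the network in bunches. The stand-alone sketch below illustrates that comparator pattern with an invented, simplified Req struct using a fixed 3-word context; it is not the toolkit's actual NgramReq type, which carries a variable ctxt_len.

#include <cstdio>
#include <cstdlib>

// Simplified stand-in for the toolkit's NgramReq (hypothetical: fixed 3-word context).
struct Req { int ctxt[3]; int wpred; };

// Element-wise context comparison, same idea as NgramReqComp in TrainerNgramSlist.cpp.
static int ReqComp(const void *v1, const void *v2)
{
  const Req *n1=(const Req*) v1, *n2=(const Req*) v2;
  for (int i=0; i<3; i++) {
    if (n1->ctxt[i] < n2->ctxt[i]) return -1;
    if (n1->ctxt[i] > n2->ctxt[i]) return 1;
  }
  return 0;
}

int main()
{
  Req req[3] = { {{7,2,9},11}, {{3,5,1},4}, {{3,5,1},8} };
  qsort(req, 3, sizeof(Req), ReqComp);           // identical contexts end up adjacent
  for (int i=0; i<3; i++)
    printf("%d %d %d -> %d\n", req[i].ctxt[0], req[i].ctxt[1], req[i].ctxt[2], req[i].wpred);
  return 0;
}

After sorting, the two requests sharing the context (3 5 1) are printed next to each other, which is what allows BlockFinish() to group them into the same forward bunch.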