forked from hschwenk/cslm-toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
369 changed files
with
120,203 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
|
||
Tools.o: Tools.cpp Tools.h | ||
Toolsgz.o: Toolsgz.cpp Tools.h Toolsgz.h | ||
MachConfig.o: MachConfig.cpp MachAvr.h MachCombined.h MachMulti.h Mach.h \ | ||
Tools.h Blas.h Timer.h MachConfig.h MachLin.h Shareable.h MachTab.h \ | ||
MachJoin.h MachLinRectif.h MachPar.h MachSeq.h MachSig.h MachSoftmax.h \ | ||
MachSoftmaxStable.h MachSoftmaxClass.h WordList.h MachSplit.h \ | ||
MachSplit1.h MachTanh.h MachCopy.h | ||
Mach.o: Mach.cpp Tools.h Mach.h Blas.h Timer.h MachCopy.h MachTab.h \ | ||
Shareable.h MachLin.h MachSig.h MachTanh.h MachSoftmax.h \ | ||
MachSoftmaxStable.h MachSoftmaxClass.h WordList.h MachLinRectif.h \ | ||
MachSeq.h MachMulti.h MachPar.h MachSplit.h MachSplit1.h MachJoin.h | ||
MachTab.o: MachTab.cpp Tools.h MachTab.h Mach.h Blas.h Timer.h \ | ||
Shareable.h | ||
MachCopy.o: MachCopy.cpp Tools.h MachCopy.h Mach.h Blas.h Timer.h | ||
MachLin.o: MachLin.cpp Tools.h MachLin.h Mach.h Blas.h Timer.h \ | ||
Shareable.h | ||
MachSig.o: MachSig.cpp Tools.h MachSig.h MachLin.h Mach.h Blas.h Timer.h \ | ||
Shareable.h | ||
MachTanh.o: MachTanh.cpp Tools.h MachTanh.h MachLin.h Mach.h Blas.h \ | ||
Timer.h Shareable.h | ||
MachSoftmax.o: MachSoftmax.cpp Tools.h MachSoftmax.h MachLin.h Mach.h \ | ||
Blas.h Timer.h Shareable.h | ||
MachSoftmaxStable.o: MachSoftmaxStable.cpp Tools.h MachSoftmaxStable.h \ | ||
MachLin.h Mach.h Blas.h Timer.h Shareable.h | ||
MachLinRectif.o: MachLinRectif.cpp Tools.h MachLinRectif.h MachLin.h \ | ||
Mach.h Blas.h Timer.h Shareable.h | ||
MachMulti.o: MachMulti.cpp Tools.h MachMulti.h Mach.h Blas.h Timer.h | ||
MachSeq.o: MachSeq.cpp Tools.h MachSeq.h MachMulti.h Mach.h Blas.h \ | ||
Timer.h | ||
MachPar.o: MachPar.cpp Tools.h MachTab.h Mach.h Blas.h Timer.h \ | ||
Shareable.h MachPar.h MachMulti.h | ||
MachSplit.o: MachSplit.cpp Tools.h MachSplit.h MachMulti.h Mach.h Blas.h \ | ||
Timer.h | ||
MachSplit1.o: MachSplit1.cpp Tools.h MachSplit1.h MachMulti.h Mach.h \ | ||
Blas.h Timer.h | ||
MachJoin.o: MachJoin.cpp Tools.h MachJoin.h MachMulti.h Mach.h Blas.h \ | ||
Timer.h | ||
Data.o: Data.cpp Tools.h Data.h DataFile.h WordList.h DataAscii.h \ | ||
DataAsciiClass.h DataMnist.h DataNgramBin.h DataPhraseBin.h | ||
DataFile.o: DataFile.cpp Tools.h Data.h DataFile.h WordList.h | ||
DataAscii.o: DataAscii.cpp Tools.h Data.h DataFile.h WordList.h \ | ||
DataAscii.h | ||
DataAsciiClass.o: DataAsciiClass.cpp Tools.h Data.h DataFile.h WordList.h \ | ||
DataAsciiClass.h DataAscii.h | ||
DataMnist.o: DataMnist.cpp Tools.h Data.h DataFile.h WordList.h \ | ||
DataMnist.h | ||
DataNgramBin.o: DataNgramBin.cpp Tools.h Data.h DataFile.h WordList.h \ | ||
DataNgramBin.h | ||
DataPhraseBin.o: DataPhraseBin.cpp Tools.h DataPhraseBin.h Data.h \ | ||
DataFile.h WordList.h | ||
ErrFct.o: ErrFct.cpp Tools.h ErrFct.h Mach.h Blas.h Timer.h Data.h \ | ||
DataFile.h WordList.h | ||
ErrFctMSE.o: ErrFctMSE.cpp Tools.h ErrFctMSE.h ErrFct.h Mach.h Blas.h \ | ||
Timer.h Data.h DataFile.h WordList.h | ||
ErrFctMCE.o: ErrFctMCE.cpp Tools.h ErrFctMCE.h ErrFct.h Mach.h Blas.h \ | ||
Timer.h Data.h DataFile.h WordList.h | ||
ErrFctCrossEnt.o: ErrFctCrossEnt.cpp Tools.h ErrFctCrossEnt.h ErrFct.h \ | ||
Mach.h Blas.h Timer.h Data.h DataFile.h WordList.h | ||
ErrFctSoftmCrossEntNgram.o: ErrFctSoftmCrossEntNgram.cpp Tools.h \ | ||
ErrFctSoftmCrossEntNgram.h ErrFct.h Mach.h Blas.h Timer.h Data.h \ | ||
DataFile.h WordList.h | ||
ErrFctSoftmCrossEntNgramMulti.o: ErrFctSoftmCrossEntNgramMulti.cpp \ | ||
Tools.h ErrFctSoftmCrossEntNgramMulti.h ErrFct.h Mach.h Blas.h Timer.h \ | ||
Data.h DataFile.h WordList.h ErrFctSoftmCrossEntNgram.h | ||
Hypo.o: Hypo.cpp Hypo.h Tools.h Toolsgz.h | ||
Lrate.o: Lrate.cpp Lrate.h Mach.h Tools.h Blas.h Timer.h | ||
NbestLM.o: NbestLM.cpp NbestLM.h Hypo.h Tools.h Toolsgz.h | ||
NbestCSLM.o: NbestCSLM.cpp Hypo.h Tools.h Toolsgz.h NbestCSLM.h NbestLM.h \ | ||
Mach.h Blas.h Timer.h TrainerNgramSlist.h ErrFct.h Data.h DataFile.h \ | ||
WordList.h DataNgramBin.h TrainerNgram.h Trainer.h Lrate.h BackoffLm.h | ||
Trainer.o: Trainer.cpp Tools.h Mach.h Blas.h Timer.h ErrFctMCE.h ErrFct.h \ | ||
Data.h DataFile.h WordList.h Trainer.h Lrate.h | ||
TrainerNgram.o: TrainerNgram.cpp Mach.h Tools.h Blas.h Timer.h \ | ||
TrainerNgram.h ErrFct.h Data.h DataFile.h WordList.h DataNgramBin.h \ | ||
Trainer.h Lrate.h | ||
TrainerNgramSlist.o: TrainerNgramSlist.cpp Tools.h Mach.h Blas.h Timer.h \ | ||
MachTab.h Shareable.h MachPar.h MachMulti.h MachSeq.h \ | ||
TrainerNgramSlist.h ErrFct.h Data.h DataFile.h WordList.h DataNgramBin.h \ | ||
TrainerNgram.h Trainer.h Lrate.h BackoffLm.h | ||
MachSoftmaxClass.o: MachSoftmaxClass.cpp MachSoftmaxClass.h Mach.h \ | ||
Tools.h Blas.h Timer.h MachLin.h Shareable.h MachSoftmax.h WordList.h \ | ||
MachSoftmaxStable.h | ||
ErrFctSoftmClassCrossEntNgram.o: ErrFctSoftmClassCrossEntNgram.cpp \ | ||
ErrFctSoftmClassCrossEntNgram.h ErrFct.h Tools.h Mach.h Blas.h Timer.h \ | ||
Data.h DataFile.h WordList.h MachSoftmaxClass.h MachLin.h Shareable.h \ | ||
MachSoftmax.h | ||
TrainerNgramClass.o: TrainerNgramClass.cpp TrainerNgramClass.h \ | ||
TrainerNgram.h Tools.h Mach.h Blas.h Timer.h ErrFct.h Data.h DataFile.h \ | ||
WordList.h DataNgramBin.h Trainer.h Lrate.h \ | ||
ErrFctSoftmClassCrossEntNgram.h MachSoftmaxClass.h MachLin.h Shareable.h \ | ||
MachSoftmax.h | ||
Shareable.o: Shareable.cpp Shareable.h | ||
WordList.o: WordList.cpp WordList.h Tools.h | ||
MachCombined.o: MachCombined.cpp Tools.h MachCombined.h MachMulti.h \ | ||
Mach.h Blas.h Timer.h | ||
MachAvr.o: MachAvr.cpp Tools.h MachAvr.h MachCombined.h MachMulti.h \ | ||
Mach.h Blas.h Timer.h | ||
Blas.o: Blas.c | ||
NbestLMKEN.o: NbestLMKEN.cpp NbestLMKEN.h NbestLM.h Hypo.h Tools.h \ | ||
Toolsgz.h | ||
BackoffLmKen.o: BackoffLmKen.cpp BackoffLmKen.h BackoffLm.h Tools.h \ | ||
WordList.h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* | ||
* This file is part of the continuous space language and translation model toolkit | ||
* for statistical machine translation and large vocabulary speech recognition. | ||
* | ||
* Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France | ||
* | ||
* The CSLM toolkit is free software; you can redistribute it and/or modify it | ||
* under the terms of the GNU Lesser General Public License version 3 as | ||
* published by the Free Software Foundation | ||
* | ||
* This library is distributed in the hope that it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License | ||
* for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public License | ||
* along with this library; if not, write to the Free Software Foundation, | ||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA | ||
* | ||
* | ||
*/ | ||
|
||
#ifndef _BackoffLm_h | ||
#define _BackoffLm_h | ||
|
||
#include <string> | ||
#include "Tools.h" // for type WordID | ||
|
||
// We must be very careful with the indices | ||
// - most LM toolkits have their own internal word list | ||
// - binary ngram data files use indices with respect to their word list | ||
// (ideally, this word list should be identical to the one of the LM!) | ||
// - the CSLM code with short list performs a mapping of the binary indices | ||
// of the datafiles according to the 1-gram frequency | ||
// | ||
// | ||
|
||
#define NULL_LN_PROB (1.0) // this value must not be possible as a normal return value of ln Prob | ||
|
||
class BackoffLm { | ||
private: | ||
public: | ||
BackoffLm() {}; | ||
virtual ~BackoffLm() {}; | ||
inline virtual int GetOrder() {return 0; }; // returns order of the loaded LM | ||
inline virtual WordID GetVocSize() {return 0; }; // returns size of the vocabulary | ||
virtual int GetSentenceIds(WordID *&wid, const std::string &sentence, bool bos, bool eos) {return 0; }; // gets WordID of words in sentence | ||
virtual REAL BoffPw(char **ctxt, char *w, int req_order) {return 0;} // gets backoff LM P(w|ctxt) from sequence of words | ||
virtual REAL BoffLnPw(char **ctxt, char *w, int req_order) {return -99;} // idem but ln of P(w|ctxt) | ||
virtual REAL BoffPid(REAL *ctxt, WordID predw, int req_order) {return 0;} // similar for sequences of CSLM indices | ||
virtual REAL BoffLnPid(REAL *ctxt, WordID predw, int req_order) {return -99;} | ||
virtual REAL BoffLnStd(WordID *ctxt, WordID predw, int req_order) {return -99; } // simple wrapper w/o mapping | ||
// req-order can be any value | ||
}; | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
/* | ||
* This file is part of the continuous space language and translation model toolkit | ||
* for statistical machine translation and large vocabulary speech recognition. | ||
* | ||
* Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France | ||
* | ||
* The CSLM toolkit is free software; you can redistribute it and/or modify it | ||
* under the terms of the GNU Lesser General Public License version 3 as | ||
* published by the Free Software Foundation | ||
* | ||
* This library is distributed in the hope that it will be useful, but WITHOUT | ||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License | ||
* for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public License | ||
* along with this library; if not, write to the Free Software Foundation, | ||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA | ||
* | ||
* | ||
*/ | ||
|
||
#include <cstdio> | ||
#include <iostream> | ||
#include "BackoffLmKen.h" | ||
using namespace std; | ||
using namespace lm::ngram; | ||
|
||
/**
 * Loads a KenLM back-off model and builds the index tables between the CSLM
 * word list and the KenLM vocabulary.
 *
 * @param p_fname name of the KenLM file; NULL or an empty string means
 *                "no back-off LM" (ken_ngram and ken_vocab stay NULL)
 * @param         (unnamed int) ignored by this implementation
 * @param wlist   CSLM word list whose entries are looked up in the KenLM vocabulary
 *
 * Words of wlist that KenLM does not know are reported on stderr and keep the
 * KenLM "not found" index in map_cslm2ken.
 * NOTE(review): entries of map_ken2wid for KenLM words absent from wlist keep
 * their value-initialized content - confirm that this is intended.
 */
BackoffLmKen::BackoffLmKen(char *p_fname, int, const WordList &wlist)
{
  ken_ngram = NULL;
  ken_vocab = NULL;
  if ((p_fname == NULL) || (p_fname[0] == '\0'))
    return; // no back-off file

  cout << " - reading back-off KENLM from file '" << p_fname << "'" << endl;
  // a plain new-expression never yields NULL (failures are reported by
  // exceptions), so no NULL test is needed afterwards
  ken_ngram = new ProbingModel(p_fname);
  ken_vocab = &(ken_ngram->GetVocabulary());

  const LMWordIndex ken_size = (ken_vocab->Bound() + 1);
  // the vocabulary size is printed through an explicit unsigned long cast so
  // that the format specifier matches whatever integer type LMWordIndex is
  printf(" found %d-gram with vocabulary of %lu words\n",
         (int) ken_ngram->Order(), (unsigned long) ken_size);

  // set up mapping from/to KENLM indices
  const WordList::WordIndex wlist_size = wlist.GetSize();
  map_cslm2ken.resize(wlist_size);
  map_ken2wid.resize(ken_size);

  size_t ci = 0; // position of the current word in the CSLM word list
  for (WordList::const_iterator iter = wlist.Begin(), end = wlist.End(); iter != end; ++iter, ++ci) {
    const LMWordIndex wi = ken_vocab->Index(iter->word);
    map_cslm2ken[ci] = wi;
    if (wi == ken_vocab->NotFound())
      fprintf(stderr,"word %s not found at pos %zu\n", iter->word, ci);
    else
      map_ken2wid[wi] = iter->id;
  }
}
|
||
/**
 * Frees the KenLM model and empties the internal tables.
 */
BackoffLmKen::~BackoffLmKen()
{
  // deleting a NULL pointer is a no-op, so this also covers the "no LM" case
  delete ken_ngram;

  map_cslm2ken.clear();
  wid_vect.clear();
}
|
||
/** | ||
* gets WordID of words in sentence | ||
* @param wid output table of WordID (allocated internally) | ||
* @param sentence input sentence | ||
* @param bos start sentence with BOS | ||
* @param eos end sentence with EOS | ||
* @return number of words | ||
*/ | ||
int BackoffLmKen::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, bool eos) | ||
{ | ||
if (NULL == ken_vocab) | ||
return 0; | ||
|
||
int nw = 0; | ||
wid_vect.clear(); | ||
|
||
// start sentence with BOS ? | ||
if (bos) { | ||
wid_vect.push_back(map_ken2wid[ken_vocab->BeginSentence()]); | ||
nw++; | ||
} | ||
|
||
istringstream iss(sentence); | ||
while (iss) { | ||
string s; | ||
iss >> s; | ||
if (!s.empty()) { | ||
wid_vect.push_back(map_ken2wid[ken_vocab->Index(s)]); | ||
nw++; | ||
} | ||
} | ||
|
||
// end sentence with EOS ? | ||
if (eos) { | ||
wid_vect.push_back(map_ken2wid[ken_vocab->EndSentence()]); | ||
nw++; | ||
} | ||
|
||
wid = &(wid_vect.front()); | ||
return nw; | ||
} | ||
|
||
/** | ||
* gets ln of backoff LM P(w|ctxt) from sequence of words | ||
*/ | ||
REAL BackoffLmKen::BoffLnPw(char **ctxt, char *w, int req_order) | ||
// gets LOG_e backoff LM proba from a sequence of CSLM indices | ||
// if the order of the back-off LM is smaller than we use the last n-1 words of the context | ||
// w1 w2 w3 -> w4 | ||
// \ 2-gram / | ||
// \-- 3-gram --/ | ||
// \---- 4-gram ----/ | ||
{ | ||
#ifdef DEBUG | ||
printf ("\nrequest KENLM %d-gram: %s ", req_order, ctxt[0]); | ||
for (int i = 1; i < (req_order - 1); i++) printf(", %s", ctxt[i]); | ||
printf(" -> %s \n", w); | ||
#endif | ||
if (NULL == ken_ngram) | ||
// return constant value if we have no LM | ||
return NULL_LN_PROB; | ||
|
||
State state(ken_ngram->NullContextState()), out_state; | ||
for (int i = 0; i < (req_order - 1); i++) { | ||
ken_ngram->Score(state, ken_vocab->Index(ctxt[i]), out_state); | ||
state = out_state; | ||
} | ||
|
||
// we need to convert from log_10 to ln | ||
return M_LN10 * ken_ngram->Score(state, ken_vocab->Index(w), out_state); | ||
} | ||
|
||
/** | ||
* gets ln of backoff LM P(w|ctxt) from sequence of CSLM indices | ||
*/ | ||
REAL BackoffLmKen::BoffLnPid(REAL *ctxt, WordID predw, int req_order) | ||
// gets LOG_e backoff LM proba from a sequence of CSLM indices | ||
// if the order of the back-off LM is smaller than we use the last n-1 words of the context | ||
// w1 w2 w3 -> w4 | ||
// \ 2-gram / | ||
// \-- 3-gram --/ | ||
// \---- 4-gram ----/ | ||
{ | ||
#ifdef DEBUG | ||
printf ("\nrequest KENLM %d-gram: %d ", req_order, (WordID) ctxt[0]); | ||
for (int i = 1; i < (req_order - 1); i++) printf(", %d", (WordID) ctxt[i]); | ||
printf(" -> %d \n", predw); | ||
#endif | ||
if (NULL == ken_ngram) | ||
// return constant value if we have no LM | ||
return NULL_LN_PROB; | ||
|
||
State state(ken_ngram->NullContextState()), out_state; | ||
for (int i = 0; i < (req_order - 1); i++) { | ||
ken_ngram->Score(state, map_cslm2ken[(WordID) ctxt[i]], out_state); | ||
state = out_state; | ||
} | ||
|
||
// we need to convert from log_10 to ln | ||
return M_LN10 * ken_ngram->Score(state, map_cslm2ken[predw], out_state); | ||
} | ||
|
||
/** | ||
* gets ln of backoff LM P(w|ctxt) from sequence of CSLM indices, without mapping | ||
*/ | ||
REAL BackoffLmKen::BoffLnStd(WordID *ctxt, WordID predw, int req_order) | ||
// gets LOG_e backoff LM proba from a sequence of CSLM indices | ||
// if the order of the back-off LM is smaller than we use the last n-1 words of the context | ||
// w1 w2 w3 -> w4 | ||
// \ 2-gram / | ||
// \-- 3-gram --/ | ||
// \---- 4-gram ----/ | ||
{ | ||
#ifdef DEBUG | ||
printf ("\nrequest KENLM %d-gram: %d ", req_order, ctxt[0]); | ||
for (int i = 1; i < (req_order - 1); i++) printf(", %d", ctxt[i]); | ||
printf(" -> %d \n", predw); | ||
#endif | ||
if (NULL == ken_ngram) | ||
// return constant value if we have no LM | ||
return NULL_LN_PROB; | ||
|
||
State state(ken_ngram->NullContextState()), out_state; | ||
for (int i = 0; i < (req_order - 1); i++) { | ||
ken_ngram->Score(state, ctxt[i], out_state); | ||
state = out_state; | ||
} | ||
|
||
// we need to convert from log_10 to ln | ||
return M_LN10 * ken_ngram->Score(state, predw, out_state); | ||
} |
Oops, something went wrong.