Skip to content

Commit

Permalink
added all the files
Browse files Browse the repository at this point in the history
  • Loading branch information
hschwenk committed Jun 30, 2015
1 parent 5f41481 commit a50175b
Show file tree
Hide file tree
Showing 369 changed files with 120,203 additions and 0 deletions.
103 changes: 103 additions & 0 deletions .depend
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@

Tools.o: Tools.cpp Tools.h
Toolsgz.o: Toolsgz.cpp Tools.h Toolsgz.h
MachConfig.o: MachConfig.cpp MachAvr.h MachCombined.h MachMulti.h Mach.h \
Tools.h Blas.h Timer.h MachConfig.h MachLin.h Shareable.h MachTab.h \
MachJoin.h MachLinRectif.h MachPar.h MachSeq.h MachSig.h MachSoftmax.h \
MachSoftmaxStable.h MachSoftmaxClass.h WordList.h MachSplit.h \
MachSplit1.h MachTanh.h MachCopy.h
Mach.o: Mach.cpp Tools.h Mach.h Blas.h Timer.h MachCopy.h MachTab.h \
Shareable.h MachLin.h MachSig.h MachTanh.h MachSoftmax.h \
MachSoftmaxStable.h MachSoftmaxClass.h WordList.h MachLinRectif.h \
MachSeq.h MachMulti.h MachPar.h MachSplit.h MachSplit1.h MachJoin.h
MachTab.o: MachTab.cpp Tools.h MachTab.h Mach.h Blas.h Timer.h \
Shareable.h
MachCopy.o: MachCopy.cpp Tools.h MachCopy.h Mach.h Blas.h Timer.h
MachLin.o: MachLin.cpp Tools.h MachLin.h Mach.h Blas.h Timer.h \
Shareable.h
MachSig.o: MachSig.cpp Tools.h MachSig.h MachLin.h Mach.h Blas.h Timer.h \
Shareable.h
MachTanh.o: MachTanh.cpp Tools.h MachTanh.h MachLin.h Mach.h Blas.h \
Timer.h Shareable.h
MachSoftmax.o: MachSoftmax.cpp Tools.h MachSoftmax.h MachLin.h Mach.h \
Blas.h Timer.h Shareable.h
MachSoftmaxStable.o: MachSoftmaxStable.cpp Tools.h MachSoftmaxStable.h \
MachLin.h Mach.h Blas.h Timer.h Shareable.h
MachLinRectif.o: MachLinRectif.cpp Tools.h MachLinRectif.h MachLin.h \
Mach.h Blas.h Timer.h Shareable.h
MachMulti.o: MachMulti.cpp Tools.h MachMulti.h Mach.h Blas.h Timer.h
MachSeq.o: MachSeq.cpp Tools.h MachSeq.h MachMulti.h Mach.h Blas.h \
Timer.h
MachPar.o: MachPar.cpp Tools.h MachTab.h Mach.h Blas.h Timer.h \
Shareable.h MachPar.h MachMulti.h
MachSplit.o: MachSplit.cpp Tools.h MachSplit.h MachMulti.h Mach.h Blas.h \
Timer.h
MachSplit1.o: MachSplit1.cpp Tools.h MachSplit1.h MachMulti.h Mach.h \
Blas.h Timer.h
MachJoin.o: MachJoin.cpp Tools.h MachJoin.h MachMulti.h Mach.h Blas.h \
Timer.h
Data.o: Data.cpp Tools.h Data.h DataFile.h WordList.h DataAscii.h \
DataAsciiClass.h DataMnist.h DataNgramBin.h DataPhraseBin.h
DataFile.o: DataFile.cpp Tools.h Data.h DataFile.h WordList.h
DataAscii.o: DataAscii.cpp Tools.h Data.h DataFile.h WordList.h \
DataAscii.h
DataAsciiClass.o: DataAsciiClass.cpp Tools.h Data.h DataFile.h WordList.h \
DataAsciiClass.h DataAscii.h
DataMnist.o: DataMnist.cpp Tools.h Data.h DataFile.h WordList.h \
DataMnist.h
DataNgramBin.o: DataNgramBin.cpp Tools.h Data.h DataFile.h WordList.h \
DataNgramBin.h
DataPhraseBin.o: DataPhraseBin.cpp Tools.h DataPhraseBin.h Data.h \
DataFile.h WordList.h
ErrFct.o: ErrFct.cpp Tools.h ErrFct.h Mach.h Blas.h Timer.h Data.h \
DataFile.h WordList.h
ErrFctMSE.o: ErrFctMSE.cpp Tools.h ErrFctMSE.h ErrFct.h Mach.h Blas.h \
Timer.h Data.h DataFile.h WordList.h
ErrFctMCE.o: ErrFctMCE.cpp Tools.h ErrFctMCE.h ErrFct.h Mach.h Blas.h \
Timer.h Data.h DataFile.h WordList.h
ErrFctCrossEnt.o: ErrFctCrossEnt.cpp Tools.h ErrFctCrossEnt.h ErrFct.h \
Mach.h Blas.h Timer.h Data.h DataFile.h WordList.h
ErrFctSoftmCrossEntNgram.o: ErrFctSoftmCrossEntNgram.cpp Tools.h \
ErrFctSoftmCrossEntNgram.h ErrFct.h Mach.h Blas.h Timer.h Data.h \
DataFile.h WordList.h
ErrFctSoftmCrossEntNgramMulti.o: ErrFctSoftmCrossEntNgramMulti.cpp \
Tools.h ErrFctSoftmCrossEntNgramMulti.h ErrFct.h Mach.h Blas.h Timer.h \
Data.h DataFile.h WordList.h ErrFctSoftmCrossEntNgram.h
Hypo.o: Hypo.cpp Hypo.h Tools.h Toolsgz.h
Lrate.o: Lrate.cpp Lrate.h Mach.h Tools.h Blas.h Timer.h
NbestLM.o: NbestLM.cpp NbestLM.h Hypo.h Tools.h Toolsgz.h
NbestCSLM.o: NbestCSLM.cpp Hypo.h Tools.h Toolsgz.h NbestCSLM.h NbestLM.h \
Mach.h Blas.h Timer.h TrainerNgramSlist.h ErrFct.h Data.h DataFile.h \
WordList.h DataNgramBin.h TrainerNgram.h Trainer.h Lrate.h BackoffLm.h
Trainer.o: Trainer.cpp Tools.h Mach.h Blas.h Timer.h ErrFctMCE.h ErrFct.h \
Data.h DataFile.h WordList.h Trainer.h Lrate.h
TrainerNgram.o: TrainerNgram.cpp Mach.h Tools.h Blas.h Timer.h \
TrainerNgram.h ErrFct.h Data.h DataFile.h WordList.h DataNgramBin.h \
Trainer.h Lrate.h
TrainerNgramSlist.o: TrainerNgramSlist.cpp Tools.h Mach.h Blas.h Timer.h \
MachTab.h Shareable.h MachPar.h MachMulti.h MachSeq.h \
TrainerNgramSlist.h ErrFct.h Data.h DataFile.h WordList.h DataNgramBin.h \
TrainerNgram.h Trainer.h Lrate.h BackoffLm.h
MachSoftmaxClass.o: MachSoftmaxClass.cpp MachSoftmaxClass.h Mach.h \
Tools.h Blas.h Timer.h MachLin.h Shareable.h MachSoftmax.h WordList.h \
MachSoftmaxStable.h
ErrFctSoftmClassCrossEntNgram.o: ErrFctSoftmClassCrossEntNgram.cpp \
ErrFctSoftmClassCrossEntNgram.h ErrFct.h Tools.h Mach.h Blas.h Timer.h \
Data.h DataFile.h WordList.h MachSoftmaxClass.h MachLin.h Shareable.h \
MachSoftmax.h
TrainerNgramClass.o: TrainerNgramClass.cpp TrainerNgramClass.h \
TrainerNgram.h Tools.h Mach.h Blas.h Timer.h ErrFct.h Data.h DataFile.h \
WordList.h DataNgramBin.h Trainer.h Lrate.h \
ErrFctSoftmClassCrossEntNgram.h MachSoftmaxClass.h MachLin.h Shareable.h \
MachSoftmax.h
Shareable.o: Shareable.cpp Shareable.h
WordList.o: WordList.cpp WordList.h Tools.h
MachCombined.o: MachCombined.cpp Tools.h MachCombined.h MachMulti.h \
Mach.h Blas.h Timer.h
MachAvr.o: MachAvr.cpp Tools.h MachAvr.h MachCombined.h MachMulti.h \
Mach.h Blas.h Timer.h
Blas.o: Blas.c
NbestLMKEN.o: NbestLMKEN.cpp NbestLMKEN.h NbestLM.h Hypo.h Tools.h \
Toolsgz.h
BackoffLmKen.o: BackoffLmKen.cpp BackoffLmKen.h BackoffLm.h Tools.h \
WordList.h
56 changes: 56 additions & 0 deletions BackoffLm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* This file is part of the continuous space language and translation model toolkit
* for statistical machine translation and large vocabulary speech recognition.
*
* Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
*
* The CSLM toolkit is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*
*
*/

#ifndef _BackoffLm_h
#define _BackoffLm_h

#include <string>
#include "Tools.h" // for type WordID

// We must be very careful with the indices
// - most LM toolkits have their own internal word list
// - binary ngram data files use indices with respect to their word list
// (ideally, this word list should be identical to the one of the LM!)
// - the CSLM code with short list performs a mapping of the binary indices
// of the datafiles according to the 1-gram frequency
//
//

// Sentinel used in place of a log-probability when no back-off LM is loaded
// (BackoffLmKen returns it from its BoffLn* queries in that case).
// ln P is never positive, so 1.0 cannot collide with a real result.
#define NULL_LN_PROB (1.0) // this value must not be possible as a normal return value of ln Prob

/**
 * Base class for back-off n-gram language models.
 *
 * Every query has a default implementation that returns a constant
 * (0 for counts and probabilities, -99 for log-probabilities); concrete
 * back-ends override the queries they implement.
 */
class BackoffLm {
public:
  BackoffLm() {}
  virtual ~BackoffLm() {}

  // order of the loaded LM
  virtual int GetOrder() { return 0; }

  // size of the vocabulary
  virtual WordID GetVocSize() { return 0; }

  // gets the WordID of each word in 'sentence' (optionally framed by BOS/EOS);
  // returns the number of ids
  virtual int GetSentenceIds(WordID *&wid, const std::string &sentence, bool bos, bool eos) { return 0; }

  // back-off LM P(w|ctxt) from a sequence of words
  virtual REAL BoffPw(char **ctxt, char *w, int req_order) { return 0; }

  // same as BoffPw, but ln of P(w|ctxt)
  virtual REAL BoffLnPw(char **ctxt, char *w, int req_order) { return -99; }

  // same queries for sequences of CSLM indices
  virtual REAL BoffPid(REAL *ctxt, WordID predw, int req_order) { return 0; }
  virtual REAL BoffLnPid(REAL *ctxt, WordID predw, int req_order) { return -99; }

  // simple wrapper without index mapping; req_order can be any value
  virtual REAL BoffLnStd(WordID *ctxt, WordID predw, int req_order) { return -99; }
};

#endif
205 changes: 205 additions & 0 deletions BackoffLmKen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
/*
* This file is part of the continuous space language and translation model toolkit
* for statistical machine translation and large vocabulary speech recognition.
*
* Copyright 2015, Holger Schwenk, LIUM, University of Le Mans, France
*
* The CSLM toolkit is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License version 3 as
* published by the Free Software Foundation
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*
*
*/

#include <cstdio>
#include <iostream>
#include "BackoffLmKen.h"
using namespace std;
using namespace lm::ngram;

/**
 * loads a KENLM back-off model and builds the index maps between the
 * CSLM word list and the KENLM vocabulary
 * @param p_fname file name of the KENLM model; NULL or "" means no back-off LM
 * @param wlist word list whose entries are looked up in the KENLM vocabulary
 * (the unnamed int parameter is not used)
 */
BackoffLmKen::BackoffLmKen(char *p_fname, int, const WordList &wlist)
{
  const bool no_file = (p_fname == NULL) || (p_fname[0] == '\0');
  if (no_file) {
    // no back-off file: leave the object in its "no LM" state
    ken_ngram = NULL;
    ken_vocab = NULL;
    return;
  }

  cout << " - reading back-off KENLM from file '" << p_fname << "'" << endl;
  ken_ngram = new ProbingModel(p_fname);
  if (NULL == ken_ngram) {
    cout << " error" << endl;
    ken_vocab = NULL;
    return;
  }

  ken_vocab = &(ken_ngram->GetVocabulary());
  LMWordIndex ken_size = (ken_vocab->Bound() + 1);
  printf(" found %d-gram with vocabulary of %d words\n", (int) ken_ngram->Order(), ken_size);

  // size both lookup tables up front: CSLM position -> KENLM index and
  // KENLM index -> word id of the word list
  WordList::WordIndex n_words = wlist.GetSize();
  map_cslm2ken.reserve(n_words);
  map_cslm2ken.resize(n_words);
  map_ken2wid.reserve(ken_size);
  map_ken2wid.resize(ken_size);

  const WordList::const_iterator last = wlist.End();
  size_t pos = 0;
  for (WordList::const_iterator entry = wlist.Begin(); entry != last; entry++, pos++) {
    LMWordIndex ken_idx = ken_vocab->Index(entry->word);
    map_cslm2ken[pos] = ken_idx;
    if (ken_idx != ken_vocab->NotFound())
      map_ken2wid[ken_idx] = entry->id;
    else
      fprintf(stderr,"word %s not found at pos %zu\n", entry->word, pos);
  }
}

/**
 * releases the KENLM model (if one was loaded) and empties the work vectors
 */
BackoffLmKen::~BackoffLmKen()
{
  // deleting a null pointer is a no-op, so no guard is needed
  delete ken_ngram;

  wid_vect.clear();
  map_cslm2ken.clear();
}

/**
 * gets WordID of words in sentence
 * @param wid output table of WordID (allocated internally, overwritten by the
 *            next call); set to NULL when the result is empty, left untouched
 *            when no LM vocabulary is available
 * @param sentence input sentence (words separated by whitespace)
 * @param bos start sentence with BOS
 * @param eos end sentence with EOS
 * @return number of words (0 when no LM vocabulary is available)
 */
int BackoffLmKen::GetSentenceIds(WordID *&wid, const string &sentence, bool bos, bool eos)
{
  if (NULL == ken_vocab)
    return 0;

  wid_vect.clear();

  // start sentence with BOS ?
  if (bos)
    wid_vect.push_back(map_ken2wid[ken_vocab->BeginSentence()]);

  // one id per whitespace-separated token
  istringstream iss(sentence);
  string tok;
  while (iss >> tok)
    wid_vect.push_back(map_ken2wid[ken_vocab->Index(tok)]);

  // end sentence with EOS ?
  if (eos)
    wid_vect.push_back(map_ken2wid[ken_vocab->EndSentence()]);

  // front() must not be called on an empty vector (undefined behaviour):
  // hand back NULL when there is nothing to point at
  if (wid_vect.empty())
    wid = NULL;
  else
    wid = &(wid_vect.front());

  return static_cast<int>(wid_vect.size());
}

/**
 * gets ln of backoff LM P(w|ctxt) from sequence of words
 * @param ctxt context words; entries 0 .. req_order-2 are read
 * @param w predicted word
 * @param req_order requested n-gram order (number of context words + 1)
 * @return ln P(w|ctxt), or NULL_LN_PROB when no LM is loaded
 */
REAL BackoffLmKen::BoffLnPw(char **ctxt, char *w, int req_order)
  // The context words are pushed through the LM one by one, starting from
  // the null context, and the predicted word is scored in the resulting state:
  //   w1 w2 w3 -> w4
  //          \ 2-gram /
  //      \-- 3-gram --/
  //  \---- 4-gram ----/
  // NOTE(review): when the LM order is smaller than req_order, the LM is
  // presumably left to keep only the last words of the context - confirm
{
#ifdef DEBUG
  printf ("\nrequest KENLM %d-gram: %s ", req_order, ctxt[0]);
  for (int k = 1; k < (req_order - 1); k++) printf(", %s", ctxt[k]);
  printf(" -> %s \n", w);
#endif
  // return constant value if we have no LM
  if (NULL == ken_ngram)
    return NULL_LN_PROB;

  const int ctxt_len = req_order - 1;
  State cur_state(ken_ngram->NullContextState());
  State next_state;
  for (int pos = 0; pos < ctxt_len; pos++) {
    ken_ngram->Score(cur_state, ken_vocab->Index(ctxt[pos]), next_state);
    cur_state = next_state;
  }

  // the LM score is in log_10, convert it to ln
  return M_LN10 * ken_ngram->Score(cur_state, ken_vocab->Index(w), next_state);
}

/**
 * gets ln of backoff LM P(w|ctxt) from sequence of CSLM indices
 * @param ctxt context as CSLM indices stored in REAL; entries 0 .. req_order-2 are read
 * @param predw CSLM index of the predicted word
 * @param req_order requested n-gram order (number of context words + 1)
 * @return ln P(w|ctxt), or NULL_LN_PROB when no LM is loaded
 */
REAL BackoffLmKen::BoffLnPid(REAL *ctxt, WordID predw, int req_order)
  // Every CSLM index is translated to a KENLM index through map_cslm2ken,
  // the context is pushed through the LM one word at a time starting from
  // the null context, and the predicted word is scored in the resulting state:
  //   w1 w2 w3 -> w4
  //          \ 2-gram /
  //      \-- 3-gram --/
  //  \---- 4-gram ----/
  // NOTE(review): when the LM order is smaller than req_order, the LM is
  // presumably left to keep only the last words of the context - confirm
{
#ifdef DEBUG
  printf ("\nrequest KENLM %d-gram: %d ", req_order, (WordID) ctxt[0]);
  for (int k = 1; k < (req_order - 1); k++) printf(", %d", (WordID) ctxt[k]);
  printf(" -> %d \n", predw);
#endif
  // return constant value if we have no LM
  if (NULL == ken_ngram)
    return NULL_LN_PROB;

  const int ctxt_len = req_order - 1;
  State cur_state(ken_ngram->NullContextState());
  State next_state;
  for (int pos = 0; pos < ctxt_len; pos++) {
    ken_ngram->Score(cur_state, map_cslm2ken[(WordID) ctxt[pos]], next_state);
    cur_state = next_state;
  }

  // the LM score is in log_10, convert it to ln
  return M_LN10 * ken_ngram->Score(cur_state, map_cslm2ken[predw], next_state);
}

/**
 * gets ln of backoff LM P(w|ctxt) from sequence of indices, without mapping
 * @param ctxt context indices, passed to the LM as they are; entries 0 .. req_order-2 are read
 * @param predw index of the predicted word, passed to the LM as it is
 * @param req_order requested n-gram order (number of context words + 1)
 * @return ln P(w|ctxt), or NULL_LN_PROB when no LM is loaded
 */
REAL BackoffLmKen::BoffLnStd(WordID *ctxt, WordID predw, int req_order)
  // The context is pushed through the LM one word at a time starting from
  // the null context, and the predicted word is scored in the resulting state:
  //   w1 w2 w3 -> w4
  //          \ 2-gram /
  //      \-- 3-gram --/
  //  \---- 4-gram ----/
  // NOTE(review): when the LM order is smaller than req_order, the LM is
  // presumably left to keep only the last words of the context - confirm
{
#ifdef DEBUG
  printf ("\nrequest KENLM %d-gram: %d ", req_order, ctxt[0]);
  for (int k = 1; k < (req_order - 1); k++) printf(", %d", ctxt[k]);
  printf(" -> %d \n", predw);
#endif
  // return constant value if we have no LM
  if (NULL == ken_ngram)
    return NULL_LN_PROB;

  const int ctxt_len = req_order - 1;
  State cur_state(ken_ngram->NullContextState());
  State next_state;
  for (int pos = 0; pos < ctxt_len; pos++) {
    ken_ngram->Score(cur_state, ctxt[pos], next_state);
    cur_state = next_state;
  }

  // the LM score is in log_10, convert it to ln
  return M_LN10 * ken_ngram->Score(cur_state, predw, next_state);
}
Loading

0 comments on commit a50175b

Please sign in to comment.