diff --git a/.classpath b/.classpath
index 2e6d8d18..5552ff34 100644
--- a/.classpath
+++ b/.classpath
@@ -18,25 +18,25 @@
@@ -96,11 +96,11 @@
@@ -112,12 +112,12 @@
diff --git a/src/java/integration/ivory/integration/wikipedia/SearchSequenceFiles.java b/src/java/integration/ivory/integration/wikipedia/SearchSequenceFiles.java
index 0bc603b9..f0d758cc 100644
--- a/src/java/integration/ivory/integration/wikipedia/SearchSequenceFiles.java
+++ b/src/java/integration/ivory/integration/wikipedia/SearchSequenceFiles.java
@@ -31,7 +31,7 @@
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 
 /**
  * Read sequence files, output key-value pairs that match specified key.
@@ -59,14 +59,14 @@ public SearchSequenceFiles() {
   }
 
   static class MyMapperTerm extends MapReduceBase implements
-      Mapper<IntWritable, HMapSFW, IntWritable, HMapSFW> {
+      Mapper<IntWritable, HMapStFW, IntWritable, HMapStFW> {
     private String[] keys;
 
     public void configure(JobConf job) {
       keys = job.get("keys").split(",");
     }
 
-    public void map(IntWritable key, HMapSFW value, OutputCollector<IntWritable, HMapSFW> output,
+    public void map(IntWritable key, HMapStFW value, OutputCollector<IntWritable, HMapStFW> output,
         Reporter reporter) throws IOException {
       for (String compareKey : keys) {
         int k = Integer.parseInt(compareKey);
@@ -131,8 +131,8 @@ public int run(String[] args) throws Exception {
     if (valueClassName.contains("HMapSFW")) {
       job.setMapperClass(MyMapperTerm.class);
-      job.setMapOutputValueClass(HMapSFW.class);
-      job.setOutputValueClass(HMapSFW.class);
+      job.setMapOutputValueClass(HMapStFW.class);
+      job.setOutputValueClass(HMapStFW.class);
     } else {
       job.setMapperClass(MyMapperInt.class);
       job.setMapOutputValueClass(WeightedIntDocVector.class);
diff --git a/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingCrosslingual.java b/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingCrosslingual.java
index 3be8e759..9211b816 100644
--- a/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingCrosslingual.java
+++ b/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingCrosslingual.java
@@ -19,7 +19,7 @@
 import org.junit.Test;
 
 import tl.lin.data.map.HMapIFW;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import tl.lin.data.map.MapIF;
 import tl.lin.data.map.MapKF;
@@ -233,7 +233,7 @@ public void runBuildIndexEnSide() throws Exception {
         "-input=" + enwikiEn + "/wt-term-doc-vectors",
         "-output=" + enwikiEn + "/test_wt-term-doc-vectors",
         "-keys=" + enTermDocVector1Id + "," + enTermDocVector2Id,
-        "-valueclass=" + HMapSFW.class.getCanonicalName()};
+        "-valueclass=" + HMapStFW.class.getCanonicalName()};
     IntegrationUtils.exec(Joiner.on(" ").join(args));
 
     args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"),
@@ -252,7 +252,7 @@ public void verifyTermDocVectorsEn() throws Exception {
     SequenceFile.Reader reader;
 
     IntWritable key = new IntWritable();
-    HMapSFW value = new HMapSFW();
+    HMapStFW value = new HMapStFW();
 
     reader = new SequenceFile.Reader(fs.getConf(),
         SequenceFile.Reader.file(new Path(enwikiEn + "/test_wt-term-doc-vectors/part-00000")));
@@ -365,7 +365,7 @@ public void runBuildIndexDeSide() throws Exception {
         "-input=" + dewikiEn + "/wt-term-doc-vectors",
         "-output=" + dewikiEn + "/test_wt-term-doc-vectors",
         "-keys=" + deTermDocVector1Id + "," + deTermDocVector2Id,
-        "-valueclass=" + HMapSFW.class.getCanonicalName()};
+        "-valueclass=" + HMapStFW.class.getCanonicalName()};
     IntegrationUtils.exec(Joiner.on(" ").join(args));
 
     args = new
String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"), @@ -384,7 +384,7 @@ public void verifyTermDocVectorsDe() throws Exception { SequenceFile.Reader reader; IntWritable key = new IntWritable(); - HMapSFW value = new HMapSFW(); + HMapStFW value = new HMapStFW(); reader = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(dewikiEn + "/test_wt-term-doc-vectors/part-00000"))); @@ -434,7 +434,7 @@ public void verifyIntDocVectorsDe() throws Exception { reader.close(); } - private void verifyTermDocVector(Map doc, HMapSFW value) { + private void verifyTermDocVector(Map doc, HMapStFW value) { assertTrue(value != null); for (Map.Entry entry : doc.entrySet()) { assertTrue(value.containsKey(entry.getKey())); diff --git a/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingMonolingual.java b/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingMonolingual.java index 2e23987a..becd00b3 100644 --- a/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingMonolingual.java +++ b/src/java/integration/ivory/integration/wikipedia/VerifyWikipediaProcessingMonolingual.java @@ -19,7 +19,7 @@ import org.junit.Test; import tl.lin.data.map.HMapIFW; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapIF; import tl.lin.data.map.MapKF; @@ -213,7 +213,7 @@ public void runBuildIndexGalago() throws Exception { "-input=" + galagoIndex + "/wt-term-doc-vectors", "-output=" + galagoIndex + "/test_wt-term-doc-vectors", "-keys=" + galagoTermDocVector1Id + "," + galagoTermDocVector2Id, - "-valueclass=" + HMapSFW.class.getCanonicalName() }; + "-valueclass=" + HMapStFW.class.getCanonicalName() }; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"), @@ -226,7 +226,7 @@ public void runBuildIndexGalago() throws Exception { System.out.println("verifyTermDocVectorsGalago"); IntWritable key1 = new IntWritable(); - HMapSFW value1 = new HMapSFW(); + HMapStFW value1 = new HMapStFW(); SequenceFile.Reader reader1 = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(galagoIndex + "/test_wt-term-doc-vectors/part-00000"))); @@ -327,7 +327,7 @@ public void runBuildIndexOpennlp() throws Exception { "-input=" + opennlpIndex + "/wt-term-doc-vectors", "-output=" + opennlpIndex + "/test_wt-term-doc-vectors", "-keys=" + opennlpTermDocVector1Id + "," + opennlpTermDocVector2Id, - "-valueclass=" + HMapSFW.class.getCanonicalName() }; + "-valueclass=" + HMapStFW.class.getCanonicalName() }; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("dist", "ivory"), @@ -340,7 +340,7 @@ public void runBuildIndexOpennlp() throws Exception { System.out.println("verifyTermDocVectorsOpennlp"); IntWritable key1 = new IntWritable(); - HMapSFW value1 = new HMapSFW(); + HMapStFW value1 = new HMapStFW(); SequenceFile.Reader reader1 = new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(new Path(opennlpIndex + "/test_wt-term-doc-vectors/part-00000"))); @@ -385,7 +385,7 @@ public void runBuildIndexOpennlp() throws Exception { reader2.close(); } - private void verifyTermDocVector(Map doc, HMapSFW value) { + private void verifyTermDocVector(Map doc, HMapStFW value) { assertTrue(value != null); for (Map.Entry entry : doc.entrySet()) { System.out.println("checking " + entry.getKey() + ": expected = " + entry.getValue() + ", actual = " + value.get(entry.getKey())); diff --git 
a/src/java/main/ivory/core/preprocess/BuildTargetLangWeightedIntDocVectors.java b/src/java/main/ivory/core/preprocess/BuildTargetLangWeightedIntDocVectors.java index 8a523fcf..4c0beb0b 100644 --- a/src/java/main/ivory/core/preprocess/BuildTargetLangWeightedIntDocVectors.java +++ b/src/java/main/ivory/core/preprocess/BuildTargetLangWeightedIntDocVectors.java @@ -44,7 +44,7 @@ import org.apache.log4j.Logger; import tl.lin.data.map.HMapIFW; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapKF; import edu.umd.cloud9.util.PowerTool; import edu.umd.hooka.Vocab; @@ -72,7 +72,7 @@ protected static enum Terms{ } private static class MyMapper extends MapReduceBase implements - Mapper { + Mapper { static IntWritable mDocno = new IntWritable(); private boolean normalize = false; @@ -102,7 +102,7 @@ public void configure(JobConf conf){ HMapIFW weightedVector = new HMapIFW(); float sum2; - public void map(IntWritable docno, HMapSFW doc, + public void map(IntWritable docno, HMapStFW doc, OutputCollector output, Reporter reporter) throws IOException { mDocno.set(docno.get()); diff --git a/src/java/main/ivory/core/preprocess/BuildTranslatedTermDocVectors.java b/src/java/main/ivory/core/preprocess/BuildTranslatedTermDocVectors.java index 15ed613b..ccf2dc6a 100644 --- a/src/java/main/ivory/core/preprocess/BuildTranslatedTermDocVectors.java +++ b/src/java/main/ivory/core/preprocess/BuildTranslatedTermDocVectors.java @@ -37,7 +37,7 @@ import org.apache.log4j.Logger; import tl.lin.data.map.HMapIFW; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapIF; import com.google.common.collect.Maps; @@ -66,7 +66,7 @@ protected static enum Docs { DBG, ZERO, SHORT, SHORTAfterTranslation, Total }; protected static enum DF { TransDf, NoDf } private static class MyMapperTrans extends MapReduceBase implements - Mapper { + Mapper { private ScoringModel model; // eVocabSrc is the English vocabulary for probability table e2f_Probs. @@ -209,7 +209,7 @@ public void configure(JobConf conf) { } public void map(IntWritable docno, TermDocVector doc, - OutputCollector output, Reporter reporter) throws IOException { + OutputCollector output, Reporter reporter) throws IOException { if (docno.get() % SAMPLING != 0) { return; // for generating sample document vectors. 
no sampling if SAMPLING=1 } @@ -236,7 +236,7 @@ public void map(IntWritable docno, TermDocVector doc, int docLen = CLIRUtils.translateTFs(doc, tfS, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, tokenizer, LOG); - HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabTrg, model, dict, dfTable, + HMapStFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabTrg, model, dict, dfTable, isNormalize, LOG); // If no translation of any word is in the target vocab, remove document i.e., our model @@ -354,9 +354,9 @@ public int runTool() throws Exception { conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); - conf.setMapOutputValueClass(HMapSFW.class); + conf.setMapOutputValueClass(HMapStFW.class); conf.setOutputKeyClass(IntWritable.class); - conf.setOutputValueClass(HMapSFW.class); + conf.setOutputValueClass(HMapStFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(MyMapperTrans.class); diff --git a/src/java/main/ivory/core/preprocess/BuildWeightedTermDocVectors.java b/src/java/main/ivory/core/preprocess/BuildWeightedTermDocVectors.java index a5bc4560..af7525b2 100644 --- a/src/java/main/ivory/core/preprocess/BuildWeightedTermDocVectors.java +++ b/src/java/main/ivory/core/preprocess/BuildWeightedTermDocVectors.java @@ -48,7 +48,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapKF; import com.google.common.collect.Maps; @@ -61,7 +61,7 @@ public class BuildWeightedTermDocVectors extends PowerTool { protected static enum Docs { Total, ZERO, SHORT } private static class MyMapper extends MapReduceBase implements - Mapper { + Mapper { static IntWritable mDocno = new IntWritable(); private static DocLengthTable mDLTable; @@ -72,7 +72,7 @@ private static class MyMapper extends MapReduceBase implements private boolean normalize = false; DefaultFrequencySortedDictionary dict; DfTableArray dfTable; - HMapSFW weightedVector = new HMapSFW(); + HMapStFW weightedVector = new HMapStFW(); String term; float wt, sum2; @@ -162,7 +162,7 @@ public void configure(JobConf conf){ } public void map(IntWritable docno, LazyTermDocVector doc, - OutputCollector output, Reporter reporter) + OutputCollector output, Reporter reporter) throws IOException { mDocno.set(docno.get()); int docLen = mDLTable.getDocLength(mDocno.get()); @@ -288,10 +288,10 @@ public int runTool() throws Exception { FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); - conf.setMapOutputValueClass(HMapSFW.class); + conf.setMapOutputValueClass(HMapStFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); - conf.setOutputValueClass(HMapSFW.class); + conf.setOutputValueClass(HMapStFW.class); LOG.info("Running job: "+conf.getJobName()); diff --git a/src/java/main/ivory/core/util/CLIRUtils.java b/src/java/main/ivory/core/util/CLIRUtils.java index b8e7b0b9..a3f7d705 100644 --- a/src/java/main/ivory/core/util/CLIRUtils.java +++ b/src/java/main/ivory/core/util/CLIRUtils.java @@ -43,8 +43,8 @@ import tl.lin.data.map.HMapIF; import tl.lin.data.map.HMapIFW; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; import tl.lin.data.map.MapKF; import tl.lin.data.pair.PairOfFloatString; import 
tl.lin.data.pair.PairOfFloats; @@ -147,7 +147,7 @@ public static float cosine(HMapIFW vectorA, HMapIFW vectorB) { * @return * cosine score */ - public static float cosine(HMapSFW vectorA, HMapSFW vectorB) { + public static float cosine(HMapStFW vectorA, HMapStFW vectorB) { float sum = 0, magA = 0, magB = 0; for(tl.lin.data.map.MapKF.Entry e : vectorA.entrySet()){ float value = e.getValue(); @@ -176,7 +176,7 @@ public static float cosine(HMapSFW vectorA, HMapSFW vectorB) { * @return * cosine score */ - public static float cosineNormalized(HMapSFW vectorA, HMapSFW vectorB) { + public static float cosineNormalized(HMapStFW vectorA, HMapStFW vectorB) { float sum = 0; for(tl.lin.data.map.MapKF.Entry e : vectorA.entrySet()){ float value = e.getValue(); @@ -234,7 +234,7 @@ public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_ * @return * mapping from E-terms to their computed df values */ - public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapSIW dfs){ + public static HMapIFW translateDFTable(Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_probs, HMapStIW dfs){ HMapIFW transDfTable = new HMapIFW(); for(int e=1;e entry : tfTable.entrySet()){ // retrieve term string, tf and df @@ -1125,32 +1125,32 @@ private static void combineTTables(String ttableFile, String srcEVocabFile, Stri * */ - public static String[] computeFeaturesF1(HMapSFW eVector, HMapSFW translatedFVector, float eSentLength, float fSentLength) { + public static String[] computeFeaturesF1(HMapStFW eVector, HMapStFW translatedFVector, float eSentLength, float fSentLength) { return computeFeatures(1, null, null, null, null, null, eVector, null, translatedFVector, eSentLength, fSentLength, null, null, null, null, null, null, 0); } - public static String[] computeFeaturesF2(HMapSIW eSrcTfs, HMapSFW eVector, HMapSIW fSrcTfs, HMapSFW translatedFVector, float eSentLength, float fSentLength, + public static String[] computeFeaturesF2(HMapStIW eSrcTfs, HMapStFW eVector, HMapStIW fSrcTfs, HMapStFW translatedFVector, float eSentLength, float fSentLength, Vocab eVocabSrc, Vocab eVocabTrg, Vocab fVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_Probs, TTable_monolithic_IFAs f2e_Probs, float prob){ return computeFeatures(2, null, null, null, null, eSrcTfs, eVector, fSrcTfs, translatedFVector, eSentLength, fSentLength, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, prob); } public static String[] computeFeaturesF3(String fSentence, String eSentence, Tokenizer fTokenizer, Tokenizer eTokenizer, - HMapSIW eSrcTfs, HMapSFW eVector, HMapSIW fSrcTfs, HMapSFW translatedFVector, float eSentLength, float fSentLength, + HMapStIW eSrcTfs, HMapStFW eVector, HMapStIW fSrcTfs, HMapStFW translatedFVector, float eSentLength, float fSentLength, Vocab eVocabSrc, Vocab eVocabTrg, Vocab fVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_Probs, TTable_monolithic_IFAs f2e_Probs, float prob){ return computeFeatures(3, fSentence, eSentence, fTokenizer, eTokenizer, eSrcTfs, eVector, fSrcTfs, translatedFVector, eSentLength, fSentLength, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, prob); } public static String[] computeFeatures(int featSet, String fSentence, String eSentence, Tokenizer fTokenizer, Tokenizer eTokenizer, - HMapSIW eSrcTfs, HMapSFW eVector, HMapSIW fSrcTfs, HMapSFW translatedFVector, float eSentLength, float fSentLength, + HMapStIW eSrcTfs, HMapStFW eVector, HMapStIW fSrcTfs, HMapStFW translatedFVector, float 
eSentLength, float fSentLength, Vocab eVocabSrc, Vocab eVocabTrg, Vocab fVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_Probs, TTable_monolithic_IFAs f2e_Probs, float prob){ return computeFeatures(featSet, fSentence, eSentence, fTokenizer, eTokenizer, eSrcTfs, eVector, fSrcTfs, translatedFVector, eSentLength, fSentLength, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, prob, logger); } public static String[] computeFeatures(int featSet, String fSentence, String eSentence, Tokenizer fTokenizer, Tokenizer eTokenizer, - HMapSIW eSrcTfs, HMapSFW eVector, HMapSIW fSrcTfs, HMapSFW translatedFVector, float eSentLength, float fSentLength, + HMapStIW eSrcTfs, HMapStFW eVector, HMapStIW fSrcTfs, HMapStFW translatedFVector, float eSentLength, float fSentLength, Vocab eVocabSrc, Vocab eVocabTrg, Vocab fVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2f_Probs, TTable_monolithic_IFAs f2e_Probs, float prob, Logger sLogger) { List features = new ArrayList(); if(fSentLength == 0 || eSentLength == 0){ @@ -1227,7 +1227,7 @@ private static int getNumberOfWordsWithNDigits(int N, String[] tokens) { return cnt; } - private static float getWordTransRatio(HMapSIW eSrcTfs, HMapSIW fSrcTfs, Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2fProbs, float probThreshold) { + private static float getWordTransRatio(HMapStIW eSrcTfs, HMapStIW fSrcTfs, Vocab eVocabSrc, Vocab fVocabTrg, TTable_monolithic_IFAs e2fProbs, float probThreshold) { // if there are k occurences of a term w on source side, and m occurrences of a possible translation of w on target side, // instead of saying that w has a translation on target side, we say w has max(1,m/k) translations to downweight cases where m sentences; - ArrayListWritable vectors = new ArrayListWritable(); + ArrayListWritable vectors = new ArrayListWritable(); ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable(); // identify sentences in document, filter out ones below MinSentLength threshold // convert each sentence into a tf-idf vector, using general DF map for collection and a heuristic for avg. doc length diff --git a/src/java/main/ivory/lsh/bitext/FilterSentencePairs.java b/src/java/main/ivory/lsh/bitext/FilterSentencePairs.java index e8f0febb..d5f335eb 100644 --- a/src/java/main/ivory/lsh/bitext/FilterSentencePairs.java +++ b/src/java/main/ivory/lsh/bitext/FilterSentencePairs.java @@ -35,8 +35,8 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; /** Step 2 of the bitext extraction algorithm. 
@@ -73,7 +73,7 @@ private static class MyMapper extends MapReduceBase implements private PreprocessHelper helper; private String eSent, fSent; private int eLen, fLen; - private HMapSFW eVector, fVector; + private HMapStFW eVector, fVector; private Tokenizer eTok, fTok; private Text outSent1, outSent2; private float classifierThreshold; @@ -111,9 +111,9 @@ public void map(LongWritable key, Text sentencePair, OutputCollector fSent = sentences[0]; eLen = eTok.getNumberTokens(eSent); fLen = fTok.getNumberTokens(fSent); - HMapSIW eSrcTfs = new HMapSIW(); + HMapStIW eSrcTfs = new HMapStIW(); eVector = helper.createEDocVector(eSent, eSrcTfs); - HMapSIW fSrcTfs = new HMapSIW(); + HMapStIW fSrcTfs = new HMapStIW(); fVector = helper.createFDocVector(fSent, fSrcTfs); if (eVector == null || fVector == null) { diff --git a/src/java/main/ivory/lsh/bitext/FindParallelSentencePairs.java b/src/java/main/ivory/lsh/bitext/FindParallelSentencePairs.java index a9df372d..587526a2 100644 --- a/src/java/main/ivory/lsh/bitext/FindParallelSentencePairs.java +++ b/src/java/main/ivory/lsh/bitext/FindParallelSentencePairs.java @@ -41,7 +41,7 @@ import tl.lin.data.array.ArrayListOfIntsWritable; import tl.lin.data.array.ArrayListWritable; import tl.lin.data.map.HMapIV; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfInts; /** @@ -218,7 +218,7 @@ private static class MyReducer extends MapReduceBase implements Reducer{ private int fDocno, eDocno; private int classifierPositiveId; - private ArrayListWritable fVectors, eVectors; + private ArrayListWritable fVectors, eVectors; private ArrayListWritable fSentences, eSentences; private PreprocessHelper helper; // for modularity, helper provides methods to preprocess data private float classifierThreshold; @@ -242,8 +242,8 @@ public void configure(JobConf job) { throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: "+classifierThreshold); } - eVectors = new ArrayListWritable(); - fVectors = new ArrayListWritable(); + eVectors = new ArrayListWritable(); + fVectors = new ArrayListWritable(); eSentences = new ArrayListWritable(); fSentences = new ArrayListWritable(); } @@ -303,11 +303,11 @@ public void reduce(PairOfInts docnoPair, Iterator wikiSentence // classify each e-f sentence pair in the candidate set for (int f = 0; f < fVectors.size(); f++) { - HMapSFW fVector = fVectors.get(f); + HMapStFW fVector = fVectors.get(f); int fSentLength = fSentences.get(f).getLength(); for (int e = 0; e < eVectors.size(); e++) { - HMapSFW eVector = eVectors.get(e); + HMapStFW eVector = eVectors.get(e); int eSentLength = eSentences.get(e).getLength(); if (eSentLength > 2 * fSentLength || fSentLength > 2 * eSentLength) { diff --git a/src/java/main/ivory/lsh/bitext/FindParallelSentencePairsOld.java b/src/java/main/ivory/lsh/bitext/FindParallelSentencePairsOld.java index 06d7be33..6f62e010 100644 --- a/src/java/main/ivory/lsh/bitext/FindParallelSentencePairsOld.java +++ b/src/java/main/ivory/lsh/bitext/FindParallelSentencePairsOld.java @@ -37,7 +37,7 @@ import tl.lin.data.array.ArrayListOfIntsWritable; import tl.lin.data.array.ArrayListWritable; import tl.lin.data.map.HMapIV; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfInts; import edu.umd.cloud9.collection.Indexable; import edu.umd.cloud9.collection.wikipedia.WikipediaPage; @@ -184,7 +184,7 @@ public void map(Writable docnoKey, Indexable page, OutputCollector sentences; - ArrayListWritable vectors = new 
ArrayListWritable(); + ArrayListWritable vectors = new ArrayListWritable(); ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable(); try { if(lang.equals("en")){ @@ -235,7 +235,7 @@ private static class MyReducer extends MapReduceBase implements Reducer{ private int fDocno, eDocno; private int classifierPositiveId; - private ArrayListWritable fVectors, eVectors; + private ArrayListWritable fVectors, eVectors; private ArrayListWritable fSentences, eSentences; private PreprocessHelper helper; // for modularity, helper provides methods to preprocess data private float classifierThreshold; @@ -259,8 +259,8 @@ public void configure(JobConf job) { throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: "+classifierThreshold); } - eVectors = new ArrayListWritable(); - fVectors = new ArrayListWritable(); + eVectors = new ArrayListWritable(); + fVectors = new ArrayListWritable(); eSentences = new ArrayListWritable(); fSentences = new ArrayListWritable(); } @@ -323,11 +323,11 @@ public void reduce(PairOfInts docnoPair, Iterator wikiTexts, // classify each e-f sentence pair in the candidate set for (int f = 0; f < fVectors.size(); f++) { - HMapSFW fVector = fVectors.get(f); + HMapStFW fVector = fVectors.get(f); int fSentLength = fSentences.get(f).getLength(); for (int e = 0; e < eVectors.size(); e++) { - HMapSFW eVector = eVectors.get(e); + HMapStFW eVector = eVectors.get(e); int eSentLength = eSentences.get(e).getLength(); if (eSentLength > 2 * fSentLength || fSentLength > 2 * eSentLength) { diff --git a/src/java/main/ivory/lsh/bitext/PreprocessHelper.java b/src/java/main/ivory/lsh/bitext/PreprocessHelper.java index d710b303..3450ddfc 100644 --- a/src/java/main/ivory/lsh/bitext/PreprocessHelper.java +++ b/src/java/main/ivory/lsh/bitext/PreprocessHelper.java @@ -29,8 +29,8 @@ import tl.lin.data.array.ArrayListOfIntsWritable; import tl.lin.data.array.ArrayListWritable; import tl.lin.data.map.HMapIFW; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; import tl.lin.data.map.MapKI; import com.google.common.collect.Maps; @@ -53,7 +53,7 @@ public class PreprocessHelper { private DfTableArray dfTable; private DefaultFrequencySortedDictionary dict; private final Logger sLogger = Logger.getLogger(PreprocessHelper.class); - private static final HMapSIW lang2AvgSentLen = new HMapSIW(); + private static final HMapStIW lang2AvgSentLen = new HMapStIW(); static { // took average # of tokens per sentence in Wikipedia data lang2AvgSentLen.put("en",21); @@ -218,11 +218,11 @@ private void loadEModels(JobConf conf) throws Exception { dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs); } - public HMapSFW createFDocVector(String sentence) { - return createFDocVector(sentence, new HMapSIW()); + public HMapStFW createFDocVector(String sentence) { + return createFDocVector(sentence, new HMapStIW()); } - public HMapSFW createFDocVector(String sentence, HMapSIW term2Tf) { + public HMapStFW createFDocVector(String sentence, HMapStIW term2Tf) { String[] terms = fTok.processContent(sentence); for(String term : terms){ @@ -238,7 +238,7 @@ public HMapSFW createFDocVector(String sentence, HMapSIW term2Tf) { transTermTf = CLIRUtils.updateTFsByTerm(fTerm, tf, transTermTf, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, eTok, sLogger); } - HMapSFW weightedVector = CLIRUtils.createTermDocVector(terms.length, transTermTf, eVocabTrg, fScoreFn, dict, dfTable, true, sLogger); + 
HMapStFW weightedVector = CLIRUtils.createTermDocVector(terms.length, transTermTf, eVocabTrg, fScoreFn, dict, dfTable, true, sLogger); // don't count numbers for the min #terms constraint since Wikipedia has "sentences" full of numbers that doesn't make any sense int numNonNumbers = 0; @@ -254,12 +254,12 @@ public HMapSFW createFDocVector(String sentence, HMapSIW term2Tf) { } } - public HMapSFW createEDocVector(String sentence) { - return createEDocVector(sentence, new HMapSIW()); + public HMapStFW createEDocVector(String sentence) { + return createEDocVector(sentence, new HMapStIW()); } - public HMapSFW createEDocVector(String sentence, HMapSIW term2Tf) { - HMapSFW weightedVector = new HMapSFW(); + public HMapStFW createEDocVector(String sentence, HMapStIW term2Tf) { + HMapStFW weightedVector = new HMapStFW(); String[] terms = eTok.processContent(sentence); for(String term : terms){ @@ -281,7 +281,7 @@ public HMapSFW createEDocVector(String sentence, HMapSIW term2Tf) { } } - public ArrayListWritable getESentences(String text, ArrayListWritable vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { + public ArrayListWritable getESentences(String text, ArrayListWritable vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { ArrayListWritable sentences = new ArrayListWritable(); String[] lines = text.split("\n"); @@ -295,7 +295,7 @@ public ArrayListWritable getESentences(String text, ArrayListWritable= MinSentenceLength){ - HMapSFW vector = createEDocVector(sent.toString()); + HMapStFW vector = createEDocVector(sent.toString()); if(vector != null){ vectors.add(vector); sentences.add(new Text(sent)); @@ -308,7 +308,7 @@ public ArrayListWritable getESentences(String text, ArrayListWritable getFSentences(String text, ArrayListWritable vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { + public ArrayListWritable getFSentences(String text, ArrayListWritable vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException { // sLogger.setLevel(Level.DEBUG); sLogger.debug("text length="+text.length()); @@ -341,7 +341,7 @@ public ArrayListWritable getFSentences(String text, ArrayListWritable= MinSentenceLength) { - HMapSFW vector = createFDocVector(sent); + HMapStFW vector = createFDocVector(sent); if (vector != null) { vectors.add(vector); sentences.add(new Text(sent)); diff --git a/src/java/main/ivory/lsh/data/WikiDocInfo.java b/src/java/main/ivory/lsh/data/WikiDocInfo.java index f1e6304e..99a6c93a 100644 --- a/src/java/main/ivory/lsh/data/WikiDocInfo.java +++ b/src/java/main/ivory/lsh/data/WikiDocInfo.java @@ -8,25 +8,25 @@ import org.apache.hadoop.io.Writable; import tl.lin.data.array.ArrayListWritable; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; public class WikiDocInfo implements Writable { int langID; - ArrayListWritable vectors; + ArrayListWritable vectors; ArrayListWritable sentences; public WikiDocInfo() { super(); } - public WikiDocInfo(int i1, ArrayListWritable t, ArrayListWritable v){//, ArrayListOfIntsWritable l) { + public WikiDocInfo(int i1, ArrayListWritable t, ArrayListWritable v){//, ArrayListOfIntsWritable l) { langID = i1; vectors = v; sentences = t; } public void readFields(DataInput in) { - vectors = new ArrayListWritable(); + 
vectors = new ArrayListWritable(); sentences = new ArrayListWritable(); try { @@ -56,7 +56,7 @@ public boolean equals(Object other){ return (p.getLangID()==getLangID() && (p.getVectors()).equals(this.getVectors()) && (p.getSentences()).equals(this.getSentences())); } - public ArrayListWritable getVectors() { + public ArrayListWritable getVectors() { return vectors; } @@ -68,7 +68,7 @@ public int getLangID() { return langID; } - public void set(int n1, ArrayListWritable vectors, ArrayListWritable sentences) { + public void set(int n1, ArrayListWritable vectors, ArrayListWritable sentences) { this.langID = n1; this.vectors = vectors; this.sentences = sentences; diff --git a/src/java/main/ivory/lsh/data/WikiSentenceInfo.java b/src/java/main/ivory/lsh/data/WikiSentenceInfo.java index 7f24c9fc..dd695ac4 100644 --- a/src/java/main/ivory/lsh/data/WikiSentenceInfo.java +++ b/src/java/main/ivory/lsh/data/WikiSentenceInfo.java @@ -7,18 +7,18 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; public class WikiSentenceInfo implements Writable { int langID; Text sentence; - HMapSFW vector; + HMapStFW vector; public WikiSentenceInfo() { super(); } - public WikiSentenceInfo(int i1, Text t, HMapSFW v){ + public WikiSentenceInfo(int i1, Text t, HMapStFW v){ langID = i1; sentence = t; vector = v; @@ -35,7 +35,7 @@ public void readFields(DataInput in) { try { sentence = new Text(); sentence.readFields(in); - vector = new HMapSFW(); + vector = new HMapStFW(); vector.readFields(in); } catch (IOException e) { throw new RuntimeException("Could not read vectors/sentences in WikiSentenceInfo"); @@ -58,7 +58,7 @@ public Text getSentence() { return sentence; } - public HMapSFW getVector() { + public HMapStFW getVector() { return vector; } @@ -66,7 +66,7 @@ public int getLangID() { return langID; } - public void set(int n1, Text sentence, HMapSFW vector) { + public void set(int n1, Text sentence, HMapStFW vector) { this.langID = n1; this.sentence = sentence; this.vector = vector; diff --git a/src/java/main/ivory/lsh/eval/BitextClassifierUtils.java b/src/java/main/ivory/lsh/eval/BitextClassifierUtils.java index 6883d50e..8db38af9 100644 --- a/src/java/main/ivory/lsh/eval/BitextClassifierUtils.java +++ b/src/java/main/ivory/lsh/eval/BitextClassifierUtils.java @@ -30,8 +30,8 @@ import tl.lin.data.array.ArrayListOfInts; import tl.lin.data.map.HMapIFW; import tl.lin.data.map.HMapIIW; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; import tl.lin.data.map.MapKF; import tl.lin.data.map.MapKI; import edu.umd.hooka.Vocab; @@ -45,26 +45,26 @@ * */ public class BitextClassifierUtils { - static List fDocs = new ArrayList(); - static List fDocTfs = new ArrayList(); + static List fDocs = new ArrayList(); + static List fDocTfs = new ArrayList(); static List fSents = new ArrayList(); - static List eDocs = new ArrayList(); - static List eDocTfs = new ArrayList(); + static List eDocs = new ArrayList(); + static List eDocTfs = new ArrayList(); static List eSents = new ArrayList(); static ArrayListOfInts enSentLengths = new ArrayListOfInts(); static ArrayListOfInts deSentLengths = new ArrayListOfInts(); - static HMapSIW numSentencesPerDocE; - static HMapSIW numSentencesPerDocF; + static HMapStIW numSentencesPerDocE; + static HMapStIW numSentencesPerDocF; - static List gDocs = new ArrayList(); - static HMapSIW dfE = new HMapSIW(); - static HMapSIW dfD = new 
HMapSIW(); - static HMapSIW dfG = new HMapSIW(); + static List gDocs = new ArrayList(); + static HMapStIW dfE = new HMapStIW(); + static HMapStIW dfD = new HMapStIW(); + static HMapStIW dfG = new HMapStIW(); - static HMapSIW fTitle2SentCnt = new HMapSIW(); - static HMapSIW eTitle2SentCnt = new HMapSIW(); + static HMapStIW fTitle2SentCnt = new HMapStIW(); + static HMapStIW eTitle2SentCnt = new HMapStIW(); static HMapIIW parallelPairs = new HMapIIW(); static float avgFDocLeng; @@ -75,8 +75,8 @@ public class BitextClassifierUtils { static TTable_monolithic_IFAs f2e_Probs, e2f_Probs; private static Options options; - private List translateDocVectors(String eLang, - String eTokenizerModelFile, String eStopwordsFile, List docs, float avgLen, HMapSIW transDfTable) { + private List translateDocVectors(String eLang, + String eTokenizerModelFile, String eStopwordsFile, List docs, float avgLen, HMapStIW transDfTable) { Bm25 mModel = new Bm25(); // set number of docs mModel.setDocCount(docs.size()); @@ -84,12 +84,12 @@ private List translateDocVectors(String eLang, // set average doc length mModel.setAvgDocLength(avgLen); - List transDocs = new ArrayList(); + List transDocs = new ArrayList(); Tokenizer tokenizer = TokenizerFactory.createTokenizer(eLang, eTokenizerModelFile, true, eStopwordsFile, eStopwordsFile + ".stemmed", null); // translate doc texts here - for (HMapSIW deDoc : docs) { + for (HMapStIW deDoc : docs) { HMapIFW tfS = new HMapIFW(); int docLen = 0; try { @@ -98,7 +98,7 @@ private List translateDocVectors(String eLang, } catch (IOException e) { e.printStackTrace(); } - HMapSFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabTrg, mModel, dfE, true, null); + HMapStFW v = CLIRUtils.createTermDocVector(docLen, tfS, eVocabTrg, mModel, dfE, true, null); // System.out.println("f"+(n++)+" : " + v); transDocs.add(v); @@ -137,13 +137,13 @@ private void readWikiSentences(String eReadFile, String fReadFile, String pairsF } } - private float readLines(BufferedReader reader, Tokenizer tokenizer, HMapSIW title2SentCnt, ArrayListOfInts sentLengths, - List sentTfs, List sents, HMapSIW dfTable) throws IOException { + private float readLines(BufferedReader reader, Tokenizer tokenizer, HMapStIW title2SentCnt, ArrayListOfInts sentLengths, + List sentTfs, List sents, HMapStIW dfTable) throws IOException { String line = null; boolean isNewDoc = true; int cnt = 0; float sumLengths = 0; - HMapSIW sent = new HMapSIW(); + HMapStIW sent = new HMapStIW(); while ((line = reader.readLine()) != null) { line = line.trim(); @@ -187,8 +187,8 @@ private void readSentences(int sentsPerDoc, String eReadFile, String fReadFile, try { BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8")); BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8")); - HMapSIW fDoc = new HMapSIW(); - HMapSIW eDoc = new HMapSIW(); + HMapStIW fDoc = new HMapStIW(); + HMapStIW eDoc = new HMapStIW(); String eLine = null, fLine = null; int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0; @@ -231,8 +231,8 @@ private void readSentences(int sentsPerDoc, String eReadFile, String fReadFile, cntFDocs++; // reset variables - fDoc = new HMapSIW(); - eDoc = new HMapSIW(); + fDoc = new HMapStIW(); + eDoc = new HMapStIW(); numSents = 0; lastDocLenE = 0; lastDocLenF = 0; @@ -255,8 +255,8 @@ private void readSentences(int sentsPerDoc, String eReadFile, String fReadFile, } } - private List buildDocVectors(List 
term2tfVectors, float avgLen, - HMapSIW dfTable) { + private List buildDocVectors(List term2tfVectors, float avgLen, + HMapStIW dfTable) { Bm25 mModel = new Bm25(); // set number of docs mModel.setDocCount(term2tfVectors.size()); @@ -265,9 +265,9 @@ private List buildDocVectors(List term2tfVectors, float avgLen mModel.setAvgDocLength(avgLen); // tf-idf computation - List docVectors = new ArrayList(); - for (HMapSIW enDoc : term2tfVectors) { - HMapSFW v = new HMapSFW(); + List docVectors = new ArrayList(); + for (HMapStIW enDoc : term2tfVectors) { + HMapStFW v = new HMapStFW(); int docLen = 0; for (MapKI.Entry item : enDoc.entrySet()) { int tf = item.getValue(); @@ -322,8 +322,8 @@ private List readAlignments(String alignmentFileName) { private void prepareTrainTestData(List fSents, List eSents, Tokenizer fTokenizer, Tokenizer eTokenizer, - List fTfs, List eTfs, HMapIIW parallelPairs, List transVectors, - List eVectors, int featureSet, float prob, List alignments) { + List fTfs, List eTfs, HMapIIW parallelPairs, List transVectors, + List eVectors, int featureSet, float prob, List alignments) { NumberFormat nf = NumberFormat.getNumberInstance(); nf.setGroupingUsed(false); nf.setMaximumFractionDigits(2); @@ -332,12 +332,12 @@ private void prepareTrainTestData(List fSents, List eSents, long time = System.currentTimeMillis(); for (int i = 0; i < transVectors.size(); i++) { - HMapSFW transVector = transVectors.get(i); - HMapSIW fTfMap = fTfs.get(i); + HMapStFW transVector = transVectors.get(i); + HMapStIW fTfMap = fTfs.get(i); String fSent = fSents.get(i); for (int j = 0; j < eVectors.size(); j++) { - HMapSFW eVector = eVectors.get(j); - HMapSIW eTfMap = eTfs.get(j); + HMapStFW eVector = eVectors.get(j); + HMapStIW eTfMap = eTfs.get(j); String eSent = eSents.get(j); if (parallelPairs.get(i) == j) { label = "parallel"; @@ -418,12 +418,12 @@ public void runPrepareSentenceExtractionData(String fLang, String eLang, String System.out.println("Sentences read in " + (sentTime - startTime) + " ms. 
Number of sentences: " + fDocTfs.size() + " = " + eDocTfs.size()); - List eSentVectors = buildDocVectors(eDocTfs, avgEDocLeng, dfE); + List eSentVectors = buildDocVectors(eDocTfs, avgEDocLeng, dfE); long evectTime = System.currentTimeMillis(); System.out.println("E vectors created in " + (evectTime - sentTime) + " ms"); - List fSentVectors = translateDocVectors(eLang, eTokenFile, eStopwordsFile, fDocTfs, avgFDocLeng, dfE); + List fSentVectors = translateDocVectors(eLang, eTokenFile, eStopwordsFile, fDocTfs, avgFDocLeng, dfE); long fvectTime = System.currentTimeMillis(); System.out.println("F vectors created in " + (fvectTime - evectTime) + @@ -489,8 +489,8 @@ private static void runCLIRComparison() throws IOException, ClassNotFoundExcepti String DATADIR = "/fs/clip-qa/ferhan/cl-pwsim/pwsim-experiments-2013"; // /Users/ferhanture/edu/research_archive/data/de-en/eu-nc-wmt08 BitextClassifierUtils dt = new BitextClassifierUtils(); - numSentencesPerDocE = new HMapSIW(); - numSentencesPerDocF = new HMapSIW(); + numSentencesPerDocE = new HMapStIW(); + numSentencesPerDocF = new HMapStIW(); FileSystem localFs = FileSystem.getLocal(new Configuration()); eVocabSrc = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.en-de.en"), localFs); eVocabTrg = HadoopAlign.loadVocab(new Path(VOCABDIR+"/vocab.de-en.en"), localFs); @@ -505,7 +505,7 @@ private static void runCLIRComparison() throws IOException, ClassNotFoundExcepti TOKENDIR+"/en-token.bin", TOKENDIR+"/de.stop", TOKENDIR+"/en.stop"); - List fDocVectors = dt.translateDocVectors("en", + List fDocVectors = dt.translateDocVectors("en", TOKENDIR+"/en-token.bin", TOKENDIR+"/en.stop", fDocTfs, avgFDocLeng, dfE); @@ -519,7 +519,7 @@ private static void runCLIRComparison() throws IOException, ClassNotFoundExcepti TOKENDIR+"/en-token.bin", TOKENDIR+"/de.stop", TOKENDIR+"/en.stop"); - List googletransDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); + List googletransDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); eDocTfs.clear(); dfE.clear(); @@ -530,7 +530,7 @@ private static void runCLIRComparison() throws IOException, ClassNotFoundExcepti TOKENDIR+"/en-token.bin", TOKENDIR+"/de.stop", TOKENDIR+"/en.stop"); - List cdectransDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); + List cdectransDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); eDocTfs.clear(); dfE.clear(); @@ -541,7 +541,7 @@ private static void runCLIRComparison() throws IOException, ClassNotFoundExcepti TOKENDIR+"/en-token.bin", TOKENDIR+"/de.stop", TOKENDIR+"/en.stop"); - List eDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); + List eDocVectors = dt.buildDocVectors(eDocTfs, avgEDocLeng, dfE); for (int i=0; i<100; i++) { // System.out.println(CLIRUtils.cosine(fDocVectors.get(i), eDocVectors.get(i))); System.out.println("cdec\t+\t" + CLIRUtils.cosine(cdectransDocVectors.get(i), eDocVectors.get(i))); diff --git a/src/java/main/ivory/lsh/eval/BruteForcePwsim.java b/src/java/main/ivory/lsh/eval/BruteForcePwsim.java index dad1ed20..f2aab251 100644 --- a/src/java/main/ivory/lsh/eval/BruteForcePwsim.java +++ b/src/java/main/ivory/lsh/eval/BruteForcePwsim.java @@ -40,7 +40,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfFloatInt; import tl.lin.data.pair.PairOfInts; import tl.lin.data.pair.PairOfWritables; @@ -119,8 +119,8 @@ public void map(IntWritable docno, WeightedIntDocVector docvector, * @author ferhanture */ public static class 
MyMapperTermDocVectors extends MapReduceBase implements - Mapper { - private List> vectors; + Mapper { + private List> vectors; float threshold; public void configure(JobConf job) { @@ -144,12 +144,12 @@ public void configure(JobConf job) { LOG.info("Read " + vectors.size() + " sample doc vectors"); } - public void map(IntWritable docno, HMapSFW docvector, + public void map(IntWritable docno, HMapStFW docvector, OutputCollector output, Reporter reporter) throws IOException { for (int i = 0; i < vectors.size(); i++) { reporter.incrCounter(Pairs.Total, 1); IntWritable sampleDocno = vectors.get(i).getLeftElement(); - HMapSFW fromSample = vectors.get(i).getRightElement(); + HMapStFW fromSample = vectors.get(i).getRightElement(); float cs = CLIRUtils.cosine(docvector, fromSample); if (cs >= threshold) { diff --git a/src/java/main/ivory/lsh/eval/SampleTermDocVectors.java b/src/java/main/ivory/lsh/eval/SampleTermDocVectors.java index bc87b0bd..9b9653ae 100644 --- a/src/java/main/ivory/lsh/eval/SampleTermDocVectors.java +++ b/src/java/main/ivory/lsh/eval/SampleTermDocVectors.java @@ -45,7 +45,7 @@ import tl.lin.data.map.HMapII; import tl.lin.data.map.HMapIIW; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import edu.umd.cloud9.io.SequenceFileUtils; /** @@ -95,7 +95,7 @@ @SuppressWarnings("deprecation") public class SampleTermDocVectors extends Configured implements Tool { @SuppressWarnings("unchecked") - static Class keyClass = IntWritable.class, valueClass = HMapSFW.class, + static Class keyClass = IntWritable.class, valueClass = HMapStFW.class, inputFormat = SequenceFileInputFormat.class; private static final Logger sLogger = Logger.getLogger(SampleTermDocVectors.class); @@ -107,7 +107,7 @@ private void printUsage() { private static class MyMapper extends MapReduceBase implements - Mapper { + Mapper { private int sampleFreq; private HMapII samplesMap = null; @@ -159,8 +159,8 @@ public void configure(JobConf conf) { } } - public void map(IntWritable key, HMapSFW val, - OutputCollector output, Reporter reporter) + public void map(IntWritable key, HMapStFW val, + OutputCollector output, Reporter reporter) throws IOException { if (samplesMap != null) { if (samplesMap.containsKey(key.get())) { @@ -177,11 +177,11 @@ public void map(IntWritable key, HMapSFW val, } public static class MyReducer extends MapReduceBase implements - Reducer { + Reducer { @Override - public void reduce(IntWritable key, Iterator values, - OutputCollector output, Reporter reporter) + public void reduce(IntWritable key, Iterator values, + OutputCollector output, Reporter reporter) throws IOException { output.collect(key, values.next()); } diff --git a/src/java/main/ivory/lsh/projection/ComputeSignaturesSimhash.java b/src/java/main/ivory/lsh/projection/ComputeSignaturesSimhash.java index 6935c7d3..e6dcae62 100644 --- a/src/java/main/ivory/lsh/projection/ComputeSignaturesSimhash.java +++ b/src/java/main/ivory/lsh/projection/ComputeSignaturesSimhash.java @@ -24,7 +24,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapKF; import edu.umd.cloud9.util.PowerTool; @@ -60,7 +60,7 @@ protected static enum Maps { * */ public static class MyMapper extends MapReduceBase implements - Mapper { + Mapper { static GeneralHashFunctionLibrary hashLib; static float[] V = new float[64]; @@ -70,7 +70,7 @@ public void configure(JobConf job) { hashLib = new GeneralHashFunctionLibrary(); } - public void map(IntWritable docno, 
HMapSFW docvector, + public void map(IntWritable docno, HMapStFW docvector, OutputCollector output, Reporter reporter) throws IOException { V = new float[64]; diff --git a/src/java/main/ivory/sqe/querygenerator/MtNQueryGenerator.java b/src/java/main/ivory/sqe/querygenerator/MtNQueryGenerator.java index 97f2499b..246ac844 100644 --- a/src/java/main/ivory/sqe/querygenerator/MtNQueryGenerator.java +++ b/src/java/main/ivory/sqe/querygenerator/MtNQueryGenerator.java @@ -17,7 +17,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfStrings; import com.google.gson.JsonArray; @@ -126,7 +126,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con String origQuery = translation.getOriginalQuery(); String grammarFile = conf.get(Constants.GrammarPath); - Map probMap = null; + Map probMap = null; if (scfgWeight > 0) { probMap = scfgGenerator.processGrammar(fs, conf, grammarFile); } @@ -188,7 +188,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con JsonArray tokensArr = new JsonArray(); if (tokenWeight > 0) { for (String srcToken : stemmedSourceTokens) { - HMapSFW nbestDist = translation.getDistributionOf(srcToken); + HMapStFW nbestDist = translation.getDistributionOf(srcToken); if (defaultTokenizer.isStopWord(srcToken)){ continue; @@ -200,7 +200,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con // Pr{bitext} if (bitextWeight > 0) { - HMapSFW bitextDist = clGenerator.getTranslations(origQuery.trim(), srcToken, pairsInGrammar, stemmed2Stemmed); + HMapStFW bitextDist = clGenerator.getTranslations(origQuery.trim(), srcToken, pairsInGrammar, stemmed2Stemmed); if(bitextDist != null && !bitextDist.isEmpty()){ tokenRepresentationList.add(new PairOfFloatMap(bitextDist, bitextWeight)); } @@ -208,7 +208,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con // Pr{scfg} if (scfgWeight > 0) { - HMapSFW scfgDist = scfgGenerator.getTranslations(origQuery.trim(), srcToken, probMap, stemmed2Stemmed); + HMapStFW scfgDist = scfgGenerator.getTranslations(origQuery.trim(), srcToken, probMap, stemmed2Stemmed); if (scfgDist != null && !scfgDist.isEmpty() ){ tokenRepresentationList.add(new PairOfFloatMap(scfgDist, scfgWeight)); } @@ -248,7 +248,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con // combine the token-based and phrase-based representations into a #combweight structure JsonArray queryJsonArr = new JsonArray(); - HMapSFW scaledPhrase2Weight = null; + HMapStFW scaledPhrase2Weight = null; if (phraseWeight > 0) { scaledPhrase2Weight = Utils.scaleProbMap(lexProbThreshold, phraseWeight, translation.getPhraseDist()); for (String phrase : scaledPhrase2Weight.keySet()) { diff --git a/src/java/main/ivory/sqe/querygenerator/ProbabilisticStructuredQueryGenerator.java b/src/java/main/ivory/sqe/querygenerator/ProbabilisticStructuredQueryGenerator.java index de454258..37a72770 100644 --- a/src/java/main/ivory/sqe/querygenerator/ProbabilisticStructuredQueryGenerator.java +++ b/src/java/main/ivory/sqe/querygenerator/ProbabilisticStructuredQueryGenerator.java @@ -18,7 +18,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfFloatInt; import tl.lin.data.pair.PairOfStrings; @@ -145,7 +145,7 @@ public StructuredQuery parseQuery(String query, 
FileSystem fs, Configuration con } } else { JsonObject tokenTrans = new JsonObject(); - HMapSFW distr = getTranslations(origQuery, token, phrasePairs, stemmed2Stemmed); + HMapStFW distr = getTranslations(origQuery, token, phrasePairs, stemmed2Stemmed); JsonArray weights = Utils.createJsonArrayFromProbabilities(distr); if (weights != null) { tokenTrans.add("#weight", weights); @@ -185,8 +185,8 @@ protected String getBestTranslation(String token) { return token; } - protected HMapSFW getTranslations(String query, String token, Set pairsInSCFG, Map stemmed2Stemmed) { - HMapSFW probDist = new HMapSFW(); + protected HMapStFW getTranslations(String query, String token, Set pairsInSCFG, Map stemmed2Stemmed) { + HMapStFW probDist = new HMapStFW(); int f = fVocab_f2e.get(token); if (f <= 0) { diff --git a/src/java/main/ivory/sqe/querygenerator/SCFGQueryGenerator.java b/src/java/main/ivory/sqe/querygenerator/SCFGQueryGenerator.java index 5811c318..6817af5e 100644 --- a/src/java/main/ivory/sqe/querygenerator/SCFGQueryGenerator.java +++ b/src/java/main/ivory/sqe/querygenerator/SCFGQueryGenerator.java @@ -16,7 +16,7 @@ import org.apache.log4j.Level; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; +import tl.lin.data.map.HMapStFW; import tl.lin.data.map.MapKF; import com.google.gson.JsonArray; @@ -32,7 +32,7 @@ public class SCFGQueryGenerator implements QueryGenerator { private static final Logger LOG = Logger.getLogger(SCFGQueryGenerator.class); private Tokenizer defaultTokenizer, docLangTokenizer, queryLangTokenizerWithStemming, queryLangTokenizer, bigramTokenizer; - private Map> query2probMap; + private Map> query2probMap; private int length, numTransPerToken; private boolean isDocStemmed, isStemming, bigramSegment = false; private RetrievalEnvironment env; @@ -101,7 +101,7 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con String origQuery = query.trim().split("\\|\\|\\|\\|")[0].trim(); String grammarFile = conf.get(Constants.GrammarPath); - Map probMap = processGrammar(fs, conf, grammarFile); + Map probMap = processGrammar(fs, conf, grammarFile); Map stemmed2Stemmed = Utils.getStemMapping(origQuery, defaultTokenizer, docLangTokenizer); JsonArray tokenTranslations = new JsonArray(); @@ -134,8 +134,8 @@ public StructuredQuery parseQuery(String query, FileSystem fs, Configuration con return new StructuredQuery(queryJson, length); } - public Map processGrammar(FileSystem fs, Configuration conf, String grammarFile) { - Map probMap = Utils.generateTranslationTable(fs, conf, grammarFile, + public Map processGrammar(FileSystem fs, Configuration conf, String grammarFile) { + Map probMap = Utils.generateTranslationTable(fs, conf, grammarFile, queryLangTokenizerWithStemming, docLangTokenizer); if (probMap == null) { LOG.info("No probabilities extracted from " + grammarFile); @@ -147,7 +147,7 @@ public Map processGrammar(FileSystem fs, Configuration conf, St } private String getBestTranslation(String query, String token) { - HMapSFW probDist = query2probMap.get(query).get(token); + HMapStFW probDist = query2probMap.get(query).get(token); if(probDist == null){ return token; @@ -164,8 +164,8 @@ private String getBestTranslation(String query, String token) { return maxProbTrans; } - protected HMapSFW getTranslations(String query, String token, Map probMap, Map stemmed2Stemmed) { - HMapSFW probDist = null; + protected HMapStFW getTranslations(String query, String token, Map probMap, Map stemmed2Stemmed) { + HMapStFW probDist = null; try { probDist = 
probMap.get(token); } catch (NullPointerException e) { @@ -175,7 +175,7 @@ protected HMapSFW getTranslations(String query, String token, Map tok2tokDist; + private Map tok2tokDist; private Set targetTokens; // all non-stopword target tokens s.t. aligned to some source non-stopword token - private HMapSFW targetPhraseDist; // map from RHSs of rules to translation scores; there is only one RHS (equal to entire translation), if we don't have derivation info - private HMapSIW sourceTokenCnt; + private HMapStFW targetPhraseDist; // map from RHSs of rules to translation scores; there is only one RHS (equal to entire translation), if we don't have derivation info + private HMapStIW sourceTokenCnt; private String originalQuery; private int count; private Map stemMapping; @@ -34,19 +34,19 @@ public void setTargetTokens(Set targetTokens) { this.targetTokens = targetTokens; } - public HMapSIW getSourceTokenCnt() { + public HMapStIW getSourceTokenCnt() { return sourceTokenCnt; } - public void setSourceTokenCnt(HMapSIW sourceTokenCnt) { + public void setSourceTokenCnt(HMapStIW sourceTokenCnt) { this.sourceTokenCnt = sourceTokenCnt; } - public void setPhraseDist(HMapSFW dist) { + public void setPhraseDist(HMapStFW dist) { targetPhraseDist = dist; } - public HMapSFW getPhraseDist() { + public HMapStFW getPhraseDist() { return targetPhraseDist; } @@ -58,11 +58,11 @@ public void setOriginalQuery(String o) { originalQuery = o; } - public Map getTokenDist() { + public Map getTokenDist() { return tok2tokDist; } - public void setTokenDist(Map dist) { + public void setTokenDist(Map dist) { tok2tokDist = dist; } @@ -74,7 +74,7 @@ public int getCount() { return count; } - public HMapSFW getDistributionOf(String srcToken) { + public HMapStFW getDistributionOf(String srcToken) { return tok2tokDist.get(srcToken); } } diff --git a/src/java/main/ivory/sqe/querygenerator/TranslationFactory.java b/src/java/main/ivory/sqe/querygenerator/TranslationFactory.java index 6dae3c00..de1cadb6 100644 --- a/src/java/main/ivory/sqe/querygenerator/TranslationFactory.java +++ b/src/java/main/ivory/sqe/querygenerator/TranslationFactory.java @@ -11,8 +11,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.log4j.Logger; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; public class TranslationFactory { private static final Logger LOG = Logger.getLogger(TranslationFactory.class); @@ -46,12 +46,12 @@ public static Translation readTranslationsFromNBest(String queryRepresentation, int one2many = conf.getInt(Constants.One2Many, 2); // src token --> (trg token --> prob(trg|src)) - Map token2tokenDist = new HashMap(); + Map token2tokenDist = new HashMap(); // target phrase --> prob - HMapSFW phraseDist = new HMapSFW(); + HMapStFW phraseDist = new HMapStFW(); - HMapSIW srcTokenCnt = new HMapSIW(); + HMapStIW srcTokenCnt = new HMapStIW(); Set bagOfTargetTokens = new HashSet(); diff --git a/src/java/main/ivory/sqe/querygenerator/TranslationFromNBest.java b/src/java/main/ivory/sqe/querygenerator/TranslationFromNBest.java index 36813f58..195b665f 100644 --- a/src/java/main/ivory/sqe/querygenerator/TranslationFromNBest.java +++ b/src/java/main/ivory/sqe/querygenerator/TranslationFromNBest.java @@ -3,12 +3,12 @@ import java.util.Map; import java.util.Set; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; public class TranslationFromNBest extends Translation { - public 
TranslationFromNBest(int n, String origQuery, Map stemmed2stemmed, Set bagOfTargetTokens, Map token2tokenDist, HMapSFW phraseDist, HMapSIW srcTokenCnt) { + public TranslationFromNBest(int n, String origQuery, Map stemmed2stemmed, Set bagOfTargetTokens, Map token2tokenDist, HMapStFW phraseDist, HMapStIW srcTokenCnt) { setOriginalQuery(origQuery); setPhraseDist(phraseDist); setTokenDist(token2tokenDist); diff --git a/src/java/main/ivory/sqe/querygenerator/Utils.java b/src/java/main/ivory/sqe/querygenerator/Utils.java index 98080098..5d7a1039 100644 --- a/src/java/main/ivory/sqe/querygenerator/Utils.java +++ b/src/java/main/ivory/sqe/querygenerator/Utils.java @@ -28,8 +28,8 @@ import tl.lin.data.map.HMapIV; import tl.lin.data.map.HMapKF; import tl.lin.data.map.HMapKI; -import tl.lin.data.map.HMapSFW; -import tl.lin.data.map.HMapSIW; +import tl.lin.data.map.HMapStFW; +import tl.lin.data.map.HMapStIW; import tl.lin.data.map.MapKF; import tl.lin.data.pair.PairOfStringFloat; import tl.lin.data.pair.PairOfStrings; @@ -72,14 +72,14 @@ public static String[] extractPhrases(String[] tokens, int windowSize) { * @param phrase2score * @param phrase2count */ - private static void addToPhraseTable(String fPhrase, String transPhrase, float prob, Map phrase2score, Map> phrase2count){ + private static void addToPhraseTable(String fPhrase, String transPhrase, float prob, Map phrase2score, Map> phrase2count){ fPhrase = fPhrase.trim(); transPhrase = transPhrase.trim(); //LOG.info("Found translation phrase " + transPhrase); if (!phrase2score.containsKey(fPhrase)) { - phrase2score.put(fPhrase, new HMapSFW()); + phrase2score.put(fPhrase, new HMapStFW()); } // if same phrase extracted from multiple rules, average prob.s @@ -188,7 +188,7 @@ private static String isConsecutiveWithStopwords(ArrayListOfInts lst, String[] r * @param docLangTokenizer * to check for stopwords on RHS */ - public static Map generateTranslationTable(FileSystem fs, Configuration conf, String grammarFile, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer) { + public static Map generateTranslationTable(FileSystem fs, Configuration conf, String grammarFile, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer) { if (conf.getBoolean(Constants.Quiet, false)) { LOG.setLevel(Level.OFF); } @@ -198,12 +198,12 @@ public static Map generateTranslationTable(FileSystem fs, Confi int one2many = conf.getInt(Constants.One2Many, 2); // scfgDist table is a set of (source_token --> X) maps, where X is a set of (token_trans --> score) maps - Map scfgDist = new HashMap(); + Map scfgDist = new HashMap(); // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps - HMapSFW phraseDist = new HMapSFW(); + HMapStFW phraseDist = new HMapStFW(); - HMapSIW srcTokenCnt = new HMapSIW(); + HMapStIW srcTokenCnt = new HMapStIW(); Set bagOfTargetTokens = new HashSet(); @@ -230,8 +230,8 @@ public static Map generateTranslationTable(FileSystem fs, Confi return scfgDist; } - public static void processRule(int isOne2Many, boolean isMany2Many, float score, String rule, Set bagOfTargetTokens, Map probDist, - HMapSFW phraseDist, HMapSIW srcTokenCnt, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer, Map stemmed2Stemmed, Set unknownWords) { + public static void processRule(int isOne2Many, boolean isMany2Many, float score, String rule, Set bagOfTargetTokens, Map probDist, + HMapStFW phraseDist, HMapStIW srcTokenCnt, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer, Map stemmed2Stemmed, Set unknownWords) { // 
diff --git a/src/java/main/ivory/sqe/querygenerator/Utils.java b/src/java/main/ivory/sqe/querygenerator/Utils.java
index 98080098..5d7a1039 100644
--- a/src/java/main/ivory/sqe/querygenerator/Utils.java
+++ b/src/java/main/ivory/sqe/querygenerator/Utils.java
@@ -28,8 +28,8 @@
 import tl.lin.data.map.HMapIV;
 import tl.lin.data.map.HMapKF;
 import tl.lin.data.map.HMapKI;
-import tl.lin.data.map.HMapSFW;
-import tl.lin.data.map.HMapSIW;
+import tl.lin.data.map.HMapStFW;
+import tl.lin.data.map.HMapStIW;
 import tl.lin.data.map.MapKF;
 import tl.lin.data.pair.PairOfStringFloat;
 import tl.lin.data.pair.PairOfStrings;
@@ -72,14 +72,14 @@ public static String[] extractPhrases(String[] tokens, int windowSize) {
   * @param phrase2score
   * @param phrase2count
   */
-  private static void addToPhraseTable(String fPhrase, String transPhrase, float prob, Map phrase2score, Map> phrase2count){
+  private static void addToPhraseTable(String fPhrase, String transPhrase, float prob, Map phrase2score, Map> phrase2count){
     fPhrase = fPhrase.trim();
     transPhrase = transPhrase.trim();
     //LOG.info("Found translation phrase " + transPhrase);
     if (!phrase2score.containsKey(fPhrase)) {
-      phrase2score.put(fPhrase, new HMapSFW());
+      phrase2score.put(fPhrase, new HMapStFW());
     }
     // if same phrase extracted from multiple rules, average prob.s
@@ -188,7 +188,7 @@ private static String isConsecutiveWithStopwords(ArrayListOfInts lst, String[] r
   * @param docLangTokenizer
   *    to check for stopwords on RHS
   */
-  public static Map generateTranslationTable(FileSystem fs, Configuration conf, String grammarFile, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer) {
+  public static Map generateTranslationTable(FileSystem fs, Configuration conf, String grammarFile, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer) {
     if (conf.getBoolean(Constants.Quiet, false)) {
       LOG.setLevel(Level.OFF);
     }
@@ -198,12 +198,12 @@ public static Map generateTranslationTable(FileSystem fs, Confi
     int one2many = conf.getInt(Constants.One2Many, 2);
     // scfgDist table is a set of (source_token --> X) maps, where X is a set of (token_trans --> score) maps
-    Map scfgDist = new HashMap();
+    Map scfgDist = new HashMap();
     // phrase2count table is a set of (source_phrase --> X) maps, where X is a set of (phrase_trans --> count) maps
-    HMapSFW phraseDist = new HMapSFW();
+    HMapStFW phraseDist = new HMapStFW();
-    HMapSIW srcTokenCnt = new HMapSIW();
+    HMapStIW srcTokenCnt = new HMapStIW();
     Set bagOfTargetTokens = new HashSet();
@@ -230,8 +230,8 @@ public static Map generateTranslationTable(FileSystem fs, Confi
     return scfgDist;
   }
-  public static void processRule(int isOne2Many, boolean isMany2Many, float score, String rule, Set bagOfTargetTokens, Map probDist,
-      HMapSFW phraseDist, HMapSIW srcTokenCnt, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer, Map stemmed2Stemmed, Set unknownWords) {
+  public static void processRule(int isOne2Many, boolean isMany2Many, float score, String rule, Set bagOfTargetTokens, Map probDist,
+      HMapStFW phraseDist, HMapStIW srcTokenCnt, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer, Map stemmed2Stemmed, Set unknownWords) {
     //  LOG.info("Processing rule " + rule);
     String[] parts = rule.split("\\|\\|\\|");
@@ -306,10 +306,10 @@ public static void processRule(int isOne2Many, boolean isMany2Many, float score,
           bagOfTargetTokens.add(eTerm);
           if (isOne2Many <= 1) {
             if (probDist.containsKey(fTerm)) {
-              HMapSFW eToken2Prob = probDist.get(fTerm);
+              HMapStFW eToken2Prob = probDist.get(fTerm);
               eToken2Prob.increment(eTerm, weight);
             }else {
-              HMapSFW eToken2Prob = new HMapSFW();
+              HMapStFW eToken2Prob = new HMapStFW();
               eToken2Prob.put(eTerm, weight);
               probDist.put(fTerm, eToken2Prob);
             }
@@ -336,10 +336,10 @@ public static void processRule(int isOne2Many, boolean isMany2Many, float score,
         // update prob. distr.
         if (probDist.containsKey(fTerm)) {
-          HMapSFW eToken2Prob = probDist.get(fTerm);
+          HMapStFW eToken2Prob = probDist.get(fTerm);
           eToken2Prob.increment(eTerm, weight);
         }else {
-          HMapSFW eToken2Prob = new HMapSFW();
+          HMapStFW eToken2Prob = new HMapStFW();
           eToken2Prob.put(eTerm, weight);
           probDist.put(fTerm, eToken2Prob);
         }
@@ -427,8 +427,8 @@ private static HMapIV readAlignments(String[] alignments) {
   * @param scale
   * @param probMap
   */
-  public static HMapSFW scaleProbMap(float threshold, float scale, HMapSFW probMap) {
-    HMapSFW scaledProbMap = new HMapSFW();
+  public static HMapStFW scaleProbMap(float threshold, float scale, HMapStFW probMap) {
+    HMapStFW scaledProbMap = new HMapStFW();
     for (MapKF.Entry entry : probMap.entrySet()) {
       float pr = entry.getValue() * scale;
@@ -449,8 +449,8 @@ public static HMapSFW scaleProbMap(float threshold, float scale, HMapSFW probMap
   * @param probMaps
   *    list of probability distributions
   */
-  public static HMapSFW combineProbMaps(float threshold, float scale, List probMaps) {
-    HMapSFW combinedProbMap = new HMapSFW();
+  public static HMapStFW combineProbMaps(float threshold, float scale, List probMaps) {
+    HMapStFW combinedProbMap = new HMapStFW();
     int numDistributions = probMaps.size();
@@ -459,7 +459,7 @@ public static HMapSFW combineProbMaps(float threshold, float scale, List translationAlternatives = new HashSet();
     float sumWeights = 0;
     for (int i=0; i < numDistributions; i++) {
-      HMapSFW dist = probMaps.get(i).getMap();
+      HMapStFW dist = probMaps.get(i).getMap();
       float weight = probMaps.get(i).getWeight();
       // don't add vocabulary from a distribution that has 0 weight
@@ -473,7 +473,7 @@ public static HMapSFW combineProbMaps(float threshold, float scale, List probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
+  public static void normalize(Map probMap, float lexProbThreshold, float cumProbThreshold, int maxNumTrans) {
     for (String sourceTerm : probMap.keySet()) {
-      HMapSFW probDist = probMap.get(sourceTerm);
+      HMapStFW probDist = probMap.get(sourceTerm);
       TreeSet sortedFilteredProbDist = new TreeSet();
-      HMapSFW normProbDist = new HMapSFW();
+      HMapStFW normProbDist = new HMapStFW();
       // compute normalization factor
       float sumProb = 0;
@@ -538,7 +538,7 @@ public static void normalize(Map probMap, float lexProbThreshol
   *
   * @param probMap
   */
-  public static void normalize(HMapSFW probMap) {
+  public static void normalize(HMapStFW probMap) {
     float normalization = 0;
     for (MapKF.Entry e : probMap.entrySet()) {
       float weight = e.getValue();
@@ -549,7 +549,7 @@ public static void normalize(HMapSFW probMap) {
     }
   }
-  public static void filter(HMapSFW probMap, float lexProbThreshold) {
+  public static void filter(HMapStFW probMap, float lexProbThreshold) {
     for (MapKF.Entry e : probMap.entrySet()) {
       if (e.getValue() > lexProbThreshold) {
         probMap.put(e.getKey(), e.getValue());
       }
@@ -602,7 +602,7 @@ public static JsonArray createJsonArray(String[] elements) {
   * Convert prob. distribution to JSONArray in which float at position 2k corresponds to probabilities of term at position 2k+1, k=0...(n/2-1)
   * @param probMap
   */
-  public static JsonArray createJsonArrayFromProbabilities(HMapSFW probMap) {
+  public static JsonArray createJsonArrayFromProbabilities(HMapStFW probMap) {
     if (probMap == null) {
       return null;
     }
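The Utils.java hunks above are the same rename applied to the probability-map helpers (addToPhraseTable, generateTranslationTable, processRule, scaleProbMap, combineProbMaps, normalize, filter, createJsonArrayFromProbabilities). As a rough illustration of the scale-then-renormalize pattern those helpers apply to an HMapStFW, here is a self-contained sketch; it is not Ivory's implementation, and it only assumes the keySet/get/put accessors already exercised in this diff.

    import tl.lin.data.map.HMapStFW;

    public class ProbMapSketch {
      // Scale every entry and drop anything at or below a threshold (cf. scaleProbMap).
      static HMapStFW scale(HMapStFW probMap, float scale, float threshold) {
        HMapStFW scaled = new HMapStFW();
        for (String term : probMap.keySet()) {
          float pr = probMap.get(term) * scale;
          if (pr > threshold) {
            scaled.put(term, pr);
          }
        }
        return scaled;
      }

      // Rebuild the map so the surviving weights sum to 1 (cf. normalize).
      static HMapStFW normalize(HMapStFW probMap) {
        float sum = 0f;
        for (String term : probMap.keySet()) {
          sum += probMap.get(term);
        }
        HMapStFW norm = new HMapStFW();
        for (String term : probMap.keySet()) {
          norm.put(term, probMap.get(term) / sum);
        }
        return norm;
      }

      public static void main(String[] args) {
        HMapStFW dist = new HMapStFW();
        dist.put("maison", 0.6f);
        dist.put("domicile", 0.3f);
        HMapStFW result = normalize(scale(dist, 0.5f, 0.05f));
        System.out.println(result.get("maison"));  // ~0.667 with these toy numbers
      }
    }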
diff --git a/src/java/main/ivory/sqe/retrieval/PairOfFloatMap.java b/src/java/main/ivory/sqe/retrieval/PairOfFloatMap.java
index d2801a7d..8f30a9f7 100644
--- a/src/java/main/ivory/sqe/retrieval/PairOfFloatMap.java
+++ b/src/java/main/ivory/sqe/retrieval/PairOfFloatMap.java
@@ -1,24 +1,24 @@
 package ivory.sqe.retrieval;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.base.Preconditions;
 public class PairOfFloatMap {
-  private HMapSFW map;
+  private HMapStFW map;
   private float weight;
-  public PairOfFloatMap(HMapSFW map, float weight) {
+  public PairOfFloatMap(HMapStFW map, float weight) {
     super();
     this.map = Preconditions.checkNotNull(map);
     this.weight = weight;
   }
-  public HMapSFW getMap() {
+  public HMapStFW getMap() {
     return map;
   }
-  public void setMap(HMapSFW map) {
+  public void setMap(HMapStFW map) {
     this.map = map;
   }
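PairOfFloatMap simply bundles an HMapStFW with an interpolation weight, which is how combineProbMaps consumes it (getMap()/getWeight() above). A small sketch of that weighted-combination pattern follows; the interpolate helper name is illustrative, and containsKey on HMapStFW is an assumption (only put/get/increment/keySet appear in this diff).

    import java.util.Arrays;
    import java.util.List;

    import ivory.sqe.retrieval.PairOfFloatMap;
    import tl.lin.data.map.HMapStFW;

    public class CombineSketch {
      // Linear interpolation of several weighted distributions, in the spirit of combineProbMaps.
      static HMapStFW interpolate(List<PairOfFloatMap> probMaps) {
        HMapStFW combined = new HMapStFW();
        for (PairOfFloatMap weighted : probMaps) {
          HMapStFW dist = weighted.getMap();
          float weight = weighted.getWeight();
          if (weight == 0) {
            continue;  // don't add vocabulary from a distribution that has 0 weight
          }
          for (String term : dist.keySet()) {
            float contribution = weight * dist.get(term);
            if (combined.containsKey(term)) {   // containsKey is assumed here
              combined.increment(term, contribution);
            } else {
              combined.put(term, contribution);
            }
          }
        }
        return combined;
      }

      public static void main(String[] args) {
        HMapStFW a = new HMapStFW();
        a.put("maison", 0.8f);
        HMapStFW b = new HMapStFW();
        b.put("maison", 0.4f);
        HMapStFW mix = interpolate(Arrays.asList(new PairOfFloatMap(a, 0.7f), new PairOfFloatMap(b, 0.3f)));
        System.out.println(mix.get("maison"));  // 0.68 with these toy weights
      }
    }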
diff --git a/src/java/regression/ivory/regression/sigir2013/cdec/EnAr_TREC02.java b/src/java/regression/ivory/regression/sigir2013/cdec/EnAr_TREC02.java
index 6fc4e22a..6f34de3a 100644
--- a/src/java/regression/ivory/regression/sigir2013/cdec/EnAr_TREC02.java
+++ b/src/java/regression/ivory/regression/sigir2013/cdec/EnAr_TREC02.java
@@ -17,7 +17,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -179,17 +179,17 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     initialize();
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(2));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(2));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(2));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(2));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -200,7 +200,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -211,8 +211,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
diff --git a/src/java/regression/ivory/regression/sigir2013/cdec/EnFr_CLEF06.java b/src/java/regression/ivory/regression/sigir2013/cdec/EnFr_CLEF06.java
index e920869b..6441a8c6 100644
--- a/src/java/regression/ivory/regression/sigir2013/cdec/EnFr_CLEF06.java
+++ b/src/java/regression/ivory/regression/sigir2013/cdec/EnFr_CLEF06.java
@@ -17,7 +17,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -180,17 +180,17 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     initialize();
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(2));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(2));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
-//  HMapSFW gridAPMap = array2Map(Interp_AP.get(2));
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(2));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(2));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
+//  HMapStFW gridAPMap = array2Map(Interp_AP.get(2));
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -201,7 +201,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -212,8 +212,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
diff --git a/src/java/regression/ivory/regression/sigir2013/cdec/EnZh_NTCIR8.java b/src/java/regression/ivory/regression/sigir2013/cdec/EnZh_NTCIR8.java
index d5698804..aedf0049 100644
--- a/src/java/regression/ivory/regression/sigir2013/cdec/EnZh_NTCIR8.java
+++ b/src/java/regression/ivory/regression/sigir2013/cdec/EnZh_NTCIR8.java
@@ -17,7 +17,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -179,17 +179,17 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     initialize();
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(2));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(2));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(2));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(2));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -200,7 +200,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -211,8 +211,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
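The cdec regression drivers above all follow the same pattern: build per-topic AP maps with array2Map, then count the topics that improved or changed only negligibly relative to the token baseline. The actual comparison thresholds sit in lines elided from this diff, so the 0.001 cutoff below is an assumption, and countImproved/countNegligible are illustrative names rather than the test helpers themselves.

    import tl.lin.data.map.HMapStFW;

    public class TopicDeltaSketch {
      // Count topics whose AP gain over the baseline exceeds a cutoff (cutoff value assumed).
      static int countImproved(HMapStFW baselineAP, HMapStFW systemAP, float cutoff) {
        int cnt = 0;
        for (String topic : baselineAP.keySet()) {
          float difference = systemAP.get(topic) - baselineAP.get(topic);
          if (difference > cutoff) {
            cnt++;
          }
        }
        return cnt;
      }

      // Count topics whose AP barely moved in either direction (cutoff value assumed).
      static int countNegligible(HMapStFW baselineAP, HMapStFW systemAP, float cutoff) {
        int cnt = 0;
        for (String topic : baselineAP.keySet()) {
          float difference = systemAP.get(topic) - baselineAP.get(topic);
          if (Math.abs(difference) <= cutoff) {
            cnt++;
          }
        }
        return cnt;
      }

      public static void main(String[] args) {
        HMapStFW baseline = new HMapStFW();
        baseline.put("301", 0.20f);
        HMapStFW system = new HMapStFW();
        system.put("301", 0.25f);
        System.out.println(countImproved(baseline, system, 0.001f));   // 1
        System.out.println(countNegligible(baseline, system, 0.001f)); // 0
      }
    }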
diff --git a/src/java/regression/ivory/regression/sigir2013/moses/EnAr_TREC02.java b/src/java/regression/ivory/regression/sigir2013/moses/EnAr_TREC02.java
index f175cc66..684a46ff 100644
--- a/src/java/regression/ivory/regression/sigir2013/moses/EnAr_TREC02.java
+++ b/src/java/regression/ivory/regression/sigir2013/moses/EnAr_TREC02.java
@@ -17,7 +17,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -183,18 +183,18 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     EnAr_TREC02.initialize();
-    // HMapSFW gridAPMap = array2Map(Interp_AP);
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(1));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(0));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
+    // HMapStFW gridAPMap = array2Map(Interp_AP);
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(1));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(0));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -205,7 +205,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -216,8 +216,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
diff --git a/src/java/regression/ivory/regression/sigir2013/moses/EnFr_CLEF06.java b/src/java/regression/ivory/regression/sigir2013/moses/EnFr_CLEF06.java
index c4a7fa5e..513031d3 100644
--- a/src/java/regression/ivory/regression/sigir2013/moses/EnFr_CLEF06.java
+++ b/src/java/regression/ivory/regression/sigir2013/moses/EnFr_CLEF06.java
@@ -17,7 +17,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -185,17 +185,17 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     initialize();
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(1));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(0));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(1));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(0));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -206,7 +206,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -217,8 +217,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
diff --git a/src/java/regression/ivory/regression/sigir2013/moses/EnZh_NTCIR8.java b/src/java/regression/ivory/regression/sigir2013/moses/EnZh_NTCIR8.java
index efcab995..0cb3d7d2 100644
--- a/src/java/regression/ivory/regression/sigir2013/moses/EnZh_NTCIR8.java
+++ b/src/java/regression/ivory/regression/sigir2013/moses/EnZh_NTCIR8.java
@@ -18,7 +18,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.junit.Test;
-import tl.lin.data.map.HMapSFW;
+import tl.lin.data.map.HMapStFW;
 import com.google.common.collect.Maps;
@@ -187,17 +187,17 @@ public static junit.framework.Test suite() {
   public static void main(String[] args) {
     initialize();
-    HMapSFW tenbestAPMap = array2Map(Nbest_AP.get(2));
-    HMapSFW onebestAPMap = array2Map(Onebest_AP.get(1));
-    HMapSFW grammarAPMap = array2Map(grammar_AP.get(0));
-    HMapSFW tokenAPMap = array2Map(baseline_token_AP);
+    HMapStFW tenbestAPMap = array2Map(Nbest_AP.get(2));
+    HMapStFW onebestAPMap = array2Map(Onebest_AP.get(1));
+    HMapStFW grammarAPMap = array2Map(grammar_AP.get(0));
+    HMapStFW tokenAPMap = array2Map(baseline_token_AP);
     System.out.println("10best: improved=" + countNumberOfImprovedTopics(tokenAPMap, tenbestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, tenbestAPMap));
     System.out.println("Grammar: improved=" + countNumberOfImprovedTopics(tokenAPMap, grammarAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, grammarAPMap));
     System.out.println("1best: improved=" + countNumberOfImprovedTopics(tokenAPMap, onebestAPMap) + ", negligible=" + countNumberOfNegligibleTopics(tokenAPMap, onebestAPMap));
   }
-  private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfImprovedTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -208,7 +208,7 @@ private static int countNumberOfImprovedTopics(HMapSFW tokenAPMap, HMapSFW gridA
     return cnt;
   }
-  private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gridAPMap) {
+  private static int countNumberOfNegligibleTopics(HMapStFW tokenAPMap, HMapStFW gridAPMap) {
     int cnt = 0;
     for (String key : tokenAPMap.keySet()) {
       float difference = gridAPMap.get(key) - tokenAPMap.get(key);
@@ -219,8 +219,8 @@ private static int countNumberOfNegligibleTopics(HMapSFW tokenAPMap, HMapSFW gri
     return cnt;
   }
-  private static HMapSFW array2Map(String[] array) {
-    HMapSFW map = new HMapSFW();
+  private static HMapStFW array2Map(String[] array) {
+    HMapStFW map = new HMapStFW();
     for ( int i = 0; i < array.length; i += 2 ) {
       map.put(array[i], Float.parseFloat(array[i+1]));
     }
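For reference, the array2Map helper that changes type in every regression file above follows a flat-array convention: even positions hold topic ids and odd positions hold the AP value as a string. A standalone sketch of that convention, using made-up topic ids and scores:

    import tl.lin.data.map.HMapStFW;

    public class Array2MapSketch {
      // Same convention as the regression tests' array2Map helper:
      // array[2k] is a topic id, array[2k+1] is its AP value as text.
      static HMapStFW array2Map(String[] array) {
        HMapStFW map = new HMapStFW();
        for (int i = 0; i < array.length; i += 2) {
          map.put(array[i], Float.parseFloat(array[i + 1]));
        }
        return map;
      }

      public static void main(String[] args) {
        // Topic ids and scores below are illustrative only.
        HMapStFW ap = array2Map(new String[] { "301", "0.2516", "302", "0.0095" });
        System.out.println(ap.get("302"));  // 0.0095
      }
    }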