Skip to content

Commit ac5e0e3

Browse files
author
Leonard Poon
committed
Support conversion to sparse data format
- added a script to rename the dependency file after assembly
1 parent 46320f2 commit ac5e0e3

File tree

3 files changed

+32
-3
lines changed

3 files changed

+32
-3
lines changed

rename-deps.sh

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/bash
#
# Renames the dependency jar produced by the assembly step from
# HLTA-assembly-1.1-deps.jar to HLTA-deps.jar inside target/scala-2.11,
# resolved relative to the directory containing this script.
#
# Exit status: 0 when the jar was renamed, non-zero when the expected jar
# is missing (or when mv itself fails).

# Quote "$0" so a script path containing spaces is not word-split.
BASE=$(dirname "$0")
TARGET="$BASE/target/scala-2.11/HLTA-assembly-1.1-deps.jar"

if [[ -e "$TARGET" ]]; then
  # Quote both operands: unquoted expansions would be split on whitespace
  # and glob-expanded, handing mv the wrong arguments.
  mv "$TARGET" "$BASE/target/scala-2.11/HLTA-deps.jar"
else
  # Diagnostics go to stderr so they do not pollute captured stdout.
  echo "$TARGET does not exist." >&2
  # Exit statuses are limited to 0-255; use a plain failure code instead of -1.
  exit 1
fi

src/main/scala/tm/text/Convert.scala

+5-2
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@ import tm.util.ParMapReduce._
1616
import org.rogach.scallop._
1717
import tm.util.Arguments
1818

19-
2019
object Convert {
2120
class Conf(args: Seq[String]) extends Arguments(args) {
2221
banner("Usage: tm.text.Convert [OPTION]... name max-words max-n source")
2322
val name = trailArg[String](descr = "Name of data")
2423
val maxWords = trailArg[Int](descr = "Maximum number of words")
25-
val concatenations =
24+
val concatenations =
2625
trailArg[Int](descr = "Number of concatentations for building n-grams")
2726
val source = trailArg[String](descr = "Source directory")
2827

@@ -79,6 +78,10 @@ object Convert {
7978
/**
 * Finds the files under the given source and reads them into documents.
 *
 * The paths are obtained from `getFiles(source)` and then handed to the
 * `readFiles(paths)` overload. The `name` parameter is not used by this
 * overload.
 *
 * @param name name of the data (unused here)
 * @param source location that is searched for files
 * @return the documents produced by `readFiles(paths)`
 * @throws IllegalArgumentException if `getFiles(source)` yields no paths
 */
def readFiles(name: String, source: Path): GenSeq[Document] = {
  logger.info("Finding files under {}", source)
  val paths = getFiles(source)
  if (paths.isEmpty) {
    // Fail fast: continuing with no input would only produce empty output.
    logger.error("No text files found under {}", source)
    // The message now matches the log line above (the duplicated "files" is removed).
    throw new IllegalArgumentException(s"No text files found under $source")
  }
  readFiles(paths)
}
8487

src/main/scala/tm/text/DataConverter.scala

+17-1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ object DataConverter {
5050
logger.info("Saving in HLCM format (binary data)")
5151
saveAsBinaryHlcm(name, s"${name}.txt",
5252
dictionary.words, countsByDocuments.seq, bowConverter)
53+
logger.info("Saving in sparse data format (binary data)")
54+
saveAsSparseData(s"${name}.sparse.txt", countsByDocuments.seq, dictionary.map)
5355

5456
logger.info("done")
5557
}
@@ -233,11 +235,25 @@ object DataConverter {
233235
writer.close
234236
}
235237

238+
/**
 * Writes the token counts to a text file with one `rowId,token` line per
 * retained entry.
 *
 * Row ids are 1-based and follow the order of `countsByDocuments`. An entry
 * is written only if its token is a key of `indices` and its count is
 * greater than zero.
 *
 * NOTE(review): `indices` is used only as a membership test; the token
 * itself, not its mapped index, is written to the file. Confirm that this
 * is the intended sparse format.
 *
 * @param filename path of the file to write
 * @param countsByDocuments token counts, one element per document
 * @param indices tokens that are allowed to appear in the output
 */
def saveAsSparseData(filename: String,
  countsByDocuments: Seq[TokenCounts], indices: Map[NGram, Int]): Unit = {
  val writer = new PrintWriter(filename)

  // Close the writer even if a write fails, so the file handle never leaks.
  try {
    countsByDocuments.zipWithIndex.foreach { case (counts, index) =>
      val rowId = index + 1 // since the indices from zipWithIndex start with zero
      // filter out any words not contained in the indices or those with zero counts
      counts.filter(tc => indices.contains(tc._1) && tc._2 > 0)
        .foreach { tc => writer.println(s"${rowId},${tc._1}") }
    }
  } finally {
    writer.close()
  }
}
251+
236252
/**
237253
* Given a sequence of tokens, build the n-grams based on the tokens. The
238254
* n-grams are built from two consecutive tokens. Besides,
239255
* the constituent tokens must be contained in the given {@code base} dictionary.
240-
* It is possible that after c concatenations n-grams, where n=2^c, may appear.
256+
* It is possible that after c concatenations n-grams, where n=2^c, may appear.
241257
*/
242258
def buildNextNGrams(tokens: Seq[NGram], base: Dictionary): Iterator[NGram] =
243259
tokens.sliding(2)

0 commit comments

Comments
 (0)