@@ -50,6 +50,8 @@ object DataConverter {
50
50
logger.info(" Saving in HLCM format (binary data)" )
51
51
saveAsBinaryHlcm(name, s " ${name}.txt " ,
52
52
dictionary.words, countsByDocuments.seq, bowConverter)
53
+ logger.info(" Saving in sparse data format (binary data)" )
54
+ saveAsSparseData(s " ${name}.sparse.txt " , countsByDocuments.seq, dictionary.map)
53
55
54
56
logger.info(" done" )
55
57
}
@@ -233,11 +235,25 @@ object DataConverter {
233
235
writer.close
234
236
}
235
237
238
/**
 * Writes the documents to a text file in a sparse format, one line per
 * (document, token) pair, formatted as "rowId, token".
 *
 * Row ids are 1-based and follow the order of {@code countsByDocuments}.
 * A token is written only if it is a key of {@code indices} and its count
 * in the document is positive. Note that {@code indices} is used purely
 * as a membership filter: the token itself, not its mapped index, is
 * written to the file.
 *
 * The writer is always closed, even if writing fails part-way, so that the
 * file handle is not leaked and buffered output is flushed.
 *
 * @param filename path of the output file
 * @param countsByDocuments token counts, one entry per document
 * @param indices tokens to keep; only the key set is consulted
 */
def saveAsSparseData(filename: String,
  countsByDocuments: Seq[TokenCounts], indices: Map[NGram, Int]): Unit = {
  val writer = new PrintWriter(filename)

  try {
    countsByDocuments.zipWithIndex.foreach {
      case (counts, position) =>
        // zipWithIndex is zero-based whereas row ids start from one
        val rowId = position + 1

        // skip tokens missing from the indices and those with zero counts
        counts
          .filter { case (token, count) => indices.contains(token) && count > 0 }
          .foreach { case (token, _) => writer.println(s"${rowId}, ${token}") }
    }
  } finally {
    writer.close()
  }
}

251
+
236
252
/**
237
253
* Given a sequence of tokens, build the n-grams based on the tokens. The
238
254
* n-grams are built from two consecutive tokens. Besides,
239
255
* the constituent tokens must be contained in the given {@code base} dictionary.
240
- * It is possible that after c concatenations n-grams, where n=2^c, may appear.
256
+ * It is possible that after c concatenations n-grams, where n=2^c, may appear.
241
257
*/
242
258
def buildNextNGrams (tokens : Seq [NGram ], base : Dictionary ): Iterator [NGram ] =
243
259
tokens.sliding(2 )
0 commit comments