Skip to content

Commit e3a1868

Browse files
author
Leonard Poon
committed
- initial import
- code for processing Hillary Emails
0 parents  commit e3a1868

29 files changed

+2167
-0
lines changed

.classpath

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
<classpath>
2+
<classpathentry kind="src" path="src/main/scala"/>
3+
<classpathentry kind="src" path="src/main/java"/>
4+
<classpathentry kind="src" path="src/main/resources"/>
5+
<classpathentry kind="src" path="src/test/scala"/>
6+
<classpathentry kind="src" path="src/test/java"/>
7+
<classpathentry kind="src" path="src/test/resources"/>
8+
<classpathentry kind="lib" path="./FastHLTA/bin"/>
9+
<classpathentry kind="con" path="org.scala-ide.sdt.launching.SCALA_CONTAINER"/>
10+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.scalactic/scalactic_2.10/srcs/scalactic_2.10-2.2.6-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.6.jar">
11+
<attributes>
12+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.scalactic/scalactic_2.10/docs/scalactic_2.10-2.2.6-javadoc.jar!/"/>
13+
</attributes>
14+
</classpathentry>
15+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.commons/commons-csv/srcs/commons-csv-1.2-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.commons/commons-csv/jars/commons-csv-1.2.jar">
16+
<attributes>
17+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.commons/commons-csv/docs/commons-csv-1.2-javadoc.jar!/"/>
18+
</attributes>
19+
</classpathentry>
20+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-tools/srcs/opennlp-tools-1.6.0-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-tools/bundles/opennlp-tools-1.6.0.jar">
21+
<attributes>
22+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-tools/docs/opennlp-tools-1.6.0-javadoc.jar!/"/>
23+
</attributes>
24+
</classpathentry>
25+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-maxent/srcs/opennlp-maxent-3.0.3-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-maxent/bundles/opennlp-maxent-3.0.3.jar">
26+
<attributes>
27+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.opennlp/opennlp-maxent/docs/opennlp-maxent-3.0.3-javadoc.jar!/"/>
28+
</attributes>
29+
</classpathentry>
30+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-core/srcs/lucene-core-5.5.0-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-core/jars/lucene-core-5.5.0.jar">
31+
<attributes>
32+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-core/docs/lucene-core-5.5.0-javadoc.jar!/"/>
33+
</attributes>
34+
</classpathentry>
35+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-analyzers-common/srcs/lucene-analyzers-common-5.5.0-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-analyzers-common/jars/lucene-analyzers-common-5.5.0.jar">
36+
<attributes>
37+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.lucene/lucene-analyzers-common/docs/lucene-analyzers-common-5.5.0-javadoc.jar!/"/>
38+
</attributes>
39+
</classpathentry>
40+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/pdfbox/srcs/pdfbox-1.8.10-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/pdfbox/bundles/pdfbox-1.8.10.jar">
41+
<attributes>
42+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/pdfbox/docs/pdfbox-1.8.10-javadoc.jar!/"/>
43+
</attributes>
44+
</classpathentry>
45+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/fontbox/srcs/fontbox-1.8.10-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/fontbox/bundles/fontbox-1.8.10.jar">
46+
<attributes>
47+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/fontbox/docs/fontbox-1.8.10-javadoc.jar!/"/>
48+
</attributes>
49+
</classpathentry>
50+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/commons-logging/commons-logging/srcs/commons-logging-1.1.1-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/commons-logging/commons-logging/jars/commons-logging-1.1.1.jar">
51+
<attributes>
52+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/commons-logging/commons-logging/docs/commons-logging-1.1.1-javadoc.jar!/"/>
53+
</attributes>
54+
</classpathentry>
55+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/jempbox/srcs/jempbox-1.8.10-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/jempbox/bundles/jempbox-1.8.10.jar">
56+
<attributes>
57+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.apache.pdfbox/jempbox/docs/jempbox-1.8.10-javadoc.jar!/"/>
58+
</attributes>
59+
</classpathentry>
60+
<classpathentry sourcepath="/Users/kmpoon/.ivy2/cache/org.scalatest/scalatest_2.10/srcs/scalatest_2.10-2.2.6-sources.jar" kind="lib" path="/Users/kmpoon/.ivy2/cache/org.scalatest/scalatest_2.10/bundles/scalatest_2.10-2.2.6.jar">
61+
<attributes>
62+
<attribute name="javadoc_location" value="jar:file:/Users/kmpoon/.ivy2/cache/org.scalatest/scalatest_2.10/docs/scalatest_2.10-2.2.6-javadoc.jar!/"/>
63+
</attributes>
64+
</classpathentry>
65+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
66+
<classpathentry kind="output" path="bin"/>
67+
</classpath>

.gitignore

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
.DS_Store
2+
.cache-main
3+
.cache-tests
4+
bin
5+
target
6+
project/target
7+
project/project/target
8+
converted
9+
*~
10+
.worksheet/bin

.project

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<projectDescription>
2+
<name>HillaryEmails</name>
3+
<buildSpec>
4+
<buildCommand>
5+
<name>org.scala-ide.sdt.core.scalabuilder</name>
6+
</buildCommand>
7+
</buildSpec>
8+
<natures>
9+
<nature>org.scala-ide.sdt.core.scalanature</nature>
10+
<nature>org.eclipse.jdt.core.javanature</nature>
11+
</natures>
12+
<linkedResources> </linkedResources>
13+
</projectDescription>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#Generated by sbteclipse
2+
#Tue Mar 08 15:13:02 HKT 2016
3+
encoding/<project>=UTF-8

.settings/org.eclipse.jdt.core.prefs

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3+
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
5+
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6+
org.eclipse.jdt.core.compiler.compliance=1.6
7+
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8+
org.eclipse.jdt.core.compiler.debug.localVariable=generate
9+
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10+
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11+
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12+
org.eclipse.jdt.core.compiler.source=1.6
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#Generated by sbteclipse
2+
#Tue Mar 08 15:13:02 HKT 2016
3+
scala.compiler.additionalParams=-Xsource\:2.10 -Ymacro-expand\:none
4+
scala.compiler.installation=2.10
5+
scala.compiler.useProjectSettings=true
6+
target=jvm-1.6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package hillary
2+
3+
// NOTE(review): this looks like instrumented output of the Scala IDE worksheet
// runtime (WorksheetSupport, $execute/$skip/$show) rather than hand-written
// code — presumably regenerated from a worksheet; confirm before editing by hand.
// It reads a dictionary CSV from a hard-coded absolute path, echoes the value,
// and prints every dictionary entry whose word ends with "nt".
object CheckDictionary {;import org.scalaide.worksheet.runtime.library.WorksheetSupport._; def main(args: Array[String])=$execute{;$skip(87);
4+
println("Welcome to the Scala worksheet");$skip(155);
5+
val dictionary = Dictionary.read(
6+
"/Users/kmpoon/Documents/research/workspace/HillaryEmails/" +
7+
"converted/hillary.20160226.dict.csv");System.out.println("""dictionary : hillary.Dictionary = """ + $show(dictionary ));$skip(67);
8+
dictionary.info.filter(_.word.endsWith("nt")).foreach(println)}
9+
}
+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package hillary
2+
3+
// NOTE(review): this looks like instrumented output of the Scala IDE worksheet
// runtime (WorksheetSupport, $execute/$skip) rather than hand-written code —
// presumably regenerated from a worksheet; confirm before editing by hand.
// The only statement it runs is the greeting println below.
object dictionary {;import org.scalaide.worksheet.runtime.library.WorksheetSupport._; def main(args: Array[String])=$execute{;$skip(80);
4+
println("Welcome to the Scala worksheet")}
5+
6+
}
+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package hillary
2+
3+
// NOTE(review): this looks like instrumented output of the Scala IDE worksheet
// runtime (WorksheetSupport, $execute/$skip/$show) rather than hand-written
// code — presumably regenerated from a worksheet; confirm before editing by hand.
// It opens an ARFF file at a hard-coded absolute path, takes its first line and
// prints the name captured by the case-insensitive "@relation <name>" regex.
// The final `match` has a single case, so a first line that does not match the
// regex raises a scala.MatchError. The Source is never closed.
object worksheet {;import org.scalaide.worksheet.runtime.library.WorksheetSupport._; def main(args: Array[String])=$execute{;$skip(81);
4+
println("Welcome to the Scala worksheet");$skip(142);
5+
val lines = scala.io.Source.fromFile("/Users/kmpoon/Documents/research/workspace/HillaryEmails/converted/hillary.20160226.arff").getLines;System.out.println("""lines : Iterator[String] = """ + $show(lines ));$skip(58);
6+
val relationRegex = """(?i)@relation(?:\s+)(\S+)""".r;System.out.println("""relationRegex : scala.util.matching.Regex = """ + $show(relationRegex ));$skip(26);
7+
val line = lines.next;System.out.println("""line : String = """ + $show(line ));$skip(74);
8+
9+
line match {
10+
case relationRegex(name) => println(name)
11+
}}
12+
}

FastHLTA

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../FastHLTA

build.sbt

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// -*- mode: scala -*-
2+
3+
// Basic project coordinates.
name := "HillaryEmails"
4+
5+
version := "1.0"
6+
7+
scalaVersion := "2.10.6"
8+
9+
// Managed dependencies. scalatest is restricted to the "test" configuration;
// the `%%` entries get the Scala binary version appended to the artifact name.
libraryDependencies ++=
10+
"org.scalatest" %% "scalatest" % "2.2.6" % "test" ::
11+
"org.scalactic" %% "scalactic" % "2.2.6" ::
12+
"org.apache.commons" % "commons-csv" % "1.2" ::
13+
"org.apache.opennlp" % "opennlp-tools" % "1.6.0" ::
14+
"org.apache.opennlp" % "opennlp-maxent" % "3.0.3" ::
15+
"org.apache.lucene" % "lucene-core" % "5.5.0" ::
16+
"org.apache.lucene" % "lucene-analyzers-common" % "5.5.0" ::
17+
"org.apache.pdfbox" % "pdfbox" % "1.8.10" ::
18+
Nil
19+
20+
21+
// sbteclipse: attach dependency source jars to the generated Eclipse project.
EclipseKeys.withSource := true
22+
23+
// sbteclipse: attach dependency javadoc jars to the generated Eclipse project.
EclipseKeys.withJavadoc := true
24+
25+
// Accept Java 1.6 source and emit Java 1.6 class files from javac.
javacOptions ++= Seq("-source", "1.6", "-target", "1.6")
26+
27+
// Have scalac target JVM 1.6 bytecode as well, matching the javac setting above.
scalacOptions += "-target:jvm-1.6"
28+
29+
// EclipseKeys.eclipseOutput := Some("target")
30+
31+
// Compile the project before generating Eclipse files, so that generated .scala or .class files for views and routes are present
32+
EclipseKeys.preTasks := Seq(compile in Compile)
33+
34+
// To skip test during assembly
35+
// test in assembly := {}
36+
37+
// FastHLTA is not a managed dependency: the FastHLTA/bin directory under the
// project root is added as an unmanaged classpath entry for the Compile, Test
// and Runtime configurations, one setting per configuration.
unmanagedClasspath in Compile += baseDirectory.value / "FastHLTA" / "bin"
38+
39+
unmanagedClasspath in Test += baseDirectory.value / "FastHLTA" / "bin"
40+
41+
unmanagedClasspath in Runtime += baseDirectory.value / "FastHLTA" / "bin"

data

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/Users/kmpoon/Documents/research/sync/data/hillary-clinton-emails/hillary-clinton-emails-release-2015-09-11-01-39-01

data.txt

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
hillary.20160225.arff
2+
3+
n-grams, where n = 1 or 2. Term frequency over 5.
4+
5+
hillary.20160224.arff
6+
7+
n-grams, where n = 1 to 3. Term frequency over 5.
8+
9+
hillary.20160215.arff
10+
11+
1-grams (single words) only. Term frequency over 5.

project/assembly.sbt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package hillary
2+
3+
/**
 * Command-line entry point that builds a dictionary from the e-mails returned
 * by `Converter.readEmails` and saves it as "dictionary.all.csv".
 *
 * Unlike `Convert`, no term-frequency filter is applied here before saving.
 */
object BuildDictionary extends App {
  run()

  /**
   * Runs the whole pipeline, printing a progress message to standard output
   * before each stage.
   */
  def run(): Unit = {
    import Converter._

    println("Extracting bodies")
    // The third tuple component is used as the e-mail body. The list is
    // switched to a parallel collection so the later stages run in parallel.
    val bodies = readEmails.map(_._3).toList.par

    println("Counting words in each email")
    // `bodies` is already parallel, so `map` yields a parallel collection;
    // the extra `.par` the original applied here was a no-op.
    // NOTE(review): 3 is presumably the maximum n-gram length — confirm
    // against Converter.tokenizeAndCount.
    val countsByEmails = bodies.map(tokenizeAndCount(_, 3))

    println("Building Dictionary")
    val dictionary = buildDictionary(countsByEmails)

    println("Saving dictionary")
    dictionary.save("dictionary.all.csv")

    println("done")
  }

}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package hillary
2+
3+
/**
 * Small inspection tool: loads a saved dictionary and prints every entry whose
 * word ends with "nt" and is at most 8 characters long.
 *
 * Usage: `CheckDictionary [dictionaryCsvPath]`. When no argument is given the
 * original developer-local path is used, so existing invocations behave as
 * before.
 */
object CheckDictionary extends App {
  println("Welcome to the Scala worksheet")

  // Fallback location kept byte-for-byte from the original hard-coded path.
  private val defaultDictionaryPath =
    "/Users/kmpoon/Documents/research/workspace/HillaryEmails/" +
      "converted/hillary.20160226.dict.csv"

  // `args` is populated by App before this body runs; the first argument,
  // when present, overrides the default path.
  val dictionary = Dictionary.read(args.headOption.getOrElse(defaultDictionaryPath))

  dictionary.info
    .filter(w => w.word.endsWith("nt") && w.word.length <= 8)
    .foreach(println)
}

src/main/scala/hillary/Convert.scala

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2+
3+
package hillary
4+
5+
import java.io.FileReader
6+
import org.apache.commons.csv.CSVFormat
7+
import java.util.Date
8+
import java.text.SimpleDateFormat
9+
import scala.collection.mutable
10+
import scala.collection.GenSeq
11+
12+
/**
 * Command-line entry point that converts the e-mails returned by
 * `Converter.readEmails` into a bag-of-words data set.
 *
 * Outputs, all written to the working directory:
 *  - "dictionary.csv": the dictionary after the term-frequency filter
 *  - "bow.arff": the bag-of-words data saved through `saveAsArff`
 *  - "bow.txt": the same data saved through `saveAsBinaryHlcm`
 */
object Convert extends App {
  run(println)

  /**
   * Runs the conversion pipeline.
   *
   * @param log callback invoked with a progress message before each stage;
   *            its result is ignored except for the final "done" call, whose
   *            result is what this method returns
   */
  def run(log: (String) => Any) = {
    import Converter._

    // Largest n passed to find1ToNGrams; also reused for tokenization below.
    val maxN = 2
    // Dictionary entries with tf below this value are dropped.
    val minTf = 6

    log("Extracting bodies")
    // The third tuple component is used as the e-mail body. The list is
    // switched to a parallel collection so the later stages run in parallel.
    val bodies = readEmails.map(_._3).toList.par

    log("Extracting words")
    val wordsByEmails = bodies.map(tokenizeBySpace)

    log("Counting words in each email")
    val wordCountsByEmails = wordsByEmails
      .map(find1ToNGrams(_, maxN).flatten)
      .map(countWords)

    log("Building Dictionary")
    val dictionary = buildDictionary(wordCountsByEmails).filter(_.tf >= minTf)

    log("Saving dictionary")
    dictionary.save("dictionary.csv")

    // Re-tokenize each e-mail against the filtered dictionary.
    // The original passed a literal 2 here; `maxN` has the same value today
    // and keeps this step in step with the n-gram length used for the dictionary.
    // NOTE(review): assumes the third argument is the maximum n-gram length —
    // confirm against Converter.tokenizeWithoutConstituentTokens.
    val tokenCountsByEmails = wordsByEmails
      .map(words =>
        tokenizeWithoutConstituentTokens(words, dictionary.map.contains, maxN))
      .map(countWords)

    log("Converting to bow")
    val bow = convertToBow(tokenCountsByEmails, dictionary.map)

    log("Saving in ARFF format")
    // `.seq` turns the parallel collection back into a sequential one before writing.
    saveAsArff("hillary", "bow.arff",
      AttributeType.numeric, dictionary.words, bow.seq)
    saveAsBinaryHlcm("hillary", "bow.txt", dictionary.words, bow.seq)

    log("done")
  }
}

0 commit comments

Comments
 (0)