/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.reachables

import java.io.{File, PrintWriter}

import act.server.MongoDB

import scala.collection.JavaConversions._
import scala.io.Source

object postprocess_reachables {
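  // Name of the MongoDB installation used to fetch extended chemical attributes during
  // serialization; can be overridden on the command line with --defaultDbName.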
  private var currentDatabase = "jarvis_2016-12-09"
  private lazy val defaultDbName = getDefaultDbName

  private def getDefaultDbName: String = currentDatabase

  def main(args: Array[String]) {
    if (args.length == 0) {
      println("Usage: run --prefix=PRE --regressionSuiteDir=path --extractReachables --writeGraphToo --defaultDbName=DB_NAME")
      println("Example: run --prefix=r")
      println("  will create the reachables tree with prefix r, by default using only enzymes that have sequences")
      println("Example: run --prefix=r --regressionSuiteDir=path")
      println("  will just run the regressions over a dataset with prefix 'r'")
      println("Example: run --prefix=r --extractReachables")
      println("  will convert the actdata structure with the specified prefix to trees/tables")
      System.exit(-1)
    }

    val params = new CmdLine(args)
    val prefix = params.get("prefix") match {
      case Some(x) => x
      case None => println("Need --prefix. Abort"); System.exit(-1); ""
    }

    val write_other_formats = params.get("extractReachables").isDefined

    val outputDirectory = params.get("output-dir") match {
      case Some(x) => x
      case None => ""
    }

    params.get("defaultDbName") match {
      case Some(x) => currentDatabase = x
      case None => // let the default hold
    }

    val regression_suite_files: Set[String] =
      params.get("regressionSuiteDir") match {
        case Some(dir) => {
          val files = new File(dir).listFiles
          val testfiles = files
            .map(n => n.getAbsolutePath)
            .filter(_.endsWith(".test.txt"))
          testfiles.toSet
        }
        case _ => Set()
      }

    val write_graph_too = params.get("writeGraphToo").isDefined

    println("Deserializing reachables graph")
    ActData.instance.deserialize(new File(outputDirectory, prefix + ".actdata").getAbsolutePath)
    val tree = ActData.instance().getActTree

    // Get the InChI and readable name for every reachable node:
    // Network.nodesAndIds(): Map[Node, Long], i.e., maps nodes -> ids.
    // id2InChIName looks each id up in ActData's chemId -> InChI and
    // chemId -> readable-name maps, so mapping over the node ids yields
    // a Map[Long, (inchi, name)] keyed by chemical id.
    val reachables: Map[Long, (String, String)] = tree.nodesAndIds.map(x => id2InChIName(x._2)).toMap
    def fst(x: (String, String)) = x._1
    val r_inchis: Set[String] = reachables.values.toSet.map(fst) // reachables.values are (inchi, name)

    if (write_other_formats) {
      write_reachable_tree(prefix, write_graph_too, reachables, outputDirectory)
    }

    if (regression_suite_files.nonEmpty) {
      run_regression_suite(prefix, regression_suite_files, r_inchis, outputDirectory)
    }
  }
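
  // Maps a chemical id to a pair of (InChI, human-readable name) looked up
  // from the deserialized ActData instance.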
  def id2InChIName(id: Long) = id ->
    (ActData.instance().mapChemId2Inchis(id), ActData.instance().mapChemId2ReadableName(id))
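
  // Runs every provided regression test file against the reachable InChIs and
  // writes one report per file under the `PREFIX.regressions/` directory.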
  def run_regression_suite(prefix: String, regression_suite_files: Set[String], r_inchis: Set[String], outputDirectory: String): Unit = {
    val rdir = new File(outputDirectory, prefix + ".regressions/").getAbsolutePath // regression output directory

    // create the output directory for regression test reports, if it does not already exist
    mk_regression_test_reporting_dir(rdir)
    // run the regression suites that were provided
    regression_suite_files.foreach(test => reachables.runRegression(r_inchis, test, rdir))
  }
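
  // Writes the reachables out in several formats: a flat id/name/InChI list, a
  // depth-indexed expansion table, the disjoint trees as JSON, and (optionally)
  // the disjoint graphs as JSON.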
  def write_reachable_tree(prefix: String, write_graph_too: Boolean, reachables: Map[Long, (String, String)], outputDirectory: String) {
    val g = new File(outputDirectory, prefix + ".graph.json").getAbsolutePath     // output file for the graph JSON
    val t = new File(outputDirectory, prefix + ".trees.json").getAbsolutePath     // output file for the tree JSON
    val r = new File(outputDirectory, prefix + ".reachables.txt").getAbsolutePath // output file for the list of all reachables
    val e = new File(outputDirectory, prefix + ".expansion.txt").getAbsolutePath  // output file for the tree structure of the reachables expansion

    // Connect to the DB so that extended attributes for chemicals can be fetched as we serialize.
    val db = new MongoDB("localhost", 27017, defaultDbName)

    println("Writing disjoint graphs to " + g + " and forest to " + t)

    val tree = ActData.instance().getActTree

    def tab(id_inchi_name: (Long, (String, String))) =
      id_inchi_name._1 + "\t" + id_inchi_name._2._2 + "\t" + id_inchi_name._2._1
    write_to(r, reachables.map(tab).reduce(_ + "\n" + _))
    println("Done: Written reachables list to: " + r)

    write_to(e, tree2table(tree, reachables))
    println("Done: Written reachables tree as spreadsheet to: " + e)

    val disjointtrees = tree.disjointTrees(db) // a JSONObject
    val treejson = disjointtrees.toString(2)   // serialized using indent = 2
    write_to(t, treejson)
    println("Done: Written disjoint trees to: " + t)

    if (write_graph_too) {
      println("postprocess_reachables.scala: You asked to write the graph, in addition to the default tree.")
      println("postprocess_reachables.scala: This will most likely run out of memory")
      val disjointgraphs = tree.disjointGraphs(db) // a JSONArray
      val graphjson = disjointgraphs.toString(2)   // serialized using indent = 2
      write_to(g, graphjson)
      println("postprocess_reachables.scala: Done writing disjoint graphs")
    }

    println("Done: Written reachables to trees (and graphs, if requested).")
  }
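
  // Renders the reachables tree as a spreadsheet-style table: one row per depth in the
  // tree, with the node descriptions at that depth separated by tabs. Each entry is of
  // the form `id: name inchi` (see node2str below).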
  def tree2table(tree: Network, reachables: Map[Long, (String, String)]) = {
    // helper function to go from
    // java.util.HashMap[java.lang.Long, java.lang.Integer] to Map[Long, Int]
    def ident(id: Long) = id -> Int.unbox(tree.nodeDepths.get(id))

    // run the helper over nodeDepths to get a node_id -> depth map
    val nodes_depth: Map[Long, Int] = tree.nodeDepths.map(x => ident(x._1)).toMap

    // get all depths as a set (this should be a contiguous range from 0 to n)
    val all_depths = nodes_depth.values.toSet

    // construct the inverse map of depth -> set of nodes at that depth
    val depth_to_nodes: Map[Int, Set[Long]] = {
      // checks whether a particular (node, depth) pair is at depth `d`
      def is_at_depth(d: Int, nid_d: (Long, Int)): Boolean = nid_d._2 == d

      // takes a depth `d` and returns the set of nodes at that depth
      def depth2nodes(d: Int): Set[Long] = {
        // filter the node_id -> depth map down to the entries at depth `d`
        val atdepth: Map[Long, Int] = nodes_depth.filter(x => is_at_depth(d, x))
        // take the (node_id, depth) pairs at depth `d` and return the set of
        // their node_ids by unzipping and converting to a set
        atdepth.toList.unzip._1.toSet
      }

      // now map each depth `x` to the set of nodes at depth `x`
      all_depths.map(x => x -> depth2nodes(x)).toMap
    }

    // using the above map of `depth -> set(nodes at that depth)`,
    // create a map of `depth -> set(node description strings at that depth)`
    def node2str(id: Long) = {
      val inchi_name: Option[(String, String)] = reachables.get(id)
      val description = inchi_name match {
        case None => "no name/inchi"
        case Some((inchi, name)) => name + " " + inchi
      }
      id + ": " + description
    }

    // create a map with the metadata strings in the value fields
    val depth_to_nodestrs = depth_to_nodes.map { case (k, v) => (k, v.map(node2str)) }

    // sort the list of depths, so that we can print them out in order
    val sorted_depths = all_depths.toList.sorted

    // go from the list of depths to the sets of node strings at each depth
    val projected_nodes = sorted_depths.map(depth => depth_to_nodestrs(depth))

    // collapse all node strings at each depth into a tab-separated row
    val node_lines = projected_nodes.map(set => set.reduce(_ + "\t" + _))

    // return the rows concatenated together with newlines
    node_lines.reduce(_ + "\n" + _)
  }
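
  // Reads a tab-separated `.test.txt` file, checks each row against the reachable InChIs,
  // and writes a pass/fail report to the output report directory. Judging from the header
  // check and parsing below, a test file looks roughly like (columns in any order):
  //   inchi<TAB>name<TAB>plausibility<TAB>comment<TAB>reference
  //   InChI=1S/...<TAB>some chemical<TAB>TRUE<TAB>...<TAB>...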
  def run_regression(reachable_inchis: Set[String], test_file: String, output_report_dir: String) {
    val testlines: List[String] = Source.fromFile(test_file).getLines.toList
    val testcols: List[List[String]] = testlines.map(line => line.split("\t").toList)

    val hdrs = Set("inchi", "name", "plausibility", "comment", "reference")
    if (testcols.isEmpty || !testcols(0).toSet.equals(hdrs)) {
      println("Invalid test file: " + test_file)
      println("\tExpected: " + hdrs.toString)
      println("\tFound: " + (if (testcols.isEmpty) "no header row" else testcols(0).toString))
    } else {
      // drop the header from the data set, leaving behind only the test rows
      val hdr = testcols(0)
      val rows = testcols.drop(1)

      def add_hdrs(row: List[String]) = hdr.zip(row)

      // partition the test rows based on whether this reachables set passes or fails them
      val (passed, failed) = rows.partition(testrow => run_regression(add_hdrs(testrow), reachable_inchis))

      val report = generate_report(test_file, passed, failed)
      write_to(output_report_dir + "/" + new File(test_file).getName, report)

      println("Regression file: " + test_file)
      println("Total tests: " + rows.length + " (passed, failed): (" + passed.length + ", " + failed.length + ")")
    }
  }
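
  // Ensures the regression report directory exists, aborting if the path is already taken by a regular file.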
  def mk_regression_test_reporting_dir(dir: String) {
    val dirl = new File(dir)
    if (dirl.exists) {
      if (dirl.isFile) {
        println(dir + " already exists as a file. Need it as a dir for regression output. Abort.")
        System.exit(-1)
      }
    } else {
      dirl.mkdir()
    }
  }
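
  // Builds the text of a regression report: a summary header followed by the failed
  // (and, if enabled, the passed) test rows.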
  def generate_report(f: String, passed: List[List[String]], failed: List[List[String]]) = {
    val total = passed.length + failed.length
    val write_successes = false
    val write_failures = true

    val lines =
      // add a summary to the head of the report file
      List(
        "** Regression test result for " + f,
        "\tTOTAL: " + total + " PASSED: " + passed.length,
        "\tTOTAL: " + total + " FAILED: " + failed.length
      ) ++ (
        // add details of the cases that succeeded
        if (write_successes)
          passed.map("\t\tPASSED: " + _)
        else
          List()
      ) ++ (
        // add details of the cases that failed
        if (write_failures)
          failed.map("\t\tFAILED: " + _)
        else
          List()
      )

    // assemble the report as a newline-separated string and return it
    lines.reduce(_ + "\n" + _)
  }
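
  // A single test row passes when the InChI's membership in the reachable set
  // matches the row's `plausibility` flag.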
  def run_regression(row: List[(String, String)], reachable_inchis: Set[String]): Boolean = {
    val data = row.toMap
    val inchi = data.getOrElse("inchi", "") // inchi column
    val should_exist = data.getOrElse("plausibility", "TRUE").toBoolean // plausibility column
    val exists = reachable_inchis.contains(inchi)

    if (should_exist)
      exists  // if the chemical should exist, the test passes when it is reachable
    else
      !exists // if it should not exist, the test passes when it is absent from the reachables
  }
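
  // Writes the given string to a file, replacing any existing contents.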
  def write_to(fname: String, contents: String) {
    val file = new PrintWriter(new File(fname))
    file.write(contents)
    file.close()
  }
}