# Interleaved fastq QC workflow
version 1.0

workflow nmdc_rqcfilter {
    input {
        String container="bfoster1/img-omics:0.1.9"
        String bbtools_container="microbiomedata/bbtools:38.96"
        String workflowmeta_container="microbiomedata/workflowmeta:1.1.1"
        String proj
        String prefix=sub(proj, ":", "_")
        String input_fastq1
        String input_fastq2
        String database="/refdata/"
    }
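    # Example inputs JSON for this workflow (a sketch: the project ID and fastq
    # URLs below are placeholder values, not real data):
    # {
    #   "nmdc_rqcfilter.proj": "nmdc:omprc-11-example",
    #   "nmdc_rqcfilter.input_fastq1": "https://example.org/sample_R1.fastq.gz",
    #   "nmdc_rqcfilter.input_fastq2": "https://example.org/sample_R2.fastq.gz",
    #   "nmdc_rqcfilter.database": "/refdata/"
    # }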
    call stage {
        input: container=bbtools_container,
            memory="10G",
            input_fastq1=input_fastq1,
            input_fastq2=input_fastq2
    }
    # Estimate RQC runtime at an hour per compressed GB
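    # (e.g. a 6 GB interleaved .fastq.gz input would be budgeted roughly 6 hours)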
    call rqcfilter as qc {
        input: input_files=stage.interleaved_reads,
            threads="16",
            database=database,
            memory="60G",
            container=bbtools_container
    }
    call make_info_file {
        input: info_file=qc.info_file,
            container=container,
            prefix=prefix
    }
    call finish_rqc {
        input: container=workflowmeta_container,
            prefix=prefix,
            filtered=qc.filtered,
            filtered_stats=qc.stat,
            filtered_stats2=qc.stat2
    }
    output {
        File filtered_final = finish_rqc.filtered_final
        File filtered_stats_final = finish_rqc.filtered_stats_final
        File filtered_stats2_final = finish_rqc.filtered_stats2_final
        File rqc_info = make_info_file.rqc_info
    }
}
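
# Example invocation (a sketch; assumes a local Cromwell jar and an inputs.json
# shaped like the example above):
#   java -jar cromwell.jar run interleave_rqcfilter.wdl -i inputs.json
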
task stage {
    input {
        String container
        String memory
        String target_reads_1="raw_reads_1.fastq.gz"
        String target_reads_2="raw_reads_2.fastq.gz"
        String output_interleaved="raw_interleaved.fastq.gz"
        String input_fastq1
        String input_fastq2
    }
    command <<<
        set -euo pipefail
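        # Stage the raw reads: download them when the inputs are URLs,
        # otherwise symlink (falling back to a copy) the local files.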
        if [ $( echo ~{input_fastq1} | egrep -c "https*:") -gt 0 ] ; then
            wget ~{input_fastq1} -O ~{target_reads_1}
            wget ~{input_fastq2} -O ~{target_reads_2}
        else
            ln -s ~{input_fastq1} ~{target_reads_1} || cp ~{input_fastq1} ~{target_reads_1}
            ln -s ~{input_fastq2} ~{target_reads_2} || cp ~{input_fastq2} ~{target_reads_2}
        fi
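        # Interleave the paired-end reads into a single fastq with BBTools reformat.sh.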
        reformat.sh -Xmx~{memory} in1=~{target_reads_1} in2=~{target_reads_2} out=~{output_interleaved}
        # Capture the start time
        date --iso-8601=seconds > start.txt
    >>>
    output {
        File interleaved_reads = "~{output_interleaved}"
        String start = read_string("start.txt")
    }
    runtime {
memory: "1 GiB"
        cpu: 2
        maxRetries: 1
        docker: container
    }
}
task rqcfilter {
    input {
        File input_files
        String container
        String database
        String rqcfilterdata = database + "/RQCFilterData"
        Boolean chastityfilter_flag=true
        String? memory
        String? threads
        String filename_outlog="stdout.log"
        String filename_errlog="stderr.log"
        String filename_stat="filtered/filterStats.txt"
        String filename_stat2="filtered/filterStats2.txt"
        String filename_stat_json="filtered/filterStats.json"
        String filename_reproduce="filtered/reproduce.sh"
        String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)"
        String jvm_threads=select_first([threads,system_cpu])
        String chastityfilter= if (chastityfilter_flag) then "cf=t" else "cf=f"
    }
    runtime {
        docker: container
        memory: "70 GB"
        cpu: 16
    }
    command <<<
        export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P"
        set -euo pipefail
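        # Run BBTools rqcfilter2.sh: adapter trimming, quality/length filtering,
        # PhiX and host (human/dog/cat/mouse) removal, common-microbe removal,
        # and Clumpify; all results are written under ./filtered.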
        rqcfilter2.sh \
            ~{if (defined(memory)) then "-Xmx" + memory else "-Xmx60G"} \
            -da \
            threads=~{jvm_threads} \
            ~{chastityfilter} \
            jni=t \
            in=~{input_files} \
            path=filtered \
            rna=f \
            trimfragadapter=t \
            qtrim=r \
            trimq=0 \
            maxns=3 \
            maq=3 \
            minlen=51 \
            mlf=0.33 \
            phix=t \
            removehuman=t \
            removedog=t \
            removecat=t \
            removemouse=t \
            khist=t \
            removemicrobes=t \
            sketch \
            kapa=t \
            clumpify=t \
            barcodefilter=f \
            trimpolyg=5 \
            usejni=f \
            rqcfilterdata=~{rqcfilterdata} \
            > >(tee -a ~{filename_outlog}) \
            2> >(tee -a ~{filename_errlog} >&2)
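        # Convert the key=value pairs in filterStats.txt into JSON. The file is
        # expected to hold one key=value pair per line, roughly like
        # (illustrative values, not real output):
        #   inputReads=1000000
        #   outputReads=950000
        #   outputReadsRatio=0.95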
        python <<CODE
import json
f = open("~{filename_stat}",'r')
d = dict()
for line in f:
    if not line.rstrip():continue
    key,value=line.rstrip().split('=')
    d[key]=float(value) if 'Ratio' in key else int(value)
with open("~{filename_stat_json}", 'w') as outfile:
    json.dump(d, outfile)
CODE
    >>>
    output {
        File stdout = filename_outlog
        File stderr = filename_errlog
        File stat = filename_stat
        File stat2 = filename_stat2
        File info_file = filename_reproduce
        File filtered = glob("filtered/*fastq.gz")[0]
        File json_out = filename_stat_json
    }
}
task make_info_file {
    input {
        File info_file
        String prefix
        String container
    }
    command <<<
        set -euo pipefail
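        # Pull the tool/version lines (lines 2-5) from rqcfilter's reproduce.sh,
        # strip absolute input paths and leading '#' characters, and tag the
        # BBTools line with a footnote marker for the citation appended below.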
        sed -n 2,5p ~{info_file} 2>&1 | \
            perl -ne 's:in=/.*/(.*) :in=$1:; s/#//; s/BBTools/BBTools(1)/; print;' > \
            ~{prefix}_readsQC.info
        echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> \
            ~{prefix}_readsQC.info
    >>>
    output {
        File rqc_info = "~{prefix}_readsQC.info"
    }
    runtime {
        memory: "1 GiB"
        cpu: 1
        maxRetries: 1
        docker: container
    }
}
task finish_rqc {
    input {
        # File read
        File filtered_stats
        File filtered_stats2
        File filtered
        String container
        String prefix
        # String start
    }
    command <<<
        set -euo pipefail
        end=`date --iso-8601=seconds`
        # Generate QA objects
        ln -s ~{filtered} ~{prefix}_filtered.fastq.gz
        ln -s ~{filtered_stats} ~{prefix}_filterStats.txt
        ln -s ~{filtered_stats2} ~{prefix}_filterStats2.txt
        # Generate stats but rename some fields until the script is fixed.
        /scripts/rqcstats.py ~{filtered_stats} > stats.json
        cp stats.json ~{prefix}_qa_stats.json
    >>>
    output {
        File filtered_final = "~{prefix}_filtered.fastq.gz"
        File filtered_stats_final = "~{prefix}_filterStats.txt"
        File filtered_stats2_final = "~{prefix}_filterStats2.txt"
    }
    runtime {
        docker: container
        memory: "1 GiB"
        cpu: 1
    }
}