forked from andreas-wilm/compbio-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecont.sh
executable file
·313 lines (276 loc) · 8.96 KB
/
decont.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#!/bin/bash
set -o pipefail
# Decontaminate FastQ (single-end, paired-end, optionally gzipped) by
# mapping against a contamination source
echoinfo() {
echo "INFO: $@" 1>&2
}
echofatal() {
echo "FATAL: $@" 1>&2
}
# defaults
# illumina quality encoding
illumina=0
# number of threads to use
threads=4
# keep temp directory
keep_temp=0
# default picard directory; override with env var PICARDDIR
PICARDDIR_DEFAULT="/mnt/software/stow/picard-1.74/bin/"
usage() {
# keep in sync with arg parsing below
cat <<EOF
$(basename $0): decontaminate fastq
Performs a mapping of given SR/PE reads (gzip supported) with BWA
against given source of contamination and produces a BAM file with
contaminated reads and new fastq file(s) with clean reads (qualities
will be Sanger encoded).
Prerequisites: BWA, Picard and samtools. Point the PICARDDIR
environment variable to your picard installation (will use
$PICARDDIR_DEFAULT otherwise)
Mandatory options:
-f | --fastq1 : Input fastq[.gz] file
-r | --ref : Reference (contamination source) fasta file
-o | --outprefix : Output prefix
Optional:
-g | --fastq2 : Fastq[.gz], second in pair (optional)
-t | --threads : Number of threads to use (default=$threads)
-I | --illumina : Phred qualities are ASCII64, ie. Illumina 1.3-1.7 instead of Sanger (check with FastQC)
-T | --tmpdir : Use this as temp directory instead of automatically determined one
-K | --keep : Keep temp directory
-B | --reusebam : Reuse already created BAM, which contains unmapped reads and reads mapped against the contaminant
Still needs original fastq files for auto setting of output filenames and determining SR or PE.
-h | --help : Display this help
EOF
}
# check for required programs
#
for bin in java samtools bwa; do
if ! which $bin >/dev/null 2>&1; then
echofatal "Couldn't find $bin. make sure it's in your path"
exit 1
fi
done
test -z "$PICARDDIR" && export PICARDDIR=$PICARDDIR_DEFAULT
# check for picard needed for samtofastq adding (needed for GATK)
picard_samtofastq_jar=${PICARDDIR}/SamToFastq.jar
if [ ! -s $picard_samtofastq_jar ]; then
echofatal "Couldn't find Picard's $(basename picard_samtofastq_jar). Please set PICARDDIR to your Picard installation"
exit 1
fi
# parse arguments
#
reusebam=""
reffa=""
fastq1=""
while [ "$1" != "" ]; do
case $1 in
-f | --fastq1 )
shift
fastq1=$1
test -e $fastq1 || exit 1
;;
-g | --fastq2 )
shift
fastq2=$1
test -e $fastq2 || exit 1
;;
-h | --help )
usage
exit
;;
-I | --illumina )
illumina=1
;;
-K | --keep )
keep_temp=1
;;
-o | --outprefix )
shift
outprefix=$1
;;
-r | --ref )
shift
reffa=$1
test -e $reffa || exit 1
;;
-t | --threads )
shift
threads=$1
;;
-T | --tmpdir )
shift
tmpdir=$1
;;
-B | --reusebam )
shift
reusebam=$1
test -e $reusebam || exit 1
;;
* )
echofatal "unknown argument \"$1\""
usage
exit 1
esac
shift
done
# check arguments
#
if [ -z "$fastq1" ]; then
echofatal "fastq file \"$fastq1\" missing"
usage
exit 1
fi
if [ -z "$reffa" ] && [ -z "$reusebam" ]; then
echofatal "reference fasta file \"$reffa\" missing"
usage
exit 1
fi
if [ -z "$outprefix" ]; then
echofatal "output prefix missing"
usage
exit 1
fi
if [ -z "$tmpdir" ]; then
tmpdir=$(mktemp --tmpdir -d "$(basename $0).XXXXXX")
fi
echoinfo "Using temp dir $tmpdir"
# set first batch of output file names that we need early on
contbam=${outprefix}_cont.bam
fastq_clean_1=${outprefix}_1.fastq
if [ ! -z $fastq2 ]; then
fastq_clean_2=${outprefix}_2.fastq
fi
for f in $fastq_clean_1 ${fastq_clean_1}.gz $contbam; do
if [ -s $f ]; then
echofatal "refusing to overwrite already existing file $f"
exit 1
fi
done
#
# Now after all this paranoia checking and user friendliness, can we
# please get on with it?
#
# index reference if necessary
if [ -n "$reffa" ]; then
test -s ${reffa}.pac || bwa index $reffa || exit 1
fi
allbam=$tmpdir/all.bam
if [ -n "$reusebam" ]; then
allbam=$reusebam
fi
bwalog=$tmpdir/bwa.log
# STEP 1: bwa aln
# --------------------------------------------------------------------
# -q 3: remove q3 in accordance with illumina guidelines
bwa_aln_extra_args="-t $threads -q 3"
if [ $illumina -eq 1 ]; then
# -I: if phred qualities are ascii64, ie. Illumina 1.3-1.7
bwa_aln_extra_args="$bwa_aln_extra_args -I"
fi
# bwa aln for each fastq. skip if sai already exists
sais=""
for fastq in $fastq1 $fastq2; do
sai=$tmpdir/$(basename $fastq .gz | sed -e 's,.fastq$,,' | sed -e 's,.txt$,,').sai
sais="$sais $sai"
if [ -s "$sai" ]; then
echoinfo "Reusing already existing $sai"
continue
fi
if [ -s "$allbam" ]; then
echoinfo "Skipping alignment step (bwa aln) and reusing already existing $allbam"
continue
fi
#cat <<EOF
if ! bwa aln $bwa_aln_extra_args -f $sai $reffa $fastq >> $bwalog 2>&1; then
echofatal "bwa aln failed. See $bwalog"
exit 1
fi
#EOF
done
echoinfo "BWA aln done"
# STEP 2: bwa sam[sp]e
# --------------------------------------------------------------------
# SR/PE specific options to bwa
bwa_samse_extra_args=""
bwa_sampe_extra_args="-s"
# -s disable Smith-Waterman for the unmapped mate
if [ -z $fastq2 ]; then
args="samse $bwa_samse_extra_args $reffa $sais $fastq1"
# remove unmapped reads from single end mapping
else
args="sampe $bwa_sampe_extra_args $reffa $sais $fastq1 $fastq2"
# keep only reads mapped in proper pair
fi
if [ -s "$allbam" ]; then
echoinfo "Skipping sam conversion and reusing already existing $allbam"
else
#cat <<EOF
if ! bwa $args 2>>$bwalog | samtools view -bS - > $allbam; then
echofatal "bwa or samtools failed. see $bwalog"
exit 1
fi
#EOF
fi
echoinfo "BWA sam[sp]e done"
# STEP 3: split into contaminated reads (BAM) and clean FastQ
# --------------------------------------------------------------------
# Not using samtools view with -f/-F (0x4 segment unmapped, 0x8 next
# segment in the template unmapped), but awk to save one reading
# operation and also the following reason: In case of PE reads, we
# consider clean if neither read nor mate are mapped read 0x4 and mate
# 0x8 unmapped. Note, samtools view -f 12 (note: not 0x12!) does not
# allow for mapped reads and unmapped mates and vice versa. At the
# same time you can give -F/-f only once to samtools. Another
# advantage of using awk is that you can avoid a nasty BWA bug, which
# results in inconsistent mate-pair info. See
# http://www.biostars.org/p/60100/ and
# http://www.biostars.org/p/60001/
#
# Use picard for bam2fastq because it's extra pedantic. Good
# alternative would be
# http://www.hudsonalpha.org/gsl/software/bam2fastq.php which has the
# (dis)advantage of simply skipping unpaired mate-pairs. With
# "LENIENT" Picard will ignore errors like "Error parsing text SAM
# file. MAPQ should be 0 for unmapped read." for cicular chromosomes.
#
# All communication via fifo to save space and only save BAM/gz.
# Disadvantage: can't catch errors directly, only via log.
# setup fifos
fastq_clean_1_fifo=$tmpdir/$(basename $fastq_clean_1 fastq)fifo
if [ ! -z $fastq2 ]; then
fastq_clean_2_fifo=$tmpdir/$(basename $fastq_clean_2 fastq)fifo
fi
contbam_fifo=$tmpdir/$(basename $contbam bam)fifo
fifos="$fastq_clean_1_fifo $fastq_clean_2_fifo $contbam_fifo"
for fifo in $fifos; do
test -e $fifo && rm $fifo
mkfifo $fifo
done
execlog=${outprefix}_decont-exec.log
samtools view -bS - < $contbam_fifo > $contbam 2>>$execlog &
gzip < $fastq_clean_1_fifo > ${fastq_clean_1}.gz 2>>$execlog &
if [ -z "$fastq2" ] && [ ! -s "$allbam" ]; then
echofatal "Filtering for SE reads not yet implemented (awk cmd)"
exit 1
else
gzip < $fastq_clean_2_fifo > ${fastq_clean_2}.gz 2>>$execlog &
samtofastq_pe_arg="SECOND_END_FASTQ=$fastq_clean_2_fifo"
fi
echoinfo "Splitting into contaminated BAM and clean fastq..."
samtools view -h $allbam 2>>$execlog | \
awk -v fifo=$contbam_fifo 'BEGIN {FS="\t"; OFS="\t"}
{if (/^@/ && substr($2, 3, 1)==":") {print >> fifo; next}
if (and($2, 0x4) && and($2, 0x8) && $3=="*") {print} else {print >> fifo}}' | \
java -jar $picard_samtofastq_jar \
VERBOSITY=ERROR QUIET=TRUE \
INCLUDE_NON_PF_READS=true VALIDATION_STRINGENCY=LENIENT \
INPUT=/dev/stdin FASTQ=$fastq_clean_1_fifo $samtofastq_pe_arg 2>>$execlog
rm $fifos
grep -v '[samopen] SAM header is present:' $execlog 1>&2
if [ $keep_temp -ne 1 ]; then
test -d $tmpdir && rm -rf $tmpdir
else
echoinfo "Keeping $tmpdir"
fi
echoinfo "Successful exit"