This repository has been archived by the owner on Jul 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbam_filter.py
executable file
·363 lines (283 loc) · 10.7 KB
/
bam_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#!/usr/bin/env python
#/usr/bin/env python
import string
import os, sys
import pysam
from optparse import OptionParser
#################################
# Simple Error Printing Funtion #
#################################
def DoError(ErrorString):
print "!!!Error:", ErrorString,"!!!"
sys.exit()
#############################################
# Function to reverse complement a sequence #
#############################################
def revcomp(sequence):
rev=sequence[::-1]
revcomp=''
d={'A':'T', 'T':'A', 'C':'G', 'G':'C', 'a':'t', 't':'a', 'g':'c', 'c':'g', "n":"n", "N":"N"}
for i in rev:
if d.has_key(i):
revcomp=revcomp+d[i]
else:
revcomp=revcomp+i
return revcomp
##########################################
# Function to Get command line arguments #
##########################################
def main():
usage = "usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-b", "--bam", action="store", dest="bam", help="input file in sam or bam format (must have suffix .sam or .bam)", default="", metavar="FILE")
parser.add_option("-o", "--output", action="store", dest="output", help="output prefix for file name(s)", default="", metavar="FILE")
parser.add_option("-f", "--format", type="choice", dest="fileformat", choices=["fasta","fastq","pairedfastq","sam","bam"], action="store", help="output file format (sam, bam, fastq, pairedfastq, or fasta) [default=%default]", default="pairedfastq", metavar="FORMAT")
parser.add_option("-t", "--type", action="store", type="choice", dest="outputtype", choices=["unmapped","mapped", "paired", "all","contigs", "minusdup", "onemapped"], help="reads to output (all, mapped, paired, unmapped, minusdup, onemapped or contigs). Note 1: for pairedfastq output, mapped includes all reads where one of the pair is mapped. paired includes only those reads where the reads are in a proper pair. For other output formats the mapped option will only print the individual mapped reads and paired requires the reads to be mapped in proper pairs. Note 2: the contigs option requires the -c option to be set (see below). [default=%default]", default="unmapped", metavar="FILE")
parser.add_option("-c", "--contigs", action="store", dest="contigs", help="comma separated list of contigs for which mapped reads should be output. Note there should be no whitespace within the list", default="")
return parser.parse_args()
################################
# Check command line arguments #
################################
def check_input_validity(options, args):
if options.bam=='':
DoError('No bam/sam input file selected')
elif options.bam.split('.')[-1] not in ['sam', 'bam']:
DoError('Input file '+options.bam+' must have the suffix .sam or .bam')
elif not os.path.isfile(options.bam):
DoError('Cannot find file '+options.bam)
if options.output=="":
DoError('No output file name (-o) selected')
if options.outputtype=="contigs" and len(options.contigs)==0:
DoError('Contigs options (-c) must be used when contigs is selected as read output type')
if len(options.contigs)>0:
options.contiglist=options.contigs.split(',')
if options.outputtype=="all" and options.fileformat in ["sam", "bam"]:
DoError('You do not need to use this program to save all reads in bam/sam format! Use samtools view instead')
return
#############################
# Print read to output file #
#############################
def print_read_to_file(out, samread, format, sammate=False, mateout=False):
if samread.is_reverse:
samreadseq=revcomp(samread.seq)
samreadqual=samread.qual[::-1]
else:
samreadseq=samread.seq
samreadqual=samread.qual
if sammate and sammate.is_reverse:
sammateseq=revcomp(sammate.seq)
sammatequal=sammate.qual[::-1]
elif sammate!=False:
sammateseq=sammate.seq
sammatequal=sammate.qual
if format in ["sam", "bam"]:
out.write(read)
elif format=="fasta":
print >>out, ">"+samread.qname
print >>out, samreadseq
elif format=="fastq":
print >>out, "@"+samread.qname
print >>out, samreadseq
print >> out, "+"
print >> out, samreadqual
elif format=="pairedfastq" and sammate and mateout:
if samread.is_read1 and sammate.is_read2:
print >>out, "@"+samread.qname
print >>out, samreadseq
print >> out, "+"
print >> out, samreadqual
print >>mateout, "@"+sammate.qname
print >>mateout, sammateseq
print >> mateout, "+"
print >> mateout, sammatequal
elif samread.is_read2 and sammate.is_read1:
print >>out, "@"+sammate.qname
print >>out, sammateseq
print >> out, "+"
print >> out, sammatequal
print >>mateout, "@"+samread.qname
print >>mateout, samreadseq
print >> mateout, "+"
print >> mateout, samreadqual
return
########
# Main #
########
if __name__ == "__main__":
#Get command line arguments
(options, args) = main()
#Do some checking of the input files
check_input_validity(options, args)
#open the bam/sam file
if options.bam.split(".")[-1]=="bam":
print "Reading bam file"
try:
samfile = pysam.Samfile( options.bam, "rb" )
except StandardError:
DoError('Failed to open '+options.bam+'. Is it in bam format?')
elif options.bam.split(".")[-1]=="sam":
print "Reading sam file"
try:
samfile = pysam.Samfile( options.bam, "r" )
except StandardError:
DoError('Failed to open '+options.bam+'. Is it in sam format?')
#get reference names and lengths from the sam file header
refs=samfile.references
lengths=samfile.lengths
#if the contigs option has been chosen, check that the contigs specified are in the sam header
if options.outputtype=="contigs":
for contig in options.contiglist:
if not contig in refs:
options.contiglist.remove(contig)
if len(options.contiglist)==0:
DoError('None of the contigs you specified are in the bam/sam header')
#Create headers for sam/bam output
if options.fileformat in ["sam", "bam"]:
print "Adding sam headers"
newrefs=[]
newlengths=[]
if options.outputtype=="contigs":
for x, ref in enumerate(refs):
if ref in options.contiglist:
newrefs.append(ref)
newlengths.append(lengths[x])
else:
newrefs=refs
newlengths=lengths
#open the output file
if options.fileformat=="fasta":
output=open(options.output+".fasta", "w")
elif options.fileformat=="fastq":
output=open(options.output+".fastq", "w")
elif options.fileformat=="pairedfastq":
output=open(options.output+"_1.fastq", "w")
routput=open(options.output+"_2.fastq", "w")
elif options.fileformat=="sam":
output=pysam.Samfile(options.output+".sam", mode='wh', referencenames=newrefs, referencelengths=newlengths)
elif options.fileformat=="bam":
output=pysam.Samfile(options.output+".bam", mode='wb', referencenames=newrefs, referencelengths=newlengths)
else:
DoError('Somehow gained a file format')
print "Converting file"
if options.fileformat=="pairedfastq":
firstofpair={}
if options.outputtype=="minusdup":
prevdup=False
prevtoprint=False
remdups=set()
#Iterate through input file and print appropriate lines
for read in samfile:
if options.outputtype=="minusdup" and not prevdup:
prevdup=read
if options.fileformat=="pairedfastq" or options.outputtype=="minusdup":
if len(read.qname.split("/"))>1:
readname="/".join(read.qname.split("/")[:-1])
else:
readname=read.qname
if options.outputtype=="minusdup":
if len(prevdup.qname.split("/"))>1:
prevname="/".join(prevdup.qname.split("/")[:-1])
else:
prevname=prevdup.qname
printread=False
printprev=False
if options.outputtype=="all":
if options.fileformat=="pairedfastq":
if readname in firstofpair:
printread=True
else:
firstofpair[readname]=read
else:
printread=True
elif read.is_unmapped and options.outputtype=="unmapped":
if options.fileformat=="pairedfastq":
if readname in firstofpair:
printread=True
else:
firstofpair[readname]=read
else:
printread=True
elif (not read.is_unmapped or not read.mate_is_unmapped) and options.outputtype=="onemapped":
if options.fileformat=="pairedfastq":
if readname in firstofpair:
printread=True
else:
firstofpair[readname]=read
else:
printread=True
elif not read.is_unmapped and (options.outputtype=="mapped" or (options.outputtype=="contigs" and refs[read.rname] in options.contiglist)):
if options.fileformat=="pairedfastq":
if not read.mate_is_unmapped and read.mrnm==read.rname:
if readname in firstofpair:
printread=True
else:
firstofpair[readname]=read
else:
printread=True
elif not read.is_proper_pair and options.outputtype=="paired":
if options.fileformat=="pairedfastq":
if not read.mate_is_unmapped and read.mrnm==read.rname:
if readname in firstofpair:
printread=True
else:
firstofpair[readname]=read
else:
printread=True
elif not read.is_unmapped and options.outputtype=="minusdup":
if readname in remdups:
remdups.remove(readname)
elif not read.is_reverse and prevdup.pos==read.pos:
if prevdup.mate_is_unmapped and not read.mate_is_unmapped:
prevdup=read
remdups.add(prevname)
elif prevdup.mate_is_unmapped and not read.mate_is_unmapped:
remdups.add(readname)
continue
elif read.is_proper_pair and not prevdup.is_proper_pair:
if prevname in firstofpair:
del firstofpair[prevname]
remdups.add(prevname)
prevdup=read
elif prevdup.is_proper_pair and not read.is_proper_pair:
remdups.add(readname)
continue
elif read.mapq>=prevdup.mapq:
if prevname in firstofpair:
del firstofpair[prevname]
remdups.add(prevname)
prevdup=read
else:
remdups.add(readname)
continue
elif prevdup.pos!=read.pos or read.is_reverse:
if options.fileformat=="pairedfastq":
if not prevdup.mate_is_unmapped and prevdup.mrnm==read.rname:
if prevname in firstofpair:
printprev=True
prevtoprint=prevdup
else:
firstofpair[prevname]=prevdup
else:
printprev=True
prevdup=read
if printread:
if options.fileformat=="pairedfastq":
print_read_to_file(output, read, options.fileformat,sammate=firstofpair[readname],mateout=routput)
del firstofpair[readname]
else:
print_read_to_file(output, read, options.fileformat)
if printprev:
if options.fileformat=="pairedfastq":
print_read_to_file(output, prevtoprint, options.fileformat,sammate=firstofpair[prevname],mateout=routput)
del firstofpair[prevname]
else:
print_read_to_file(output, prevdup, options.fileformat)
#clean up
if options.fileformat=="pairedfastq":
output.close()
routput.close()
elif options.fileformat in ["fasta", "fastq"]:
output.close()
else:
output.close
samfile.close