-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathpaired_nuc_freqs_from_bam.py
executable file
·178 lines (145 loc) · 5.47 KB
/
paired_nuc_freqs_from_bam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python
"""Get freqs for nucleotide pairs (present per read) at given
positions."""
#--- standard library imports
#
import sys
import os
import logging
import argparse
#--- third-party imports
#
import pysam
#--- project specific imports
#
# /
__author__ = "Andreas Wilm"
__version__ = "0.1"
__email__ = "andreas.wilm@gmail.com"
__license__ = "The MIT License (MIT)"
# global logger
# http://docs.python.org/library/logging.html
LOG = logging.getLogger("")
logging.basicConfig(level=logging.WARN,
format='%(levelname)s [%(asctime)s]: %(message)s')
def cmdline_parser():
"""
creates an argparse instance
"""
# http://docs.python.org/library/optparse.html
parser = argparse.ArgumentParser(
description=__doc__)
parser.add_argument("--verbose",
dest="verbose",
action="store_true",
help=argparse.SUPPRESS) #"be verbose")
parser.add_argument("--debug",
dest="debug",
action="store_true",
help=argparse.SUPPRESS) #"debugging")
parser.add_argument("-1", "--pos1",
dest="pos1",
type=int,
required=True,
help="First position")
parser.add_argument("-2", "--pos2",
dest="pos2",
type=int,
required=True,
help="Second position")
parser.add_argument("-r", "--ref",
dest="ref",
required=True,
help="Mapping/reference sequence/chromosome name")
parser.add_argument("-b", "--bam",
dest="bam",
required=True,
help="Mapping input file (BAM)")
parser.add_argument("--fasta",
dest="fasta",
help="Print corresponding reference nucleotides"
" region from this fasta file as well")
return parser
def main():
"""
The main function
"""
parser = cmdline_parser()
args = parser.parse_args()
if args.verbose:
LOG.setLevel(logging.INFO)
if args.debug:
LOG.setLevel(logging.DEBUG)
if args.pos1 >= args.pos2:
LOG.fatal("First position must be smaller than second")
parser.print_usage(sys.stderr)
# file check
if not os.path.exists(args.bam):
LOG.fatal("file '%s' does not exist.\n" % args.bam)
sys.exit(1)
if args.fasta and not os.path.exists(args.fasta):
LOG.fatal("file '%s' does not exist.\n" % args.fasta)
sys.exit(1)
pos_pair = (args.pos1-1, args.pos2-1)
sam = pysam.Samfile(args.bam, "rb")
if args.fasta:
fastafile = pysam.Fastafile(args.fasta)
refregion = fastafile.fetch(args.ref, pos_pair[0], pos_pair[1]+1)
print "Ref. region: %s" % refregion
# initialize counts to valid nucleotide combinations even though
# not strictly necessary, since we deal with non existing values
# later.
#
VALID_NUCS = 'ACGTN'
counts = dict()
for n1 in VALID_NUCS:
# counting twice. booooooh
for n2 in VALID_NUCS:
counts["%c%c" % (n1, n2)] = 0
num_dups = 0
num_anomalous = 0
for alnread in sam.fetch(args.ref, pos_pair[0], pos_pair[1]+1):
if alnread.is_duplicate:
num_dups += 1
continue
assert not alnread.is_unmapped # shouldn't be possible anyway
if alnread.is_paired and not alnread.is_proper_pair:
num_anomalous += 1
continue
#region = list(sam.fetch("gi|158976983|ref|NC_001474.2|", 10308, 10411))
#[(i, str(r)) for (i, r) in enumerate(region) if r.cigar != [(0, 51)]]
# create a map of ref position (string as key) and the corresponding query
# (clipped read) nucleotides (value)
pos_nt_map = dict([(str(rpos), alnread.query[qpos])
for (qpos, rpos) in alnread.aligned_pairs
if qpos and rpos])
try:
nt5 = pos_nt_map[str(pos_pair[0])]
nt3 = pos_nt_map[str(pos_pair[1])]
except KeyError:
#print "read not fully overlapping both positions (want %d-%d, have %d-%d, cigar %s): %s" % (
# pos_pair[0]+1, pos_pair[1]+1,
# alnread.aend-alnread.alen, alnread.aend,
# alnread.cigar, alnread.positions)
continue
key = "%c%c" % (nt5, nt3)
counts[key] = counts.get(key, 0) + 1
#if key == 'AT':
# import pdb; pdb.set_trace()
# we've initialized counts but be paranoid about existance of key anyway
if num_dups:
print "Ignored %d reads flagged as duplicates" % num_dups
if num_anomalous:
print "Ignored %d reads flagged as paired but not in proper pair" % num_anomalous
counts_sum = sum(counts.values())
print "%d (non-dup)reads overlapped both given positions %d and %d" % (
counts_sum, pos_pair[0]+1, pos_pair[1]+1)
if counts_sum == 0:
sys.exit(0)
for k in sorted(counts.keys()):
if counts[k]:
print "%s %d %.4f" % (k, counts[k], counts[k]/float(counts_sum))
if __name__ == "__main__":
main()
LOG.warn("You might want to use the more powerful joined_freqs_from_bam.py instead")
LOG.info("Successful exit")