-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindelConverter.py
154 lines (141 loc) · 6.67 KB
/
indelConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gzip
import sys
import docker
import argparse
from textwrap import dedent
def variantIndelConverter(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt, to_dash):
if alt == '*':
alt = '-'
if ref == '*':
ref = '-'
if to_dash is False: # input with '-'
if len(ref) != len(alt) and ref != '-' and alt != '-':
return chromosome, start, end, ref, alt
if ref != '-' and alt != '-' and len(ref) == 1 and len(alt) == 1: # SNV
return chromosome, start, end, ref, alt
elif ref == '-': # INS
return dash_to_noDash_INS(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt)
elif alt == '-': # DEL
return dash_to_noDash_DEL(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt)
elif to_dash is True: # input without '-'
if len(ref) == 1 and len(alt) == 1: # SNV
return chromosome, start, end, ref, alt
elif len(ref) < len(alt): # INS
return noDash_to_dash_INS(chromosome, start, end, ref, alt)
elif len(ref) > len(alt): # DEL
return noDash_to_dash_DEL(chromosome, start, end, ref, alt)
return None
def getNucleotide(client, volumes, container, referenceGenome, chromosome, start, end):
if chromosome.startswith('chr') is False:
chromosome = 'chr' + chromosome
logs = container.exec_run("samtools faidx /ref/%s %s:%s-%s" %(referenceGenome, chromosome, start, end), stdout=True, stderr=True, stream=True)
try:
res = next(logs).decode('utf-8').split()
except: # no result from referenceGenome
return None
if len(res) == 2:
return res[1].upper()
return None
def dash_to_noDash_INS(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt):
""" chr1 13417 13417 - GAGA to chr1 13417 13417 C CGAGA (require fetch from reference)"""
nt = getNucleotide(client, volumes, container, referenceGenome, chromosome, start, start)
if nt is not None:
return chromosome, start, end, nt, nt + alt
return chromosome, start, end, ref, alt
def dash_to_noDash_DEL(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt):
""" chr1 10145 10145 A - to chr1 10144 10145 TA T (require fetch from reference)"""
nt = getNucleotide(client, volumes, container, referenceGenome, chromosome, start - 1, start - 1)
if nt is not None:
return chromosome, start - 1, end, nt + ref, nt
return chromosome, start, end, ref, alt
def noDash_to_dash_INS(chromosome, start, end, ref, alt):
""" chr1 13417 13417 C CGAGA to chr1 13417 13417 - GAGA """
return chromosome, start, end, '-', alt[1:]
def noDash_to_dash_DEL(chromosome, start, end, ref, alt):
""" chr1 10144 10145 TA T to chr1 10145 10145 A - """
return chromosome, start + 1, end, ref[1:], '-'
def str2bool(v):
if isinstance(v, bool):
return v
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Boolean value expected.')
if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=dedent("""\
Testing environment: Python 3
Require inputs:
1. in_file
2. out_file
3. in_reference
4. type (txt, vcf)
5. to_dash (ex. chr1 99 99 T TA to chr1 100 100 - A)
Usage:
- python3 indelConverter.py --in_file data/dash.txt --in_reference ucsc.hg19.fasta --out_file data/out_dash.txt --type txt --to_dash false
"""))
required = parser.add_argument_group('required arguments')
required.add_argument('--in_file', type=str, help='input file')
required.add_argument('--out_file', type=str, help='output file')
required.add_argument('--in_reference', type=str, help='input corresponding reference fasta')
required.add_argument('--type', type=str, help='input file type')
required.add_argument('--to_dash', type=str2bool, help="if true, convert indel in input file to format with '-', else on the contrary")
args = parser.parse_args()
in_reference_path = '/'.join(args.in_reference.split('/')[:-1]) if '/' in args.in_reference else '$PWD'
referenceGenome = args.in_reference.split('/')[-1]
client = docker.from_env()
volumes = {
in_reference_path: {'bind': '/ref', 'mode': 'rw'}
}
print('new a container,', volumes)
container = client.containers.run(image='biocontainers/samtools:1.3.1',
command="/bin/bash",
volumes=volumes,
tty=True,
stdin_open=True,
detach=True)
container.start()
out = open(args.out_file, 'w')
f = gzip.open(args.in_file, 'rb') if args.in_file.endswith('.gz') else open(args.in_file)
n = 0
for line in f:
if args.in_file.endswith('.gz'):
line = line.decode('utf-8')
if line.startswith('#'):
out.write(line)
continue
sep = line.strip('\n').split('\t')
try:
chromosome = sep[0]
ref = sep[3]
alt = sep[4]
others = sep[5:]
if args.type == 'txt':
start = int(sep[1])
end = int(sep[2])
elif args.type == 'vcf':
start = int(sep[1])
end = start + len(ref) - 1
# print('+', '\t'.join([chromosome, str(start), str(end), ref, alt]))
chromosome, start, end, ref, alt = variantIndelConverter(client, volumes, container, referenceGenome, chromosome, start, end, ref, alt, args.to_dash)
# print('-', '\t'.join([chromosome, str(start), str(end), ref, alt]), '\n')
out.write('%s\t%s\t%s\t%s\t%s\t%s\n' %(chromosome, str(start), str(end), ref, alt, '\t'.join(others)))
except Exception as e:
print(e)
out.write(line)
if n != 0 and n % 3000000 == 0:
container.stop()
container.remove()
container = client.containers.run(image='biocontainers/samtools:1.3.1',
command="/bin/bash",
volumes=volumes,
tty=True,
stdin_open=True,
detach=True)
container.start()
print('new a container,', volumes)
n += 1
print('processed %s variants' %n)
container.stop()
container.remove()