-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathgtf_to_gff.py
78 lines (58 loc) · 2.75 KB
/
gtf_to_gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
"""
Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].
Usage: python gtf_to_gff.py in.gtf > out.gff3
Requirement:
GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
helper.py: https://github.com/vipints/GFFtools-GX/blob/master/helper.py
Copyright (C)
2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
"""
import re
import sys
import helper
import GFFParser
def GFFWriter(gtf_content):
    """
    Print every parsed gene model to standard output as GFF3 lines.

    The '##gff-version 3' header is written first. Each gene then yields one
    'gene' line and, per transcript, a transcript line followed by its
    five_prime_UTR, CDS, three_prime_UTR and exon lines, in that order.

    @args gtf_content: Parsed object from gtf file
    @type gtf_content: numpy array
    """
    emit = sys.stdout.write
    emit('##gff-version 3\n')

    for gene in gtf_content:
        contig = gene['chr']
        orientation = gene['strand']
        gene_start = gene['start']
        gene_stop = gene['stop']
        origin = gene['source']
        gene_id = gene['name']
        # an empty Name falls back to the gene identifier
        gene_label = gene['gene_info']['Name'] or gene_id

        emit('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s\n' % (contig, origin, gene_start, gene_stop, orientation, gene_id, gene_label))

        for pos, transcript in enumerate(gene['transcripts']):
            exon_coords = gene['exons'][pos]
            # transcript span: first value of the first exon row to the
            # last value of the last exon row
            span_start = exon_coords[0][0]
            span_stop = exon_coords[-1][-1]
            feature_type = gene['transcript_type'][pos]

            # UTR segments are only derived when both the exon and the CDS
            # coordinates hold a non-zero value; otherwise none are written
            five_utr = []
            three_utr = []
            if exon_coords.any() and gene['cds_exons'][pos].any():
                five_utr, three_utr = helper.buildUTR(gene['cds_exons'][pos], exon_coords, orientation)

            transcript_id = transcript[0]
            emit('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s\n' % (contig, origin, feature_type, span_start, span_stop, orientation, transcript_id, gene_id))

            for segment in five_utr:
                emit('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (contig, origin, segment[0], segment[1], orientation, transcript_id))

            # the third column of each CDS row is written into the phase field
            for segment in gene['cds_exons'][pos]:
                emit('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s\n' % (contig, origin, segment[0], segment[1], orientation, segment[2], transcript_id))

            for segment in three_utr:
                emit('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (contig, origin, segment[0], segment[1], orientation, transcript_id))

            for segment in exon_coords:
                emit('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s\n' % (contig, origin, segment[0], segment[1], orientation, transcript_id))
def __main__():
    """
    Command line entry point: parse the GTF file named by the first
    argument and write it to standard output as GFF3.

    When no input file is given, the module usage text is printed and the
    program exits with status -1.
    """
    try:
        gtf_fname = sys.argv[1]
    except IndexError:
        # Only a missing argument is handled here; any other error should
        # surface instead of being hidden behind the usage text.
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(__doc__)
        sys.exit(-1)

    gtf_file_content = GFFParser.Parse(gtf_fname)
    GFFWriter(gtf_file_content)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()