-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmodelbuilder
executable file
·114 lines (100 loc) · 3.25 KB
/
modelbuilder
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import argparse
import glob
import json
import math
import os
import openturns as ot # conda install -y openturns
from grimoire.genome import Reader
import isoform
def create_length_model(seqs, lmin, lmax):
# get the lengths
lens = [len(seq) for seq in seqs]
# train Frechet
sample = ot.Sample([[x] for x in lens if x < lmax])
f = ot.FrechetFactory().buildAsFrechet(sample)
a = f.getAlpha()
b = f.getBeta()
g = f.getGamma()
# create histogram from frechet
pdf = []
for x in range(lmax):
if x < g: pdf.append(0)
else:
z = (x-g)/b
pdf.append((a/b) * z**(-1-a) * math.exp(-z**-a))
# create leading zeros and rescale
for i in range(lmin): pdf[i] = 0
total = sum(pdf)
for i in range(len(pdf)): pdf[i] /= total
return pdf
#######
# CLI #
#######
parser = argparse.ArgumentParser(description='splice model builder')
parser.add_argument('dir', help='path to directory of genes (e.g. smallgenes)')
parser.add_argument('name', help='name of the model (e.g. worm)')
parser.add_argument('out', help='save sub-models to directory (e.g. models)')
parser.add_argument('--don', type=int, default=5, help="[%(default)i]")
parser.add_argument('--acc', type=int, default=6, help="[%(default)i]")
parser.add_argument('--emm', type=int, default=3, help="[%(default)i]")
parser.add_argument('--emin', type=int, default=25, help="[%(default)i]")
parser.add_argument('--emax', type=int, default=1000, help="[%(default)i]")
parser.add_argument('--imm', type=int, default=3, help="[%(default)i]")
parser.add_argument('--imin', type=int, default=35, help="[%(default)i]")
parser.add_argument('--imax', type=int, default=1000, help="[%(default)i]")
arg = parser.parse_args()
## collect subsequences
accs = []
dons = []
exons = []
introns = []
exonsum = 0
for ff in glob.glob(f'{arg.dir}/*.fa'):
gf = ff[:-2] + 'gff3'
genome = Reader(gff=gf, fasta=ff)
tx = next(genome).ftable.build_genes()[0].transcripts()[0]
for f in tx.exons:
exons.append(f.seq_str())
exonsum += f.end - f.beg + 1
for f in tx.introns:
iseq = f.seq_str()
dons.append(iseq[0:arg.don])
accs.append(iseq[-arg.acc:])
introns.append(iseq)
## build models
inf = len(introns) / exonsum
acc = isoform.create_pwm(accs)
don = isoform.create_pwm(dons)
exs = isoform.create_markov(exons, arg.emm, 0, 0)
ins = isoform.create_markov(introns, arg.imm, arg.don, arg.acc)
exl = create_length_model(exons, arg.emin, arg.emax)
inl = create_length_model(introns, arg.imin, arg.imax)
## write individual models (hard-coded names)
facc = f'{arg.out}/acc.pwm'
fdon = f'{arg.out}/don.pwm'
fexs = f'{arg.out}/exon.mm'
fins = f'{arg.out}/intron.mm'
fexl = f'{arg.out}/exon.len'
finl = f'{arg.out}/intron.len'
finf = f'{arg.out}/intron.freq'
if not os.path.exists(arg.out): os.makedirs(arg.out)
isoform.write_pwm(facc, acc)
isoform.write_pwm(fdon, don)
isoform.write_markov(fexs, exs)
isoform.write_markov(fins, ins)
isoform.write_len(fexl, exl)
isoform.write_len(finl, inl)
with open(finf, 'w') as fp: print(inf, file=fp)
# write ghmm
ghmm = {
'name': arg.name,
'inf': isoform.prob2score(inf),
'acc': isoform.read_pwm(facc),
'don': isoform.read_pwm(fdon),
'exs': isoform.read_markov(fexs),
'ins': isoform.read_markov(fins),
'exl': isoform.read_len(fexl),
'inl': isoform.read_len(finl),
}
print(json.dumps(ghmm, indent=1))