-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgenerate_seqs.py
54 lines (47 loc) · 1.58 KB
/
generate_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
# encoding: utf-8
import hickle as hkl
import gzip
import numpy as np
from preprocess import *
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-e', dest='e', type=int, default=0)
args = parser.parse_args()
e = args.e
expnames = ['SNEDE0000EMT', 'SNEDE0000EPC', 'SNEDE0000EPH', 'SNEDE0000ENO', 'SNEDE0000EMU', 'SNEDE0000ENP']
name = expnames[e]
# path = '/home/xumin/spot/%s' % name
path = './spot/%s' % name
# the genome is a self-maintained UCSC hg19 genome hickle file,
# one can download chr[1-22,X,Y].fa.gz on http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/
genome = './temp/sequences.hkl'
seqpath1 = './data/%s_pos' % name
seqpath2 = './data/%s_neg' % name
print 'loading whole genome...'
genome = hkl.load(genome)
print 'loaded!'
f = gzip.open(path)
f_1 = open(seqpath1, 'w')
f_2 = open(seqpath2, 'w')
contents = f.readlines()
f.close()
print 'totally %d lines' % len(contents)
for i, line in enumerate(contents):
if i % 1000 == 0:
print '%d lines converted...' % i
values = line.split()
chrid, start, end = (values[0], int(values[1]), int(values[2]))
f_1.write(genome[chrid][start:end])
f_1.write('\n')
l = end - start
f_2.write(genome[chrid][start+l*5:end+l*5])
f_2.write('\n')
f_1.close()
f_2.close()
k=6
s=2
seq2ngram('./data/%s_pos' % name, k, s, './data/%s_pos_%dgram_%dstride' % (name, k, s))
seq2ngram('./data/%s_neg' % name, k, s, './data/%s_neg_%dgram_%dstride' % (name, k, s))
forglove('./data/%s_pos_%dgram_%dstride' % (name,k,s),
'./data/%s_%dgram_%dstride_oneline' % (name,k,s))