-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsparkCorrelations.py
67 lines (52 loc) · 1.7 KB
/
sparkCorrelations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
import os,sys,optparse
import numpy as np
from collections import Counter,defaultdict
usage = "usage: %prog [options]"
parser = optparse.OptionParser(usage=usage)
parser.add_option("-p", "--prefix", dest="prefix", default="CORR",
metavar="prefix", help="names of sets to be correlated", type="string")
ox, args = parser.parse_args()
print(ox, file=sys.stderr)
pref = ox.prefix.split(',')
# c is array of random 1,-1 with the number of rows
# equal to the number probands in quads
# and the number of columns equal to number of iterations (10000)
# it is precomputed and stored so that
# the same matrix is used for all chromosomes
gnpFn = pref[0] + '.npz'
gncFn = pref[1] +'.npz'
chFn = pref[1] + '.indv'
prFn = pref[0] + '.indv'
# they are in mo,fa,... order
parents = []
with open(prFn, 'r') as f:
for l in f:
parents.append( l.strip('\n\r') )
children = []
with open(chFn, 'r') as f:
for l in f:
if 'spark' in pref[1]:
children.append( famStruct[l.strip('\n\r')][2] )
else:
children.append(l.strip('\n\r'))
f = np.load(gnpFn, 'rb')
print("Keys: %s" % list(f.keys()), file=sys.stderr)
gnp = f.get('GN').astype(float)
print("reading parents npz:", file=sys.stderr)
print("gnp.shape", gnp.shape, file=sys.stderr)
f.close()
f = np.load(gncFn, 'rb')
print("Keys: %s" % list(f.keys()), file=sys.stderr)
gnc = f.get('GN').astype(float)
print("reading children npz:", file=sys.stderr)
print("gnc.shape", gnc.shape, file=sys.stderr)
f.close()
prs = gnp
chn = gnc
corr = np.corrcoef(chn)
id = np.where(corr > 0.95)
l = len(id[0])
idd = set([id[0][i] for i in range(l) if id[0][i] != id[1][i]])
for i in idd:
print(children[i])