-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathtoH5ad.py
70 lines (59 loc) · 1.97 KB
/
toH5ad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Requires the `scanpy` conda environment; activate it before running:
#   . '/home/zouyang/anaconda3/etc/profile.d/conda.sh'
#   conda activate scanpy
import sys

# Command line: <expression> <meta> <layout> <output.h5ad>
# Fail early with a usage message rather than an opaque IndexError when an
# argument is missing. sys.exit() with a string prints it to stderr and exits
# with status 1, so the script still reports failure to the caller.
if len(sys.argv) < 5:
    sys.exit(
        "Usage: %s <expression: h5|csv|tsv> <meta: csv|tsv> "
        "<layout: csv|tsv> <output: h5ad>" % sys.argv[0]
    )
strExp = sys.argv[1]     # expression matrix: h5, csv or tsv file
strMeta = sys.argv[2]    # per-cell metadata: csv or tsv file
strLayout = sys.argv[3]  # embedding coordinates: csv or tsv file
strH5ad = sys.argv[4]    # output h5ad path
import pandas as pd
import anndata as annD
from scipy.sparse import csc_matrix
## Expression -----
# Build the AnnData object `D` from the expression file, picking the reader
# from the file name suffix.
if strExp.endswith('h5'):
    # HDF5 input: the matrix is read from the "Exp" key.
    D = annD.read_hdf(strExp, "Exp")
    # Each name is decoded as UTF-8, i.e. the names are expected to arrive
    # as bytes here -- NOTE(review): confirm for the anndata version in use.
    D.obs_names = pd.Index([raw.decode('utf-8') for raw in D.obs_names])
    D.var_names = pd.Index([raw.decode('utf-8') for raw in D.var_names])
elif strExp.endswith(('tsv', 'csv')):
    # Delimited text: tab for tsv, comma for csv. The first column supplies
    # the row names, and the table is transposed right after reading.
    sep = "\t" if strExp.endswith('tsv') else ","
    D = annD.read_text(strExp, delimiter=sep, first_column_names=True).transpose()
else:
    raise Exception('FileFormatError: %s' % strExp)
## meta -----
# Read the per-cell metadata table; its first column becomes the index and is
# matched against the cell names of `D`.
if strMeta.endswith("csv"):
    meta = pd.read_csv(strMeta, index_col=0)
elif strMeta.endswith('tsv'):
    meta = pd.read_table(strMeta, index_col=0)
else:
    raise Exception('FileFormatError: %s' % strMeta)

# Align the metadata to the cells of `D` before attaching it. A plain
# concat(axis=1) outer-joins the two indexes, so any metadata row for a cell
# that is not in `D` would lengthen the frame and make the assignment to
# `D.obs` fail; the resulting row order would also depend on pandas' union
# behaviour. Reindexing keeps exactly D's cells in D's order; cells without
# metadata get NaN, as before.
if not meta.index.equals(D.obs.index):
    if not meta.index.isin(D.obs.index).any():
        raise Exception('No cell in %s matches the expression data!' % strMeta)
    meta = meta.reindex(D.obs.index)
D.obs = pd.concat([D.obs, meta], axis=1)

# Turn every annotation into a categorical, except:
#   * float columns, which are left as they are;
#   * integer columns, unless the column name contains 'clust'.
for col in D.obs.columns:
    dtypeName = str(D.obs[col].dtypes)
    if 'float' in dtypeName:
        continue
    # str(col) keeps the substring test valid for non-string column labels.
    if 'int' in dtypeName and 'clust' not in str(col):
        continue
    D.obs[col] = D.obs[col].astype('category')
## layout ----------
# Read the embedding coordinates; the first column becomes the index and is
# matched against the cell names of `D`.
if strLayout.endswith("csv"):
    layout = pd.read_csv(strLayout, index_col=0)
elif strLayout.endswith('tsv'):
    layout = pd.read_table(strLayout, index_col=0)
else:
    raise Exception('FileFormatError: %s' % strLayout)

# Always align layout rows to D's cells by ID. Comparing only the row counts
# is not enough: a layout with the same number of rows but a different cell
# order would be stored positionally and attach coordinates to the wrong
# cells. The boolean mask keeps D's original cell order (a set intersection
# would make the order arbitrary).
if not layout.index.equals(D.obs_names):
    inLayout = D.obs_names.isin(layout.index)
    if not inLayout.any():
        raise Exception('Overlap cell is NONE!')
    if not inLayout.all():
        # Keep only the cells that have coordinates; copy so that the writes
        # below go to a real AnnData object rather than a view.
        D = D[inLayout].copy()
    layout = layout.loc[D.obs_names, :]

# Columns are consumed as consecutive (x, y) pairs; reject an odd count up
# front instead of failing with an IndexError after some pairs were stored.
if layout.shape[1] % 2 != 0:
    raise Exception(
        'Layout needs an even number of coordinate columns, got %d' % layout.shape[1]
    )
lnames = list(layout.columns)
for i in range(0, layout.shape[1], 2):
    # The obsm key is 'X_' plus the first column's name with its last
    # '_'-separated token removed (e.g. a column 'umap_1' gives 'X_umap').
    D.obsm['X_%s' % lnames[i].rsplit("_", 1)[0]] = layout.iloc[:, [i, i + 1]].values
## CSC sparse matrix
# Convert the expression matrix to compressed sparse column form, write the
# AnnData object to the requested path and report completion.
sparseX = csc_matrix(D.X)
D.X = sparseX
D.write(strH5ad)
print(f"{strH5ad} was created successfully!")