-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtcga_ranksum_rm50kb.py
40 lines (35 loc) · 1.84 KB
/
tcga_ranksum_rm50kb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
'''wilcoxon by cancer type'''
import argparse
import pandas as pd
import scipy.stats as ss
def get_args():
'''
Loads the parser
'''
# Main parser
parser = argparse.ArgumentParser(description="wilcoxon by cancer type")
# Args
required = parser.add_argument_group("Required input parameters")
# Metadata from input table
required.add_argument('-d', '--disease_type', required=True, help='Disease type.')
return parser.parse_args()
if __name__ == '__main__':
args = get_args()
disease = args.disease_type
distance_table=pd.read_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/final_dis_56268x8990.csv',index_col=0)
distance_table0=distance_table.reset_index()
distance_table1=distance_table0.drop(['index'], axis=1)
part_expr=pd.read_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/new_norm_exp.csv',index_col=0)
expr=pd.concat([distance_table1[['barcode','project_id']],part_expr],axis=1)
sv_table=pd.read_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/remove_50kb/remove_50kb_svdistance_8990x56268.csv')
disease_sv=sv_table[sv_table.project_id == '{}'.format(disease)]
disease_expr=expr[expr.project_id == '{}'.format(disease)]
gene_list=list(expr.columns)[2:]
table = pd.DataFrame()
for j in gene_list:
print(j)
smaller=disease_expr[disease_sv[j] < 100000][j].astype(float).dropna()
bigger=disease_expr[(disease_sv[j] >= 100000) | disease_sv[j].isnull()][j].astype(float).dropna()
results = ss.ranksums(smaller, bigger)
table = table.append({'cancer_type':'{}'.format(disease),'gene':j, 'pvalue':results[1]/2,'z_statistic':results[0],'samplesize_sv':len(smaller), 'samplesize_wosv':len(bigger)}, ignore_index=True)
table.to_csv('/gpfs/data/lyang-lab/users/fan/breakpoint_tcga/remove_50kb/tcga_ranksum_{}_rm50kb.csv'.format(disease))