-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathwiq.py
271 lines (245 loc) · 12.7 KB
/
wiq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/env python
import os,sys,re,argparse
from wig import Wig
from glob import glob
from time import time
from functions import div
def rawsort(ifile,sort_ofile,gfile,format,step=10,suppress=False,buffer=None):
tm=time()
if format=='wig':
print('\nconverting',ifile,'...')
raw_ofile=sort_ofile[:-3]+'raw.wiq'
wg=Wig(file=ifile,gfile=gfile,step=step,suppress=suppress)
wg.ajust_size(gfile=gfile)
wg.save(file=raw_ofile,format="wiq",step=step,suppress=suppress)
print('time cost:',time()-tm)
tm=time()
else:raw_ofile=ifile
print('\nsorting',raw_ofile,'...')
temp=ifile[:-3]+'temp'
while os.path.isdir(temp):temp=temp+'.temp'
os.mkdir(temp)
if buffer!=None:cmd='sort -r -n -s -k1 -k2 -o '+sort_ofile+' --buffer-size '+str(buffer)+' --temporary-directory '+str(temp)+' '+raw_ofile
else:cmd='sort -r -n -s -k1 -k2 -o '+sort_ofile+' --temporary-directory '+str(temp)+' '+raw_ofile
os.system(cmd)
if format=='wig':
print('Removing ',raw_ofile,'...')
os.system('rm '+raw_ofile)
print('removing',temp)
os.system('rm '+str(temp)+' -r')
print('time cost:',time()-tm)
def refquantile(paths,ofile,gfile):
print('\nPreparing reference ...')
tm=time()
files=paths.split(':')
fi={}
fo=open(ofile,'w')
for file in files:fi[file]=open(file)
nfile=len(files)
for line in fi[files[0]]:
col=line.split()
v=float(col[0])
for file in files[1:]:
add_line=fi[file].readline()
v+=float(add_line.split()[0])
fo.write(str(div(v,nfile))+'\t-\t-\t-\n')
fo.close()
print('time cost:',time()-tm)
def changevalue(ifile,ref,ofile,gfile,step=10,suppress=False,buffer=None):
from random import randint
if ifile!=ref:print('\nnormalizing',ifile,'...')
else:print('\nsaving reference ...')
tm=time()
fi,fr=open(ifile),open(ref)
wg=Wig(step=step,gfile=gfile)
for line in fi:
col=line.split()
rcol=fr.readline().split()
if len(rcol)==0:rcol=[0.0]
cr,pos,vl=col[2],div(int(col[3]),step),float(rcol[0])
wg.data[cr][pos]=vl
n=0
for line in fr:n+=1
if n>0:print('Warning: the input genome size is smaller than the reference genome size by',n,'wiggle steps!')
wg.save(ofile,suppress=suppress)
print('time cost:',time()-tm)
def qnorwig(paths,gfile,out_dir='wiq_result',ref=None,iformat='wig',rformat='wig',isorted=False,rsorted=False,step=10,suppress=False,buffer=1000000,format_only=False):
#buffer unit is 1024
buffer=int(buffer*1000000)
if isorted==1:isorted=True
else:isorted=False
if rsorted==1:rsorted=True
else:rsorted=False
alltime=time()
if not os.path.isdir(out_dir):os.mkdir(out_dir)
files,sfiles={},{}
for path in paths.split(':'):
if os.path.isfile(path):files[path]=1#.append(path)
elif os.path.isdir(path):
if iformat=='wig':
for file in glob(os.path.join(path,'*wig')):files[file]=1
if iformat=='wiq':
for file in glob(os.path.join(path,'*wiq')):files[file]=1
for ifile in files:
opath=re.sub('/','_',ifile)#'_'.join(elems)
while opath[0]=='.' or opath[0]=='_' or opath[0]=='/':opath=opath[1:]
opath=os.path.join(out_dir,opath)
if (iformat=='wig') or (isorted==False):
sfiles[opath[:-3]+'sort.wiq']=opath[:-3]+'qnor.wig'
rawsort(ifile=ifile,sort_ofile=opath[:-3]+'sort.wiq',gfile=gfile,format=iformat,step=step,suppress=suppress,buffer=buffer)
else:sfiles[ifile]=opath[:-3]+'qnor.wig'
if format_only:
print('job done, total time cost:',time()-alltime,'\n')
return
if ref==None:
tnb=0
for c in paths:tnb+=ord(c)
refpath=os.path.join(out_dir,'reference.'+str(tnb)+'.sort.wiq')
refquantile(paths=':'.join(list(sfiles.keys())),ofile=refpath,gfile=gfile)
else:
elems=os.path.split(ref)
opath=os.path.join(out_dir,elems[-1])
if rformat=='wig' or rsorted==False:
refpath=opath[:-3]+'sort.wiq'
rawsort(ifile=ref,sort_ofile=opath[:-3]+'sort.wiq',gfile=gfile,format=rformat,step=step,suppress=suppress,buffer=buffer)
else:refpath=ref
for sfile in sfiles:
changevalue(ifile=sfile,ref=refpath,ofile=sfiles[sfile],gfile=gfile,step=step,suppress=suppress)
#if ref==None:changevalue(ifile=refpath,ref=refpath,ofile=refpath[:-8]+'qnor.wig',gfile=gfile,step=step,suppress=suppress)
print('job done, total time cost:',time()-alltime,'\n')
def wig2wiq(command='wiq'):
'''
Description:
This function parses input parameters, calls and passes all parameters values to the function qnorwig, or print some help messages if required.
parameters:
none
'''
if (len(sys.argv)<2) and ('-h' not in sys.argv) and ('--help' not in sys.argv):
# at least two parameter need to be specified, will print help message if no parameter is specified
print("\nusage:\npython danpos.py wig2wiq <chr_file> <file_path> [optional arguments]\n\nfor more help, please try: python danpos wig2wiq -h\n")
return 0
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,\
usage="\npython danpos.py <command> <chr_file> <file_path>[optional arguments]\n\n",\
description='',epilog="Kaifu Chen, et al. chenkaifu@gmail.com, \
Li lab, Biostatistics department, Dan L. Duncan cancer center, \
Baylor College of Medicine.")
parser.add_argument('command',default=None,\
help="set as 'wig2wiq' to convert wiggle format to wiq format.")
parser.add_argument('chr_file',default=None,\
help="a text file with each line describing a chrosome length, e.g. 'chr1 10043'")
parser.add_argument('input_paths',default=None,\
help="paths to the wiggle format input files.\
Each path could point to a file or a directory that contains the files. use ':'\
to seperate paths, e.g. 'Path_A:path_B")
parser.add_argument('--out_dir', dest="out_dir",metavar='',default='wig2wiq',\
help="a path to the output directory")
'''
parser.add_argument('--count', dest="count",metavar='',default=None,type=int,\
help="specify the total reads count, e.g. 10000000, so the reads count \
in each data set will be normalized to this number")
'''
parser.add_argument('--step', dest="step",metavar='',default=10,type=int,\
help="the step size in wiggle format data.")
parser.add_argument('--buffer_size', dest="buffer",metavar='',default=10,type=float,\
help="maximal memory size that can be used to sort file, e.g. set to 1 when need 1G memory.")
if '-h' in sys.argv or '--help' in sys.argv: # print help information once required by user
print('\n')
parser.print_help()
print('\n')
return 0
elif len(sys.argv)>=3: # at least two parameter need to be specified
try:
args=parser.parse_args() #all paramter values are now saved in args
except:
print("\nfor more help, please try: python danpos wig2wiq -h\n")
return 0
else:
print("\nfor help, please try: python danpos wig2wiq -h\n")
return 0
print('\ncommand:\npython'," ".join(sys.argv)) # print the command line, this let the user to keep a log and remember what parameters they specified
qnorwig(paths=args.input_paths,\
gfile=args.chr_file,\
out_dir=args.out_dir,\
#ref=None,\
#iformat=args.iformat,\
#rformat=args.rformat,\
#isorted=args.isorted,\
#rsorted=args.rsorted,\
step=args.step,\
suppress=False,\
buffer=args.buffer,\
format_only=True)
def wiq(command='wiq'):
'''
Description:
This function parses input parameters, calls and passes all parameters values to the function qnorwig, or print some help messages if required.
parameters:
none
'''
if (len(sys.argv)<2) and ('-h' not in sys.argv) and ('--help' not in sys.argv):
# at least two parameter need to be specified, will print help message if no parameter is specified
print("\nusage:\npython danpos.py wiq <chr_file> <file_path> [optional arguments]\n\nfor more help, please try: python danpos wiq -h\n")
return 0
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,\
usage="\npython danpos.py <command> <chr_file> <file_path>[optional arguments]\n\n",\
description='',epilog="Kaifu Chen, et al. chenkaifu@gmail.com, \
Li lab, Biostatistics department, Dan L. Duncan cancer center, \
Baylor College of Medicine.")
parser.add_argument('command',default=None,\
help="set as 'wiq' to do quantile normalization for wiggle format data.")
parser.add_argument('chr_file',default=None,\
help="a text file with each line describing a chrosome length, e.g. 'chr1 10043'")
parser.add_argument('input_paths',default=None,\
help="paths to the input files, each file shoule be in '.wig' or '.wiq' format.\
Each path could point to a file or a directory that contains the files. use ':'\
to seperate paths, e.g. 'Path_A:path_B")
parser.add_argument('--out_dir', dest="out_dir",metavar='',default='wiq_result',\
help="a path to the output directory")
'''
parser.add_argument('--count', dest="count",metavar='',default=0,type=int,\
help="specify the total reads count, e.g. 10000000, so the reads count \
in each data set will be normalized to this number")
'''
parser.add_argument('--reference', dest="ref",metavar='',default=None,\
help="a path to the file containing reference data. When reference data is available, \
data in each input file will be normalized to have the same quantiles in reference data")
parser.add_argument('--iformat', dest="iformat",metavar='',default='wig',\
help="the data format in input files, can be 'wig' or 'wiq'")
parser.add_argument('--rformat', dest="rformat",metavar='',default='wig',\
help="the data format in reference file, can be 'wig' or 'wiq'")
parser.add_argument('--isorted', dest="isorted",metavar='',default=0,type=int,\
help="set to 1 if the input file in wiq format and sorted.")
parser.add_argument('--rsorted', dest="rsorted",metavar='',default=0,type=int,\
help="set to 1 if the reference file is in wiq format and sorted.")
parser.add_argument('--step', dest="step",metavar='',default=10,type=int,\
help="the step size in wiggle format data.")
parser.add_argument('--buffer_size', dest="buffer",metavar='',default=10,type=float,\
help="maximal memory size that can be used to sort file, e.g. set to 1 when need 1G memory.")
if '-h' in sys.argv or '--help' in sys.argv: # print help information once required by user
print('\n')
parser.print_help()
print('\n')
return 0
elif len(sys.argv)>=3: # at least two parameter need to be specified
try:
args=parser.parse_args() #all paramter values are now saved in args
except:
print("\nfor more help, please try: python danpos wiq -h\n")
return 0
else:
print("\nfor help, please try: python danpos wiq -h\n")
return 0
print('\ncommand:\npython'," ".join(sys.argv)) # print the command line, this let the user to keep a log and remember what parameters they specified
qnorwig(paths=args.input_paths,\
gfile=args.chr_file,\
out_dir=args.out_dir,\
ref=args.ref,\
iformat=args.iformat,\
rformat=args.rformat,\
isorted=args.isorted,\
rsorted=args.rsorted,\
step=args.step,\
suppress=False,\
buffer=args.buffer)
if __name__ == "__main__":
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # This allows to print each message on screen immediately.