-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathTNER_main.R
248 lines (212 loc) · 18.9 KB
/
TNER_main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#################################################################
# TNER2: Tri-Nucleotide Error Reducer 2
# Build cfDNA background polish model
# Using tri-nucleotide context data in a Bayesian model
# Last updated: 08/17/2019
# Contact: shibing.deng {at} pfizer.com & tao.xie {at} pfizer.com or xietao2000 {at} gmail.com
# Contributed by: Shibing Deng, Joy Hsu, Jadwiga Bienkowska, Paul Rejto and Tao Xie.
# TNER2 publication: (in review)
# Pfizer Early Clinical Development Biostatistics
# Pfizer Early Oncology Development and Clinical Research
# Copyright (c) 2019 Pfizer Inc.
# All rights reserved.
#################################################################
#################################################################
# Terms of Use for TNER2 (Tri-Nucleotide Error Reducer 2)
# Pfizer has attempted to provide accurate and current information on TNER2. However, Pfizer makes no representations or warranties that the information contained or published on TNER2 will be suitable for your specific purposes or for any other purposes. You agree to indemnify, defend, and hold Pfizer harmless from all claims, damages, liabilities and expenses, including without limitation reasonable attorney's fees and costs, whether or not a lawsuit or other proceeding is filed, that in any way arise out of or relate to your use of TNER2.
# ALL INFORMATION AND DATA PROVIDED ON TNER2 IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND. PFIZER MAKES NO REPRESENTATIONS OR WARRANTIES CONCERNING TNER2 OR ANY OTHER MATTER WHATSOEVER, INCLUDING WITHOUT LIMITATION ANY EXPRESS, IMPLIED OR STATUTORY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT OF THIRD PARTY RIGHTS, TITLE, ACCURACY, COMPLETENESS OR ARISING OUT OF COURSE OF CONDUCT OR TRADE CUSTOM OR USAGE, AND DISCLAIMS ALL SUCH EXPRESS, IMPLIED OR STATUTORY WARRANTIES. PFIZER MAKES NO WARRANTY OR REPRESENTATION THAT YOUR USE OF TNER2 WILL NOT INFRINGE UPON THE INTELLECTUAL PROPERTY OR OTHER RIGHTS OF ANY THIRD PARTY. FURTHER, PFIZER SHALL NOT BE LIABLE IN ANY MANNER WHATSOEVER FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, CONSEQUENTIAL OR EXEMPLARY DAMAGES ARISING OUT OF OR IN ANY WAY RELATED TO TNER2, THE USE OF, OR INABILITY TO USE, ANY OF THE INFORMATION OR DATA CONTAINED OR REFERENCED IN TNER2, OR ANY OTHER MATTER. THE FOREGOING EXCLUSIONS AND LIMITATIONS SHALL APPLY TO ALL CLAIMS AND ACTIONS OF ANY KIND AND ON ANY THEORY OF LIABILITY, WHETHER BASED ON CONTRACT, TORT OR ANY OTHER GROUNDS, AND REGARDLESS OF WHETHER A PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES, AND NOTWITHSTANDING ANY FAILURE OF ESSENTIAL PURPOSE OF ANY LIMITED REMEDY. BY USING TNER2, YOU FURTHER AGREE THAT EACH WARRANTY DISCLAIMER, EXCLUSION OF DAMAGES OR OTHER LIMITATION OF LIABILITY HEREIN IS INTENDED TO BE SEVERABLE AND INDEPENDENT OF THE OTHER CLAUSES OR SENTENCES BECAUSE THEY EACH REPRESENT SEPARATE ELEMENTS OF RISK ALLOCATION BETWEEN THE PARTIES.
# TNER2 may contain information from third party websites, which may or may not be marked with the name of the source. Such information does not necessarily represent the views or opinions of Pfizer, and Pfizer shall have no responsibility whatsoever for such information. All information from third party websites are the sole responsibility of the person or entity that provides and/or maintains such website. As a TNER2 user, you are solely responsible for any information that you display, generate, transmit or transfer while using TNER2, and for the consequences of such actions.
# Should any TNER2 user provide general, scientific or other feedback information, whether in the form of questions, comments, or suggestions to Pfizer, regarding the content of Pfizer's website or otherwise, such information shall not be deemed to be confidential or proprietary to you or to any other party. Pfizer shall have no obligation of any kind with respect to such information and Pfizer shall have the right, without limitation, to reproduce, use, disclose, merge, display, make derivatives of and distribute such information to others. Pfizer shall also have the right, without limitation, to use and exploit such information, including ideas, concepts, know-how, inventions, techniques or other materials contained in such information for any purpose whatsoever, including but not limited to, making changes or improvements to TNER2 and/or developing, manufacturing, marketing, selling or distributing products and technologies incorporating such information. However, you agree that Pfizer has no obligation whatsoever to respond to your comments or to change or correct any information on TNER2 based on your comments.
# Pfizer reserves the right to alter the content of TNER2 in any way, at any time and for any reason, with or without prior notice to you, and Pfizer will not be liable in any way for possible consequences of such changes or for inaccuracies, typographical errors or omissions in the contents hereof. Nothing contained herein shall be construed as conferring by implication, estoppel or otherwise any license or right under any patent, trademark or other intellectual property of Pfizer or any third party. Except as expressly provided above, nothing contained herein shall be construed as conferring any right or license under any Pfizer copyrights.
# Pfizer also reserves the right to modify these Terms of Use at any time and for any reason, with or without prior notice to you. You should always review the most current Terms of Use herein before using this website. By using TNER2, you agree to the current Terms of Use posted on this site. You also agree that these Terms of Use constitute the entire agreement between you and Pfizer regarding the subject matter hereof and supersede all prior and contemporaneous understandings, oral or written, regarding such subject matter. In addition, if any provision of these Terms of Use is found by a court of competent jurisdiction to be invalid, void or unenforceable, the remaining provisions shall remain in full force and effect, and the affected provision shall be revised so as to reflect the original intent of the parties hereunder to the maximum extent permitted by applicable law.
# BY USING TNER2, YOU ACKNOWLEDGE AND AGREE THAT YOU HAVE READ, UNDERSTOOD AND AGREE TO ALL OF THESE TERMS OF USE.
#################################################################
#################################################################
#################################################################
# Command line usage:
#################################################################
# Rscript TNER_R_script.R TNER_example_input_file.txt hs_ave_bg_error.csv hs_depth.csv hs_adj.csv
#################################################################
# Input:
# There can be up to four input parameters, the first one is required: the input file name (.csv format)
# The 2nd one is the average background error rate file calculated from healthy subjects,
# which is assumed to be "hs_ave_bg_error.csv", can be generated via "Create_background_error_rate.R".
# The third one is the coverage depth. It can be a file name or a number which will be
# used for all bases. The default is assumed to be "hs_depth.csv", can also be generated via "Create_background_error_rate.R".
# The last input parameter is only for TNER2 - it is the file with values for threshold adjustment based on known cancer TNC data derived froma decay function (described in a manuscript currently under review).
#################################################################
# Output:
# For each input file (or sample) the script will create an error polished frequency file,
# a file with detected mutants and a file with all noise (background error).
#################################################################
args = commandArgs(trailingOnly=TRUE) # 1-3 arguments from input
work.dir="." #working directory is where the input and output file stored
# Some optional parameters to tune
input.alpha = 0.01 #Polish strength parameter
input.filter = FALSE #turn on the filter to filter out those bases with abnormal data
# processing the input
#####################
tnt.rate.file=tnt.depth.file=NULL
if (length(args)<4) {
stop("Need four input parameters.", call.=FALSE)
}
input.file=args[1] # first argument is input file
tnt.rate.file=args[2] # Average bkgrd rate file
tnt.depth.file=args[3] # 3nt bkgrd rate file
tnt.adj.file=args[4] # TNER2 only, for cancer TNC adjustment
#the input file can be a single file (.txt, .freq, etc) or a list of files saved in a csv file
if (length(grep('csv',input.file))>0) input.pileup.file=unlist(read.csv(input.file,header=F,as.is=T)) else
input.pileup.file=input.file
# end of processing input
s=c("A","C","T","G") # four nucleotide letter
#read in the average background error and depth file
######################################################
hs.bkg.ave = read.csv(tnt.rate.file,row.names = 1)
if (!is.null(tnt.depth.file)) hs.depth=read.csv(tnt.depth.file,row.names = 1)
hs.adj = read.csv(tnt.adj.file,row.names = 1) # To adjust the background threshholds based on the known cancer mutation TNC
#create a Tri-nucleotide (tnt) code for each position
pileup1.data=read.table(file=input.pileup.file[1],header=T,check.names = F,as.is=T)
ref=toupper(pileup1.data$REF)
n=length(ref)
tnt=c("",paste0(ref[-((n-1):n)],ref[-c(1,n)],ref[-(1:2)]),"")
# remove the tnt code for the position at sequence gaps
x=pileup1.data$POS
x2=diff(x)
tnt[c(FALSE,x2!=1)]="" #remove tri-nucleutide after gap
tnt[c(x2!=1,FALSE)]="" #remove tri-nucleutide before gap
## Emperical Bayesian - a shrinkage approach
##################################################
#calculate the tri-nucleotide average and SD
tnt.bg.ave=aggregate(hs.bkg.ave,list(TNT=tnt),function(x) mean(x[x<.1],na.rm=T))
tnt.bg.sd=aggregate(hs.bkg.ave,list(TNT=tnt),function(x) sd(x[x<0.1],na.rm=T))
# match the tnt average to the input data
loc.tnt.ave=tnt.bg.ave[match(tnt,tnt.bg.ave$TNT),-1]
loc.tnt.sd=tnt.bg.sd[match(tnt,tnt.bg.sd$TNT),-1]
w=loc.tnt.ave/(loc.tnt.ave+hs.bkg.ave) #shrinkage weight parameter
w[is.na(w)]=1
# average mutation error rate
shr.bg.ave=w*loc.tnt.ave+(1-w)*hs.bkg.ave
#average depth matrix
if (exists("hs.depth")) depth.mat=matrix(rep(apply(hs.depth,1,mean),4),ncol=4) else
depth.mat=matrix(tnt.depth,nrow=nrow(pileup1.data),ncol=4)
# function to define BMER and call mutation for input file
polish.pred.3nt=function(bgrd.ave=shr.bg.ave, #average mutation rate data matrix used to define the background
polish.method="Binomial", # Background polish method: Poisson or Binomial
bgrd.depth=depth.mat, #average seq depth for background
pred.sample=file.path(data.dir,input.pileup.file[1]), #sample file name to be polished, should be a tab delimited pileup text file
alpha=input.alpha, #alpha = significance level, control polishing strength
filter=input.filter, #filter out those bases based on filtering criteria
polyclone=TRUE, # Output polyclone results, if FALSE, then only the base with the highest mutation rate is outputted
out.dir=NULL) #output directory. If null, polished data will be outputed to the same directory of input data with bgrm added to the file names
{ cat(paste("Polishing",pred.sample,"\n"))
pileup.data=read.table(file=pred.sample,header=T,check.names = F,as.is=T) #read in the data
out.pileup=pileup.data #save a copy for output
#below is not a critical step. It changes the column name of forward strand to cap letter: A+ to A; and
#reserse strand to lower case letter A+ to a.
if (length(grep("\\+",names(pileup.data)))>0) names(pileup.data)=gsub("\\+","",names(pileup.data))
if (length(grep("-",names(pileup.data)))>0) {
rev.base.col=grep("-",names(pileup.data)) # data column has reverse string pile up
names(pileup.data)[rev.base.col]=gsub("-","",tolower(names(pileup.data)[rev.base.col]))
}
#change all reference letter acgt to upper case
pileup.data$REF=toupper(pileup.data$REF)
fs=match(s,names(pileup.data)) # columns with forward strand
bs=match(tolower(s),names(pileup.data)) # columns with reverse strand
pileup.data$fs.sum=apply(pileup.data[,fs],1,sum) #sum read count of FS
pileup.data$bs.sum=apply(pileup.data[,bs],1,sum) #sum read count of RS
pileup.data$mut.rate1 = pileup.data$fs.sum/(pileup.data$R+pileup.data$fs.sum)*100 #FS mutation rate in %
pileup.data$mut.rate2 = pileup.data$bs.sum/(pileup.data$r+pileup.data$bs.sum)*100 #RS mutation rate in %
pileup.data$mut.rate = (pileup.data$fs.sum+pileup.data$bs.sum)/pileup.data$DEPTH*100 #Total mutation rate in %
pileup.data$mut.F = apply(pileup.data[,fs],1,function(x) names(which.max(x))) #FS mutation letter
pileup.data$mut.R = apply(pileup.data[,bs],1,function(x) toupper(names(which.max(x)))) #RS mutation letter
pileup.data$mut.B = apply(pileup.data[,fs]+pileup.data[,bs],1,function(x) names(which.max(x))) #overall (FS+RS) mutation letter
pileup.data$mut.rateF = apply(pileup.data[,fs],1,max)/(pileup.data$R+pileup.data$fs.sum)*100 #FS specifc mutation rate
pileup.data$mut.rateR = apply(pileup.data[,bs],1,max)/(pileup.data$r+pileup.data$bs.sum)*100 #RS specifc mutation rate
# Code below filter out the bases where mutation will not be called and mutation rate (mut.rate.adj) is set to 0.
if (filter) {
#remove mutation rate data for bases meeting the following criteria
pileup.data$mut.rate.adj=ifelse(
pileup.data$DEPTH<100 |
(pileup.data$R>=50 & pileup.data$r>=50 &
( (pileup.data$mut.F!=pileup.data$mut.R & (pileup.data$fs.sum<10 | pileup.data$bs.sum<10))| #added min of fs and bs <10 3/31/17
(pileup.data$mut.rate1<1 & pileup.data$mut.rate2>5) |
(pileup.data$mut.rate1>5 & pileup.data$mut.rate2<1) |
(pileup.data$mut.rate1==0 & pileup.data$mut.rate2>1.5) |
(pileup.data$mut.rate1>1.5 & pileup.data$mut.rate2==0) |
(pileup.data$mut.rate1>10*pileup.data$mut.rate2 & pileup.data$mut.rate2>0.1) |
(pileup.data$mut.rate2>10*pileup.data$mut.rate1 & pileup.data$mut.rate1>0.1) |
(pileup.data$mut.rateF<0.75*pileup.data$mut.rate1 & pileup.data$mut.rate1>1) |
(pileup.data$mut.rateR<0.75*pileup.data$mut.rate2 & pileup.data$mut.rate2>1))),0,pileup.data$mut.rate)
pileup.data$mut.rate.adj[pileup.data$R>10*pileup.data$r & pileup.data$R >=100 | pileup.data$r>10*pileup.data$R & pileup.data$r >=100]=0
} else pileup.data$mut.rate.adj=pileup.data$mut.rate
#set mutation rate to 0 if it is >10% (SNP) - removed in 7/10/17
#pileup.data$mut.rate.adj[pileup.data$mut.rate.adj>=10]=0
#sum FS and RS count for each base
d1= pileup.data[,c("A","a","C","c","T","t","G","g")]
d1.2 = sapply(s,function(x) rowSums(d1[,toupper(names(d1))==x]))
if (polish.method=="Binomial") d2=d1.2/pileup.data$DEPTH else d2=d1.2
#calculate the average count using input rate and depth
ave.read.count=bgrd.ave*bgrd.depth
#define background for poisson or binomial using upper CI limit
if (polish.method=="Poisson") {
d3=qchisq(1-alpha/2,2*ave.read.count+2)/2 } else {
d3=qbeta(1-alpha/2,as.matrix(ave.read.count)+1,as.matrix(bgrd.depth-ave.read.count))
}
#call mutation if observed data > background
d3=d3*hs.adj #apply the adjustement data from known cancer mutation TNC data to the model
pred=apply(d2>d3,1,sum)
if (polyclone) {pred.nts=apply(d2>d3,1,function(x) which(x>0))
clone.n=unlist(lapply(pred.nts,length))} else
{ d2.2=d2
d2.2[d2<d3]=0
pred.nts=apply(d2.2,1,function(x) ifelse(max(x)>0,which.max(x),0))
clone.n=(pred.nts>0)+0
}
pred[pileup.data$mut.rate.adj==0]=0 #for those bases to be filtered out, we do not call.
#pred[apply(d1.2,1,max)<5]=0 #for those bases whose total counts<5, we do not call. removed 7/10/17
#polished noise data output
polish.noise=cbind(out.pileup,noise.num=apply(d1.2>0,1,sum),freq=apply(d1.2,1,max)/out.pileup$DEPTH,threshold=d3[cbind(1:nrow(d3),apply(d1.2,1,which.max))])[apply(d1.2,1,sum)>0 & pred==0,]
#output the polished data
out.pileup[pred==0,7:14]=0 #hard coded columns, assume column 7:14 are the ACTG columns
out.pileup$DEPTH=apply(out.pileup[,5:14],1,sum) #re-calculate the depth after polish
#handling output file name and directory
#if input file name has directory path, remove the path
sample.name.loc=unlist(gregexpr("/",pred.sample))
if (sample.name.loc[1]!=-1) sample.name.loc=sample.name.loc[length(sample.name.loc)]+1 else
sample.name.loc=1
out.file.name=substr(pred.sample,sample.name.loc,nchar(pred.sample))
out.file.name=gsub(".txt",".bgrm.txt",out.file.name)
if (!is.null(out.dir)) out.file=file.path(out.dir,out.file.name) else
out.file=out.file.name
write.table(out.pileup,file=out.file,sep="\t",quote=F,row.names=F)
#summary statistic for the predicted mutants
sum.data1=list()
for (j in 1:max(clone.n)) {
if (polyclone) pred.nt=rapply(pred.nts[clone.n>=j], function(x) x[j]) else pred.nt=pred.nts[pred.nts>0]
sum.data=out.pileup[clone.n>=j,]
sum.data$mutant=s[pred.nt]
sum.data$tnt=tnt[clone.n>=j]
sum.data$bkg.threshold=d3[clone.n>=j,,drop=F][cbind(1:length(pred.nt),pred.nt)]
sum.data$mut.freq= d2[clone.n>=j,,drop=F][cbind(1:length(pred.nt),pred.nt)]
sum.data$tnt[sum.data$tnt==""]="NA"
sum.data$tnt.ave.freq=bgrd.ave[clone.n>=j,,drop=F][cbind(1:length(pred.nt),pred.nt)] #average healthy subject tri-nuc count
if (polish.method=="Poisson") sum.data$zscore=qnorm(ppois(sum.data$mut.freq,sum.data$tnt.ave.freq,lower.tail = F,log.p=T),lower.tail = F,log.p=T) else
if (polish.method=="Binomial") sum.data$zscore=qnorm(pbinom(sum.data$mut.freq*10000,10000,sum.data$tnt.ave.freq,lower.tail = F,log.p=T),lower.tail = F,log.p=T)
sum.data1=rbind(sum.data1,sum.data)
}
write.table(polish.noise,file=sub(".bgrm.txt",".polished.noise.txt",out.file),sep="\t",quote=F,row.names=F)
return(pred)
}
# call the function to run each input sample
##############################################
n.sample=length(input.pileup.file)
pred3.all=matrix(0,ncol=n.sample,nrow=nrow(pileup1.data))
for (i in 1:n.sample) pred3.all[,i]=polish.pred.3nt(bgrd.ave = shr.bg.ave,
bgrd.depth=depth.mat,
pred.sample = input.pileup.file[i],
alpha=input.alpha,
filter=input.filter,
polyclone=TRUE,
out.dir=work.dir)
# The resulted data pred3.all is an indicator matrix for each position(row) and sample (column)
# 1=mutation, 0= no mutation