-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMCMC_diagnostics_posterior.R
269 lines (255 loc) · 10.7 KB
/
MCMC_diagnostics_posterior.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#-------------------#
# CLEAN ENVIRONMENT #
#-------------------#
rm( list = ls( ) )
#-----------------------------------------------#
# LOAD PACKAGES, FUNCTIONS, AND SET ENVIRONMENT #
#-----------------------------------------------#
# This package lets you find automatically the path to a specific location
# in your file structure
# If you have not installed this package, you will need to install it.
# You can uncomment the following line to do this:
#install.packages( "rstudioapi" )
library( rstudioapi )
scripts_dir <- gsub( pattern = "scripts..*", replacement = "scripts/",
x = getActiveDocumentContext()$path )
setwd( scripts_dir )
# Load the file with all the functions used throughout this script
source( file = "../../../src/Functions.R" )
# Run in-house function to set home directory and output directory for ESS
# and convergence tests
# NOTE: This function will create a directory called `plots` and another called
# `ESS_and_chains_convergence` inside the `analyses` directory if you have
# not created them yet
home_dir <- set_homedir()$home
outchecks_dir <- set_homedir()$ESS
# By now, set the working directory to `home_dir`
setwd( home_dir )
#-------------------------------------------------------------#
# DEFINE GLOBAL VARIABLES -- modify according to your dataset #
#-------------------------------------------------------------#
# First, we will define the global variables that match the settings in our
# analysis.
# 1. Number of chains
num_chains <- 16
# 2. Number of divergence times that have been estimated. One trick to find
# this out quickly is to subtract 1 to the number of species. In this case,
# there are 246 taxa (246), so the number of internal nodes
# is `n_taxa-=246-1=245`.
# Another way to verify this is by opening the `mcmc.txt` file and check the
# header. The first element after `Gen` will have the format of `t_nX`, where
# X will be an integer (i.e., 247). Subtract two to this number
# (i.e., 247-2=245) and this will be your number of divergence times that are
# parameters of the MCMC. Please modify the number below so it fits to the
# dataset you are using.
num_divt <- 245
# 3. Number of samples that you specified in the `MCMCtree` control file to
# collect. NOTE: you may have not collect them all, but do not worry!
def_samples <- 20000
# 4. Quantile percentage that you want to set By default, the variable below is
# set to 0.975 so the 97.5% and 2.5% quantiles (i.e., 95%CI). If you want to
# change this, however, just modify the value.
perc <- 0.975
# 5. Load a semicolon-separated file with info about calibrated nodes. Note that
# this file is output by script `Merge_node_labels.R`. A summary of its content
# in case you are to generate your own input files:
#
# Each column needs to be separated with semicolons and an extra blank line
# after the last row with calibration information needs to be added. If the
# extra blank is not added, R will complain and will not load the file!
# If you add a header, please make sure you name the column elements as
# `Calib;node;Prior`. If not, the R function below will deal with the header,
# but make sure you set `head_avail = FALSE` when running `read_calib_f`
# function below. An example of the content of this file is given below:
#
# ```
# Calib;node;Prior
# ex_n5;5;ST(5.8300,0.0590,0.1120,109.1240)
# ex_n7;7;B(4.1200,4.5200,0.0250,0.0250)
#
# ```
#
# The first column will have the name of the calibration/s that can help you
# identify which node belongs to which calibration. The second column is the
# number given to this node by`MCMCtree` (this information is automatically
# found when you run the script `Merge_node_labels.R`, otherwise you will need
# to keep checking the output file `node_trees.tree` to figure out which node
# is which). The third column is the calibration used for that node in
# `MCMCtree` format.
#
# [[ NOTES ABOUT ALLOWED CALIBRATION FORMATS]]
#
# Soft-bound calibrations:
# E.g.1: A calibration with a minimum of 0.6 and a maximum of 0.8 would with
# the default tail probabilities would have the following equivalent
# formats:
# >> B(0.6,0.8) | B(0.6,0.8,0.025,0.025)
# E.g.2: A calibration with a minimum of 0.6 and a maximum of 0.8 would with
# the pL=0.001 and pU=0.025 would have the following format. Note that,
# whenever you want to modify either pL or pU, you need to write down
# the four parameters in the format of "B(min,max,pL,pU)":
# >> B(0.6,0.8,0.001,0.025)
#
# Lower-bound calibrations:
# E.g.1: A calibration with a minimum of 0.6 and the default parameters for
# p = 0.1, c = 1, pL = 0.025:
# >> L(0.6) | L(0.6,0.1,1,0.025)
# E.g.2: A calibration with a hard minimum at 0.6, and so pL = 1e-300.
# Note that, whenever you want to modify either pL or pU, you need to
# write down the four parameters in the format of "L(min,p,c,pL)":
# >> L(0.6,0.1,1,1e-300)
#
# Upper-bound calibrations:
# E.g.1: A calibration with a maximum of 0.8 and the default parameters for
# pU = 0.025:
# >> U(0.8) | U(0.8,0.025)
# E.g.2: A calibration with a hard maximum at 0.8, and so pU = 1e-300.
# Note that, if you want to modify pU, you need to write down the two
# parameters in the format of "U(max,pU)":
# >> U(0.8,1e-300)
#
# ST distributions:
# The format accepted has four parameters: xi (location, mean root age),
# omega (scale), alpha (shape), nu (df). Accepted format:
# >> ST(5.8300,0.0590,0.1120,109.1240)
#
# SN distributions:
# The format accepted has three parameters: xi (location, mean root age),
# omega (scale), alpha (shape). Accepted format:
# >> SN(5.8300,0.0590,0.1120)
#
#
# The next command executes the `read_calib_f` in-house function, which reads
# your input files (semicolon-separated files). The path to this directory is
# what the argument `main_dir` needs. The argument `f_names` requires the name
# of the file/s that you have used. Argument `dat` requires the same global
# object that you have created at the beginning of the script.
dat <- "LUCAdup"
calib_nodes <- read_calib_f( main_dir = paste( home_dir, "calib_files/",
sep = "" ),
f_names = "Calibnodes.csv",
dat = dat, head_avail = TRUE )
# 6. Number of columns in the `mcmc.txt` that are to be deleted as they do not
# correspond to sample values for divergence times (i.e., the entries are not
# names following the format `t_nX`). To figure out this number quickly, you
# can open the `mcmc.txt` file, read the header, and count the number of `mu*`
# and `sigma2*` elements. Do not count the `lnL` value when looking at
# `mcmc.txt` files generated when sampling from the posterior -- this is
# automatically accounted for in the in-house R functions that you will
# subsequently use. E.g., assuming an MCMC ran under a relaxed-clock model with
# no partitions, we would see `mu` and `sigma2` columns. Therefore, the variable
# would be set to `delcol_post <- 2`. Please modify the value/s below
# (depending on having one or more datasets) according to the `mcmc.txt` file
# generated when sampling from the posterior (`delcol_posterior`).
delcol_post <- 2 # There are two column: mu, sigma2
# 7. Path to the directory where the subdirectories where each chain ran for
# each dataset are saved In this case, we will have the path to the directory
# where the analyses when sampling from the prior took place (i.e., directory
# that contains the subdirectories from `1` to `n`, where `n` is the number
# of chains we ran).
all_paths <- c( paste( home_dir, "sum_analyses/01_posterior/GBM/", sep = "" ),
paste( home_dir, "sum_analyses/01_posterior/ILN/",
sep = "" ) )
#--------------#
# ANALYSE DATA #
#--------------#
# Define object names
num_dirs <- num_chains
delcol <- c( rep( delcol_post, 2 ) )
path <- all_paths
num_divt <- num_divt
node_calib <- calib_nodes[[ 1 ]]
dataset <- c( "LUCAdup_GBM", "LUCAdup_ILN" )
perc <- perc
def_samples <- def_samples
prior <- FALSE
out_dat <- all_paths
time_unit <- 1000
out_file_pdf <- c( "Convergence_plot_post_GBM",
"Convergence_plot_post_ILN" )
out_title_pdf <- c( "GBM","ILN" )
th <- 0.1
# Generate a list to keep all the returned objects
sum_post_QC <- vector( "list", length( all_paths ) )
names( sum_post_QC ) <- dataset
# Now, run the `QC_conv` function in a loop
for( i in 1:length( dataset ) ){
# NOTE: Only vectors with more than one element rely on `i`
sum_post_QC[[ i ]] <- QC_conv( num_dirs = num_dirs, delcol = delcol[i],
path = path[i], num_divt = num_divt,
node_calib = node_calib, dataset = dataset[i],
perc = perc, def_samples = def_samples,
prior = prior, out_dat = out_dat[i],
time_unit = time_unit,
out_file_pdf = out_file_pdf[i],
out_title_pdf = out_title_pdf[i],
th = th,
outchecks_dir = outchecks_dir )
}
# Save object!
if( ! dir.exists( paste( home_dir, "out_RData", sep = "" ) ) ){
dir.create( paste( home_dir, "out_RData", sep = "" ) )
}
save( file = paste( home_dir, "out_RData/sum_post_QC.RData", sep = "" ),
sum_post_QC )
# Print out sum stats in the script as a log
## [[ GBM | conc | cbA ]] ####
sum_post_QC$LUCAdup_GBM$ESS_results
# $median
# [1] 18687
#
# $min
# [1] 15350
#
# $max
# [1] 20001
#
# $num_samples_for_ESS
# [1] 245600
#
# $ESS
# Tail-ESS Bulk-ESS
# Med. 121016 118875
# Min. 104448 80050
# Max. 123427 123901
#
# $minRhat
# [1] 0.9999561
#
# $maxRhat
# [1] 1.000188
#
# $total_samples
# [1] 293854 245
length( sum_post_QC$LUCAdup_GBM$not_conv_nodes ) # 0
sum_post_QC$LUCAdup_GBM$num_chains # 16
## [[ ILN | conc | cbA ]] ####
sum_post_QC$LUCAdup_ILN$ESS_results
# $median
# [1] 19657
#
# $min
# [1] 14685
#
# $max
# [1] 20001
#
# $num_samples_for_ESS
# [1] 234960
#
# $ESS
# Tail-ESS Bulk-ESS
# Med. 115856 114577
# Min. 105629 88704
# Max. 117867 117924
#
# $minRhat
# [1] 0.9999549
#
# $maxRhat
# [1] 1.000102
#
# $total_samples
# [1] 306816 245
length( sum_post_QC$LUCAdup_ILN$not_conv_nodes ) # 0
sum_post_QC$LUCAdup_ILN$num_chains # 16