-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMCMC_diagnostics_prior.R
239 lines (226 loc) · 10.2 KB
/
MCMC_diagnostics_prior.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#-------------------#
# CLEAN ENVIRONMENT #
#-------------------#
rm( list = ls( ) )
#-----------------------------------------------#
# LOAD PACKAGES, FUNCTIONS, AND SET ENVIRONMENT #
#-----------------------------------------------#
# This package lets you find automatically the path to a specific location
# in your file structure
# If you have not installed this package, you will need to install it.
# You can uncomment the following line to do this:
#install.packages( "rstudioapi" )
library( rstudioapi )
scripts_dir <- gsub( pattern = "scripts..*", replacement = "scripts/",
x = getActiveDocumentContext()$path )
setwd( scripts_dir )
# Load the file with all the functions used throughout this script
source( file = "../../../src/Functions.R" )
# Run in-house function to set home directory and output directory for ESS
# and convergence tests
# NOTE: This function will create a directory called `plots` and another called
# `ESS_and_chains_convergence` inside the `analyses` directory if you have
# not created them yet
home_dir <- set_homedir()$home
outchecks_dir <- set_homedir()$ESS
# By now, set the working directory to `home_dir`
setwd( home_dir )
#-------------------------------------------------------------#
# DEFINE GLOBAL VARIABLES -- modify according to your dataset #
#-------------------------------------------------------------#
# First, we will define the global variables that match the settings in our
# analysis.
# 1. Number of chains
num_chains <- 6
# 2. Number of divergence times that have been estimated. One trick to find
# this out quickly is to subtract 1 to the number of species. In this case,
# there are 246 taxa (246), so the number of internal nodes
# is `n_taxa-=246-1=245`.
# Another way to verify this is by opening the `mcmc.txt` file and check the
# header. The first element after `Gen` will have the format of `t_nX`, where
# X will be an integer (i.e., 247). Subtract two to this number
# (i.e., 247-2=245) and this will be your number of divergence times that are
# parameters of the MCMC. Please modify the number below so it fits to the
# dataset you are using.
num_divt <- 245
# 3. Number of samples that you specified in the `MCMCtree` control file to
# collect. NOTE: you may have not collect them all, but do not worry!
def_samples <- 20000
# 4. Quantile percentage that you want to set By default, the variable below is
# set to 0.975 so the 97.5% and 2.5% quantiles (i.e., 95%CI). If you want to
# change this, however, just modify the value.
perc <- 0.975
# 5. Load a semicolon-separated file with info about calibrated nodes. Note that
# this file is output by script `Merge_node_labels.R`. A summary of its content
# in case you are to generate your own input files:
#
# Each column needs to be separated with semicolons and an extra blank line
# after the last row with calibration information needs to be added. If the
# extra blank is not added, R will complain and will not load the file!
# If you add a header, please make sure you name the column elements as
# `Calib;node;Prior`. If not, the R function below will deal with the header,
# but make sure you set `head_avail = FALSE` when running `read_calib_f`
# function below. An example of the content of this file is given below:
#
# ```
# Calib;node;Prior
# ex_n5;5;ST(5.8300,0.0590,0.1120,109.1240)
# ex_n7;7;B(4.1200,4.5200,0.0250,0.0250)
#
# ```
#
# The first column will have the name of the calibration/s that can help you
# identify which node belongs to which calibration. The second column is the
# number given to this node by`MCMCtree` (this information is automatically
# found when you run the script `Merge_node_labels.R`, otherwise you will need
# to keep checking the output file `node_trees.tree` to figure out which node
# is which). The third column is the calibration used for that node in
# `MCMCtree` format.
#
# [[ NOTES ABOUT ALLOWED CALIBRATION FORMATS]]
#
# Soft-bound calibrations:
# E.g.1: A calibration with a minimum of 0.6 and a maximum of 0.8 would with
# the default tail probabilities would have the following equivalent
# formats:
# >> B(0.6,0.8) | B(0.6,0.8,0.025,0.025)
# E.g.2: A calibration with a minimum of 0.6 and a maximum of 0.8 would with
# the pL=0.001 and pU=0.025 would have the following format. Note that,
# whenever you want to modify either pL or pU, you need to write down
# the four parameters in the format of "B(min,max,pL,pU)":
# >> B(0.6,0.8,0.001,0.025)
#
# Lower-bound calibrations:
# E.g.1: A calibration with a minimum of 0.6 and the default parameters for
# p = 0.1, c = 1, pL = 0.025:
# >> L(0.6) | L(0.6,0.1,1,0.025)
# E.g.2: A calibration with a hard minimum at 0.6, and so pL = 1e-300.
# Note that, whenever you want to modify either pL or pU, you need to
# write down the four parameters in the format of "L(min,p,c,pL)":
# >> L(0.6,0.1,1,1e-300)
#
# Upper-bound calibrations:
# E.g.1: A calibration with a maximum of 0.8 and the default parameters for
# pU = 0.025:
# >> U(0.8) | U(0.8,0.025)
# E.g.2: A calibration with a hard maximum at 0.8, and so pU = 1e-300.
# Note that, if you want to modify pU, you need to write down the two
# parameters in the format of "U(max,pU)":
# >> U(0.8,1e-300)
#
# ST distributions:
# The format accepted has four parameters: xi (location, mean root age),
# omega (scale), alpha (shape), nu (df). Accepted format:
# >> ST(5.8300,0.0590,0.1120,109.1240)
#
# SN distributions:
# The format accepted has three parameters: xi (location, mean root age),
# omega (scale), alpha (shape). Accepted format:
# >> SN(5.8300,0.0590,0.1120)
#
#
# The next command executes the `read_calib_f` in-house function, which reads
# your input files (semicolon-separated files). The path to this directory is
# what the argument `main_dir` needs. The argument `f_names` requires the name
# of the file/s that you have used. Argument `dat` requires the same global
# object that you have created at the beginning of the script.
dat <- "LUCAdup"
calib_nodes <- read_calib_f( main_dir = paste( home_dir, "calib_files/",
sep = "" ),
f_names = "Calibnodes.csv",
dat = dat, head_avail = TRUE )
# 6. Number of columns in the `mcmc.txt` that are to be deleted as they do not
# correspond to sample values for divergence times (i.e., the entries are not
# names following the format `t_nX`). To figure out this number quickly, you
# can open the `mcmc.txt` file, read the header, and count the number of `mu*`
# elements. Do not count the `lnL` value when looking at
# `mcmc.txt` files generated when sampling from the posterior -- this is
# automatically accounted for in the in-house R functions that you will
# subsequently use. E.g., you expect to see as many `mu[0-9]` as alignment
# blocks you have in your sequence file! E.g., if you had two alignment blocks,
# you would speciy `delcol_prior <- 2`. Please modify the value/s below
# (depending on having one or more datasets) according to the `mcmc.txt` file
# generated when sampling from the prior (`delcol_prior`)
##> NOTE: If you ran `MCMCtree` with `clock = 2` or `clock = 3` when
##> sampling from the prior, you will also need to count the `sigma2*`
##> columns! We ran `clock = 1` so that the analyses ran quicker, and thus
##> we only have `mu*` columns.
delcol_prior <- 1 # There is one column: mu
# 7. Path to the directory where the subdirectories where each chain ran for
# each dataset are saved In this case, we will have the path to the directory
# where the analyses when sampling from the prior took place (i.e., directory
# that contains the subdirectories from `1` to `n`, where `n` is the number
# of chains we ran).
path_prior <- paste( home_dir, "sum_analyses/00_prior/CLK/", sep = "" )
#--------------#
# ANALYSE DATA #
#--------------#
# Define object names
num_dirs <- num_chains
delcol <- delcol_prior
path <- path_prior
num_divt <- num_divt
node_calib <- calib_nodes[[ 1 ]] # Get the 1st element, only 1 set of cals!
dataset <- c( "LUCAdup_CLK" )
perc <- perc
def_samples <- def_samples
prior <- TRUE
out_dat <- path_prior
time_unit <- 1000
out_file_pdf <- c( "Convergence_plot_prior" )
out_title_pdf <- c( "CLK")
th <- 0.1
# Generate a list to keep all the returned objects
sum_prior_QC <- vector( "list", length( path_prior ) )
names( sum_prior_QC ) <- dataset
# Now, run the `QC_conv` function in a loop
for( i in 1:length( dataset ) ){
# NOTE: Only vectors with more than one element rely on `i`
sum_prior_QC[[ i ]] <- QC_conv( num_dirs = num_dirs, delcol = delcol[i],
path = path[i], num_divt = num_divt,
node_calib = node_calib, dataset = dataset[i],
perc = perc, def_samples = def_samples,
prior = prior, out_dat = out_dat[i],
time_unit = time_unit,
out_file_pdf = out_file_pdf[i],
out_title_pdf = out_title_pdf[i],
th = th,
outchecks_dir = outchecks_dir )
}
# Save object!
if( ! dir.exists( paste( home_dir, "out_RData", sep = "" ) ) ){
dir.create( paste( home_dir, "out_RData", sep = "" ) )
}
save( file = paste( home_dir, "out_RData/sum_prior_QC.Rdata", sep = "" ),
sum_prior_QC )
# Print out sum stats in the script as a log
## [[ CLK | conc | cbA ]] ####
sum_prior_QC$LUCAdup_CLK$ESS_results
# $median
# [1] 20001
#
# $min
# [1] 20001
#
# $max
# [1] 20001
#
# $num_samples_for_ESS
# [1] 120006
#
# $ESS
# Tail-ESS Bulk-ESS
# Med. 59333 59756
# Min. 57323 56625
# Max. 60445 61360
#
# $minRhat
# [1] 0.9999514
#
# $maxRhat
# [1] 1.000149
#
# $total_samples
# [1] 120006 245
length( sum_prior_QC$LUCAdup_CLK$not_conv_nodes ) # 0
sum_prior_QC$LUCAdup_CLK$num_chains # 6