-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path3-clean-finalize.Rmd
120 lines (87 loc) · 3.21 KB
/
3-clean-finalize.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
---
output:
  html_document:
    toc: true
    toc_float: true
---
# Dataset finalization
```{r setup_3, include=FALSE}
# Echo chunk source code in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

# R/_startup.R provides the startup() function.
source(file = "R/_startup.R")

# Load the desired packages and report any missing packages that should be
# installed.
startup(verbose = FALSE, auto_install = FALSE)

# Load any additional R files found in the R/ directory.
ck37r::load_all_code("R", verbose = TRUE)
```
## Load data {-}
```{r load_data_3}
# Restore the objects created in 2-clean-impute.Rmd: data, vars.
# load() is qualified with base:: because renv also includes a load() method.
base::load(file = "data/clean-impute.RData")
```
## Factors to indicators
```{r factors_indicators}
# Convert the predictors with ck37r::factors_to_indicators() and show which
# elements the returned list contains.
indicators = ck37r::factors_to_indicators(
  data[vars$predictors],
  verbose = TRUE
)
names(indicators)

# Swap the predictor columns: drop the current ones from the data frame, then
# append the converted columns returned above.
data[vars$predictors] = NULL
data = cbind(data, indicators$data)

# Update the predictor list: remove the original factor predictors and add
# the new indicator predictors.
vars$predictors = c(
  setdiff(vars$predictors, indicators$factor_vars),
  unlist(indicators$factor_names)
)
# Display the revised predictor list.
vars$predictors

# Confirm that every listed predictor exists as a column of the data frame.
if (any(!(vars$predictors %in% names(data)))) {
  missing_vars = setdiff(vars$predictors, names(data))
  stop("Missing new indicators that were added: ", paste(missing_vars, collapse = ", "))
}

# The conversion result is no longer needed.
rm(indicators)
```
## Remove collinear predictors
This is not essential, but nice for the linear regression estimators.
This needs to be after imputation, because it currently cannot handle missingness.
```{r remove_collinear}
# Look for predictor columns that are linear combinations of other columns.
# NOTE: assumes that there are no factor variables.
combos = caret::findLinearCombos(data[, vars$predictors])

if (length(combos$remove) == 0L) {
  cat("No linear duplication found.\n")
} else {
  # Names of the predictors flagged for removal.
  collinear_vars = vars$predictors[combos$remove]

  if (conf$verbose) {
    cat("Removing", length(combos$remove), "predictors due to collinearity.\n")
    cat("Vars:", paste(collinear_vars, collapse = ", "), "\n")
  }

  # drop = FALSE keeps a data frame even if only 1 column remains.
  data = data[, !(colnames(data) %in% collinear_vars), drop = FALSE]
  vars$predictors = setdiff(vars$predictors, collinear_vars)

  if (conf$verbose) {
    cat("Updated predictor count:", length(vars$predictors), "\n")
  }

  rm(collinear_vars)
}

rm(combos)
```
## Confirm predictor matrix invertibility
This is not essential, but nice for the linear regression estimators.
```{r confirm_invertability}
# Covariance matrix of the predictors.
# NOTE: this requires that no factors be included.
predictor_cov = stats::cov(data[vars$predictors])

# QR decomposition of the covariance matrix, used for its rank.
cov_qr = base::qr(predictor_cov)

# The covariance matrix is full rank only when its rank equals its number of
# columns.
if (ncol(predictor_cov) == cov_qr$rank) {
  cat("Predictor matrix is full rank.\n")
} else {
  cat("Warning: matrix of predictors is not full rank.\n")
  cat("Predictor columns:", ncol(predictor_cov), "QR rank:", cov_qr$rank, "\n")
}

rm(predictor_cov, cov_qr)
```
## Save finalized dataset {-}
```{r save_finalized}
# Write the finalized `data` and `vars` objects to disk.
save(list = c("data", "vars"), file = "data/clean-finalize-imputed.RData")
```