-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsPLSDA SRBCT Case Study.R
executable file
·194 lines (135 loc) · 11.2 KB
/
sPLSDA SRBCT Case Study.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
## ----global_options, include=FALSE----------------------------------------------------------------------------------
library(knitr)
knitr::opts_chunk$set(dpi = 100, echo= TRUE, warning=FALSE, message=FALSE, fig.align = 'center',
fig.show=TRUE, fig.keep = 'all', out.width = '90%')
## -------------------------------------------------------------------------------------------------------------------
library(mixOmics) # import the mixOmics library
set.seed(5249) # for reproducibility, remove for normal use
## -------------------------------------------------------------------------------------------------------------------
data(srbct) # extract the small round bull cell tumour data
X <- srbct$gene # use the gene expression data as the X matrix
Y <- srbct$class # use the class data as the Y matrix
dim(X) # check the dimensions of the X dataframe
summary(Y) # check the distribution of class labels
## ---- fig.cap = "FIGURE 1: Barplot of the variance each principal component explains of the SRBCT gene expression data."----
pca.srbct = pca(X, ncomp = 10, center = TRUE, scale = TRUE) # run pca method on data
plot(pca.srbct) # barplot of the eigenvalues (explained variance per component)
## ---- fig.cap = "FIGURE 2: Preliminary (unsupervised) analysis with PCA on the SRBCT gene expression data"----------
plotIndiv(pca.srbct, group = srbct$class, ind.names = FALSE, # plot the samples projected
legend = TRUE, title = 'PCA on SRBCT, comp 1 - 2') # onto the PCA subspace
## -------------------------------------------------------------------------------------------------------------------
srbct.splsda <- splsda(X, Y, ncomp = 10) # set ncomp to 10 for performance assessment later
## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 3: Sample plots of the SRBCT gene expression data after a basic PLS-DA model was operated on this data. (a) depicts the samples with the confidence ellipses of different class labels while (b) depicts the prediction background generated by these samples. Both plots use the first two components as axes."----
# plot the samples projected onto the first two components of the PLS-DA subspace
plotIndiv(srbct.splsda , comp = 1:2,
group = srbct$class, ind.names = FALSE, # colour points by class
ellipse = TRUE, # include 95% confidence ellipse for each class
legend = TRUE, title = '(a) PLSDA with confidence ellipses')
# use the max.dist measure to form decision boundaries between classes based on PLS-DA data
background = background.predict(srbct.splsda, comp.predicted=2, dist = "max.dist")
# plot the samples projected onto the first two components of the PLS-DA subspace
plotIndiv(srbct.splsda, comp = 1:2,
group = srbct$class, ind.names = FALSE, # colour points by class
background = background, # include prediction background for each class
legend = TRUE, title = " (b) PLSDA with prediction background")
## ---- fig.cap = "FIGURE 4: Tuning the number of components in PLS-DA on the SRBCT gene expression data. For each component, repeated cross-validation (10 × 3−fold CV) is used to evaluate the PLS-DA classification performance (OER and BER), for each type of prediction distance; `max.dist`, `centroids.dist` and `mahalanobis.dist`."----
# undergo performance evaluation in order to tune the number of components to use
perf.splsda.srbct <- perf(srbct.splsda, validation = "Mfold",
folds = 5, nrepeat = 10, # use repeated cross-validation
progressBar = FALSE, auc = TRUE) # include AUC values
# plot the outcome of performance evaluation across all ten components
plot(perf.splsda.srbct, col = color.mixo(5:7), sd = TRUE,
legend.position = "horizontal")
## -------------------------------------------------------------------------------------------------------------------
perf.splsda.srbct$choice.ncomp # what is the optimal value of components according to perf()
## ---- fig.cap = "FIGURE 5: Tuning keepX for the sPLS-DA performed on the SRBCT gene expression data. Each coloured line represents the balanced error rate (y-axis) per component across all tested keepX values (x-axis) with the standard deviation based on the repeated cross-validation folds. As sPLS-DA is an iterative algorithm, values represented for a given component (e.g. comp 1 to 2) include the optimal keepX value chosen for the previous component (comp 1)."----
# grid of possible keepX values that will be tested for each component
list.keepX <- c(1:10, seq(20, 300, 10))
# undergo the tuning process to determine the optimal number of variables
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 4, # calculate for first 4 components
validation = 'Mfold',
folds = 5, nrepeat = 10, # use repeated cross-validation
dist = 'max.dist', # use max.dist measure
measure = "BER", # use balanced error rate of dist measure
test.keepX = list.keepX,
cpus = 2) # allow for paralleliation to decrease runtime
plot(tune.splsda.srbct, col = color.jet(4)) # plot output of variable number tuning
## -------------------------------------------------------------------------------------------------------------------
tune.splsda.srbct$choice.ncomp$ncomp # what is the optimal value of components according to tune.splsda()
## -------------------------------------------------------------------------------------------------------------------
tune.splsda.srbct$choice.keepX # what are the optimal values of variables according to tune.splsda()
## -------------------------------------------------------------------------------------------------------------------
optimal.ncomp <- tune.splsda.srbct$choice.ncomp$ncomp
optimal.keepX <- tune.splsda.srbct$choice.keepX[1:optimal.ncomp]
## -------------------------------------------------------------------------------------------------------------------
# form final model with optimised values for component and variable count
final.splsda <- splsda(X, Y,
ncomp = optimal.ncomp,
keepX = optimal.keepX)
## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 6: Sample plots from sPLS-DA performed on the SRBCT gene expression data including 95% confidence ellipses. Samples are projected into the space spanned by the first three components. (a) Components 1 and 2 and (b) Components 1 and 3. Samples are coloured by their tumour subtypes."----
plotIndiv(final.splsda, comp = c(1,2), # plot samples from final model
group = srbct$class, ind.names = FALSE, # colour by class label
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = ' (a) sPLS-DA on SRBCT, comp 1 & 2')
plotIndiv(final.splsda, comp = c(1,3), # plot samples from final model
group = srbct$class, ind.names = FALSE, # colour by class label
ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
title = '(b) sPLS-DA on SRBCT, comp 1 & 3')
## ---- eval = FALSE--------------------------------------------------------------------------------------------------
## # set the styling of the legend to be homogeneous with previous plots
## legend=list(legend = levels(Y), # set of classes
## col = unique(color.mixo(Y)), # set of colours
## title = "Tumour Type", # legend title
## cex = 0.7) # legend size
##
## # generate the CIM, using the legend and colouring rows by each sample's class
## cim <- cim(final.splsda, row.sideColors = color.mixo(Y),
## legend = legend)
## ---- fig.cap = "FIGURE 8: Stability of variable selection from the sPLS-DA on the SRBCT gene expression data. The barplot represents the frequency of selection across repeated CV folds for each selected gene for component 1 (a), 2 (b) and 3 (c)."----
# form new perf() object which utilises the final model
perf.splsda.srbct <- perf(final.splsda,
folds = 5, nrepeat = 10, # use repeated cross-validation
validation = "Mfold", dist = "max.dist", # use max.dist measure
progressBar = FALSE)
# plot the stability of each feature for the first three components, 'h' type refers to histogram
par(mfrow=c(1,3))
plot(perf.splsda.srbct$features$stable[[1]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(a) Comp 1', las =2)
plot(perf.splsda.srbct$features$stable[[2]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(b) Comp 2', las =2)
plot(perf.splsda.srbct$features$stable[[3]], type = 'h',
ylab = 'Stability',
xlab = 'Features',
main = '(c) Comp 3', las =2)
## ---- fig.cap = "FIGURE 9: Correlation circle plot representing the genes selected by sPLS-DA performed on the SRBCT gene expression data. Gene names are truncated to the first 10 characters. Only the genes selected by sPLS-DA are shown in components 1 and 2."----
var.name.short <- substr(srbct$gene.name[, 2], 1, 10) # form simplified gene names
plotVar(final.splsda, comp = c(1,2), var.names = list(var.name.short), cex = 3) # generate correlation circle plot
## -------------------------------------------------------------------------------------------------------------------
train <- sample(1:nrow(X), 50) # randomly select 50 samples in training
test <- setdiff(1:nrow(X), train) # rest is part of the test set
# store matrices into training and test set:
X.train <- X[train, ]
X.test <- X[test,]
Y.train <- Y[train]
Y.test <- Y[test]
## -------------------------------------------------------------------------------------------------------------------
# train the model
train.splsda.srbct <- splsda(X.train, Y.train, ncomp = optimal.ncomp, keepX = optimal.keepX)
## -------------------------------------------------------------------------------------------------------------------
# use the model on the Xtest set
predict.splsda.srbct <- predict(train.splsda.srbct, X.test, dist = "mahalanobis.dist")
## -------------------------------------------------------------------------------------------------------------------
# evaluate the prediction accuracy for the first two components
predict.comp2 <- predict.splsda.srbct$class$mahalanobis.dist[,2]
table(factor(predict.comp2, levels = levels(Y)), Y.test)
## -------------------------------------------------------------------------------------------------------------------
# evaluate the prediction accuracy for the first three components
predict.comp3 <- predict.splsda.srbct$class$mahalanobis.dist[,3]
table(factor(predict.comp3, levels = levels(Y)), Y.test)
## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 10: ROC curve and AUC from sPLS-DA on the SRBCT gene expression data on component 1 (a) and all three components (b) averaged across one-vs.-all comparisons."----
auc.splsda = auroc(final.splsda, roc.comp = 1, print = FALSE) # AUROC for the first component
auc.splsda = auroc(final.splsda, roc.comp = 3, print = FALSE)# AUROC for all three components