sPLSDA SRBCT Case Study.R

## ----global_options, include=FALSE----------------------------------------------------------------------------------
library(knitr)
knitr::opts_chunk$set(dpi = 100, echo= TRUE, warning=FALSE, message=FALSE, fig.align = 'center',
                      fig.show=TRUE, fig.keep = 'all', out.width = '90%')


## -------------------------------------------------------------------------------------------------------------------
library(mixOmics) # import the mixOmics library

set.seed(5249) # for reproducibility, remove for normal use


## -------------------------------------------------------------------------------------------------------------------
data(srbct) # extract the small round bull cell tumour data
X <- srbct$gene # use the gene expression data as the X matrix
Y <- srbct$class # use the class data as the Y matrix

dim(X) # check the dimensions of the X dataframe
summary(Y) # check the distribution of class labels


## ---- fig.cap = "FIGURE 1: Barplot of the variance each principal component explains of the SRBCT gene expression data."----
pca.srbct = pca(X, ncomp = 10, center = TRUE, scale = TRUE) # run pca method on data
plot(pca.srbct)  # barplot of the eigenvalues (explained variance per component)


## ---- fig.cap = "FIGURE 2: Preliminary (unsupervised) analysis with PCA on the SRBCT gene expression data"----------
plotIndiv(pca.srbct, group = srbct$class, ind.names = FALSE, # plot the samples projected
          legend = TRUE, title = 'PCA on SRBCT, comp 1 - 2') # onto the PCA subspace


## -------------------------------------------------------------------------------------------------------------------
srbct.splsda <- splsda(X, Y, ncomp = 10)  # set ncomp to 10 for performance assessment later


## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 3: Sample plots of the SRBCT gene expression data after a basic PLS-DA model was operated on this data. (a) depicts the samples with the confidence ellipses of different class labels while (b) depicts the prediction background generated by these samples. Both plots use the first two components as axes."----
# plot the samples projected onto the first two components of the PLS-DA subspace
plotIndiv(srbct.splsda , comp = 1:2, 
          group = srbct$class, ind.names = FALSE,  # colour points by class
          ellipse = TRUE, # include 95% confidence ellipse for each class
          legend = TRUE, title = '(a) PLSDA with confidence ellipses')

# use the max.dist measure to form decision boundaries between classes based on PLS-DA data
background = background.predict(srbct.splsda, comp.predicted=2, dist = "max.dist")

# plot the samples projected onto the first two components of the PLS-DA subspace
plotIndiv(srbct.splsda, comp = 1:2,
          group = srbct$class, ind.names = FALSE, # colour points by class
          background = background, # include prediction background for each class
          legend = TRUE, title = " (b) PLSDA with prediction background")


## ---- fig.cap = "FIGURE 4: Tuning the number of components in PLS-DA on the SRBCT gene expression data. For each component, repeated cross-validation (10 × 3−fold CV) is used to evaluate the PLS-DA classification performance (OER and BER), for each type of prediction distance; `max.dist`, `centroids.dist` and `mahalanobis.dist`."----
# undergo performance evaluation in order to tune the number of components to use
perf.splsda.srbct <- perf(srbct.splsda, validation = "Mfold", 
                          folds = 5, nrepeat = 10, # use repeated cross-validation
                          progressBar = FALSE, auc = TRUE) # include AUC values

# plot the outcome of performance evaluation across all ten components
plot(perf.splsda.srbct, col = color.mixo(5:7), sd = TRUE,
     legend.position = "horizontal")


## -------------------------------------------------------------------------------------------------------------------
perf.splsda.srbct$choice.ncomp # what is the optimal value of components according to perf()


## ---- fig.cap = "FIGURE 5:  Tuning keepX for the sPLS-DA performed on the SRBCT gene expression data. Each coloured line represents the balanced error rate (y-axis) per component across all tested keepX values (x-axis) with the standard deviation based on the repeated cross-validation folds. As sPLS-DA is an iterative algorithm, values represented for a given component (e.g. comp 1 to 2) include the optimal keepX value chosen for the previous component (comp 1)."----
# grid of possible keepX values that will be tested for each component
list.keepX <- c(1:10,  seq(20, 300, 10))

# undergo the tuning process to determine the optimal number of variables
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 4, # calculate for first 4 components
                                 validation = 'Mfold',
                                 folds = 5, nrepeat = 10, # use repeated cross-validation
                                 dist = 'max.dist', # use max.dist measure
                                 measure = "BER", # use balanced error rate of dist measure
                                 test.keepX = list.keepX,
                                 cpus = 2) # allow for paralleliation to decrease runtime

plot(tune.splsda.srbct, col = color.jet(4)) # plot output of variable number tuning


## -------------------------------------------------------------------------------------------------------------------
tune.splsda.srbct$choice.ncomp$ncomp # what is the optimal value of components according to tune.splsda()


## -------------------------------------------------------------------------------------------------------------------
tune.splsda.srbct$choice.keepX # what are the optimal values of variables according to tune.splsda()


## -------------------------------------------------------------------------------------------------------------------
optimal.ncomp <- tune.splsda.srbct$choice.ncomp$ncomp
optimal.keepX <- tune.splsda.srbct$choice.keepX[1:optimal.ncomp]


## -------------------------------------------------------------------------------------------------------------------
# form final model with optimised values for component and variable count
final.splsda <- splsda(X, Y, 
                       ncomp = optimal.ncomp, 
                       keepX = optimal.keepX)


## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 6:  Sample plots from sPLS-DA performed on the SRBCT gene expression data including 95% confidence ellipses. Samples are projected into the space spanned by the first three components. (a) Components 1 and 2 and (b) Components 1 and 3. Samples are coloured by their tumour subtypes."----
plotIndiv(final.splsda, comp = c(1,2), # plot samples from final model
          group = srbct$class, ind.names = FALSE, # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = ' (a) sPLS-DA on SRBCT, comp 1 & 2')

plotIndiv(final.splsda, comp = c(1,3), # plot samples from final model
          group = srbct$class, ind.names = FALSE,  # colour by class label
          ellipse = TRUE, legend = TRUE, # include 95% confidence ellipse
          title = '(b) sPLS-DA on SRBCT, comp 1 & 3')


## ---- eval = FALSE--------------------------------------------------------------------------------------------------
## # set the styling of the legend to be homogeneous with previous plots
## legend=list(legend = levels(Y), # set of classes
##             col = unique(color.mixo(Y)), # set of colours
##             title = "Tumour Type", # legend title
##             cex = 0.7) # legend size
## 
## # generate the CIM, using the legend and colouring rows by each sample's class
## cim <- cim(final.splsda, row.sideColors = color.mixo(Y),
##            legend = legend)


## ---- fig.cap = "FIGURE 8:  Stability of variable selection from the sPLS-DA on the SRBCT gene expression data. The barplot represents the frequency of selection across repeated CV folds for each selected gene for component 1 (a), 2 (b) and 3 (c)."----
# form new perf() object which utilises the final model
perf.splsda.srbct <- perf(final.splsda, 
                          folds = 5, nrepeat = 10, # use repeated cross-validation
                          validation = "Mfold", dist = "max.dist",  # use max.dist measure
                          progressBar = FALSE)

# plot the stability of each feature for the first three components, 'h' type refers to histogram
par(mfrow=c(1,3))
plot(perf.splsda.srbct$features$stable[[1]], type = 'h', 
     ylab = 'Stability', 
     xlab = 'Features', 
     main = '(a) Comp 1', las =2)
plot(perf.splsda.srbct$features$stable[[2]], type = 'h', 
     ylab = 'Stability', 
     xlab = 'Features', 
     main = '(b) Comp 2', las =2)
plot(perf.splsda.srbct$features$stable[[3]], type = 'h', 
     ylab = 'Stability', 
     xlab = 'Features',
     main = '(c) Comp 3', las =2)


## ---- fig.cap = "FIGURE 9: Correlation circle plot representing the genes selected by sPLS-DA performed on the SRBCT gene expression data. Gene names are truncated to the first 10 characters. Only the genes selected by sPLS-DA are shown in components 1 and 2."----
var.name.short <- substr(srbct$gene.name[, 2], 1, 10) # form simplified gene names

plotVar(final.splsda, comp = c(1,2), var.names = list(var.name.short), cex = 3) # generate correlation circle plot


## -------------------------------------------------------------------------------------------------------------------
train <- sample(1:nrow(X), 50) # randomly select 50 samples in training
test <- setdiff(1:nrow(X), train) # rest is part of the test set

# store matrices into training and test set:
X.train <- X[train, ]
X.test <- X[test,]
Y.train <- Y[train]
Y.test <- Y[test]


## -------------------------------------------------------------------------------------------------------------------
# train the model
train.splsda.srbct <- splsda(X.train, Y.train, ncomp = optimal.ncomp, keepX = optimal.keepX)


## -------------------------------------------------------------------------------------------------------------------
# use the model on the Xtest set
predict.splsda.srbct <- predict(train.splsda.srbct, X.test, dist = "mahalanobis.dist")


## -------------------------------------------------------------------------------------------------------------------
# evaluate the prediction accuracy for the first two components
predict.comp2 <- predict.splsda.srbct$class$mahalanobis.dist[,2]
table(factor(predict.comp2, levels = levels(Y)), Y.test)


## -------------------------------------------------------------------------------------------------------------------
# evaluate the prediction accuracy for the first three components
predict.comp3 <- predict.splsda.srbct$class$mahalanobis.dist[,3]
table(factor(predict.comp3, levels = levels(Y)), Y.test)


## ---- fig.show = "hold", out.width = "49%", fig.cap = "FIGURE 10: ROC curve and AUC from sPLS-DA on the SRBCT gene expression data on component 1 (a) and all three components (b) averaged across one-vs.-all comparisons."----
auc.splsda = auroc(final.splsda, roc.comp = 1, print = FALSE) # AUROC for the first component

auc.splsda = auroc(final.splsda, roc.comp = 3, print = FALSE)# AUROC for all three components