-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpca.R
executable file
·118 lines (99 loc) · 3.57 KB
/
pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
cat("\014")
rm(list = ls())
library(readr)
library(dplyr)
data("iris")
# Get names and group name for convenience
samplename <- seq_along(iris$Species)
samplegroup <- iris$Species
# Get only the numerical part
sampleprocessed <- iris[,1:ncol(iris)-1]
# Optional horizontal normalization
sampleprocessed <- apply(sampleprocessed,1,function(x) {
. <- norm(x,type="2")
return(x/.)
}) %>%
t # apply merges the results by columns, so need transpose
## prcomp way -----
pca <- prcomp(as.matrix(sampleprocessed),
retx = TRUE,
center = TRUE,
scale. = TRUE)
# Eigenvectors, all have unit length; linear combination coefficients to get
# the PC scores
eigvectors <- pca$rotation %>%
as.data.frame
# Variances of each PC; also the eigenvalues for the eigenvectors
eigvals <- pca$sdev^2
# x is PC scores
# x is unstandardized
scores <- pca$x %>%
as.data.frame
scores_std <- scale(scores) %>%
as.data.frame
# Loadings are eigenvectors scaled by sqrt of eigenvalues;
# such that their magnitude equals to eigenvalues
# each loading equals to Pearson correlation between original data and PC scores
. <- colnames(eigvectors)
loadings <- as.matrix(eigvectors) %*% diag(sqrt(eigvals)) %>%
as.data.frame
colnames(loadings) <- .
# Varimax rotation: rotate the principal axes to bring loadings closer to 0 or 1
# for better interpretation
# Usually done after deciding which PC to keep
. <- varimax(loadings %>%
select(PC1:PC2) %>%
as.matrix(),
normalize = FALSE)
rLoadings <- .$loadings
rotmat <- .$rotmat
# Use rotmat to rotate the standardized PC scores
# Each rotated loading equals to Pearson correlation between original data and
# rotated PC scores (somehow only true when rotating standardized scores)
rScores <- as.matrix(select(scores_std,PC1:PC2)) %*% rotmat %>%
as.data.frame
colnames(rScores) <- c("RC1","RC2")
## manual way -----
# samplescale <- scale(sampleprocessed,
# center = TRUE,
# scale = TRUE)
# eigs_manual <- eigen(cov(samplescale))
# eigs_manual_vectors <- eigs_manual$vectors
# eigs_manual_values <- eigs_manual$values
## Plot -----
# Scree plot: used to select PCs to keep
library(ggplot2)
library(hrbrthemes)
# scree <- as.data.frame(list(colnames(eigvectors),eigvals/sum(eigvals)))
# colnames(scree) <- c("PC","Percent")
# ggplot(scree, aes(x = PC,y = Percent,group=1)) +
# geom_line(color="grey") +
# geom_point() +
# ggtitle("Scree Plot") +
# xlab("Principal Components") +
# ylab("Percent Variance Explained") +
# theme_modern_rc()
# PC1 PC2 biplot
library(ggbiplot)
ggbiplot(pca,
obs.scale = 1,
var.scale = 1,
labels = samplename,
groups = samplegroup,
ellipse = TRUE,
circle = TRUE,
var.axes = T,
varname.size = 1.5) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top') +
theme_minimal()
# In correlation circle, variables with longer arrows are better represented for
# that PC
# The angle between the arrows represent correlation between the variables:
# small angle means high + correlation, 90 degrees means no correlation,
# opposite means high - correlation
# Correlation circles are plotted using loadings, which represent correlation
# between original variables and the PCs. Hence, they have a radius of 1.
# The ellipses are confidence ellipses, 2D version of confidence intervals.
# Means that 95% percent sure of the circle contains the true population
# distribution (population from which the sample is drawn)