-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathDiamond Prices.R
85 lines (52 loc) · 2.31 KB
/
Diamond Prices.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Diamond Prices
### Load Libraries
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(magrittr)
library(caret)
library(plotly)
library(corrplot)
### Load Dataset
# Read the diamonds CSV through a path relative to the current working
# directory instead of calling setwd(), so running this script does not
# change the session's working directory as a side effect.
# stringsAsFactors = TRUE turns text columns into factors (the read.csv()
# default before R 4.0.0); the script later calls levels() on cut, color and
# clarity, which only returns something useful for factors.
diamond <- read.csv(
  file.path("Kaggle", "diamonds.csv"),
  stringsAsFactors = TRUE
)
head(diamond)
### Class and Missing Values per Column
# Prints one row per column of `diamond` with its class (VarType) and the
# number of NA entries it contains (Total_Missing).
# vapply() fixes the result type of each summary. A column carrying several
# classes (e.g. an ordered factor) is collapsed into a single "a/b" label so
# that VarType is always a plain character vector rather than a list.
data.frame(
  VarType = vapply(
    diamond,
    function(column) paste(class(column), collapse = "/"),
    character(1)
  ),
  Total_Missing = vapply(
    diamond,
    function(column) sum(is.na(column)),
    integer(1)
  )
)
## See Different Levels in Factor Variations
# Each column is wrapped in factor() before asking for its levels: since
# R 4.0.0 read.csv() keeps text columns as character vectors by default, and
# levels() returns NULL for a character vector. If a column is already a
# factor, factor() keeps its levels (dropping only unused ones).
print("Cut Levels")
levels(factor(diamond$cut))
print("Color Levels")
levels(factor(diamond$color))
print("Clarity Levels")
levels(factor(diamond$clarity))
### Density Plots
# Price density, drawn once per categorical attribute with one translucent
# fill per category. Written with ggplot() + geom_density() because qplot()
# is deprecated in current ggplot2 releases.
# The labels now describe what is drawn: price is on the x axis, and the
# categorical variable only selects the fill colour (it is named in the
# legend), so the titles no longer mention carat.
ggplot(diamond, aes(x = price, fill = cut)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Price by Cut",
    x = "Price",
    y = "Density"
  ) +
  theme_minimal()
ggplot(diamond, aes(x = price, fill = color)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Price by Color",
    x = "Price",
    y = "Density"
  ) +
  theme_minimal()
ggplot(diamond, aes(x = price, fill = clarity)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Distribution of Price by Clarity",
    x = "Price",
    y = "Density"
  ) +
  theme_minimal()
### More Plots
# Bar charts counting how many diamonds fall into each category of cut,
# color and clarity; each chart uses its own fill colour.
ggplot(diamond, aes(x = cut)) +
  geom_bar(fill = "green") +
  labs(
    y = "Total Count",
    title = "Distribution of Diamonds by Cut Type"
  ) +
  theme_minimal()
ggplot(diamond, aes(x = color)) +
  geom_bar(fill = "khaki") +
  labs(
    y = "Total Count",
    title = "Distribution of Diamonds by Color Type"
  ) +
  theme_minimal()
ggplot(diamond, aes(x = clarity)) +
  geom_bar(fill = "violet") +
  labs(
    y = "Total Count",
    title = "Distribution of Diamonds by Clarity Type"
  ) +
  theme_minimal()
### Encoding
# Replace the categorical columns (cut, color, clarity) with the dummy
# columns produced by caret's dummyVars() encoder.
ohe_features <- c("cut", "color", "clarity")
dummies <- dummyVars(~ cut + color + clarity, data = diamond)
# Keep every column that is not one of the encoded features, then append the
# encoder's output on the right. Built in a single expression, so no
# temporary data frames are left behind to clean up.
newdiamond <- cbind(
  diamond[, !(names(diamond) %in% ohe_features)],
  as.data.frame(predict(dummies, newdata = diamond))
)
### Looking at the New Data
head(newdiamond)
### Separating the ID and Target Columns
# Save the X column (presumably the CSV's row index -- confirm) and the price
# as standalone vectors, then remove both from the feature table.
x.label <- newdiamond[["X"]]
y.label <- as.numeric(newdiamond[["price"]])
newdiamond[["X"]] <- NULL
newdiamond[["price"]] <- NULL
## Correlation plot
# Correlation matrix of the remaining features plus the price (re-attached
# under the name "Price"); only the upper triangle is drawn.
corrplot(
  cor(cbind(newdiamond, Price = y.label)),
  type = "upper"
)