-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHousePricing-R.R
149 lines (108 loc) · 3.51 KB
/
HousePricing-R.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# ---- Load and explore the data ----
# Read the housing data from the working directory (first row holds the
# column names).
df <- read.csv(file = "House_Price.csv")

# First look: leading rows, column types, and per-column summary statistics.
head(df)
str(df)
summary(df)

# Histogram of the crime rate; on its own it did not reveal much.
hist(df$crime_rate)

# Scatter-plot matrix of price against three candidate predictors.
# Findings:
#   * n_hot_rooms and rainfall contain outliers.
#   * crime_rate is not linearly related to price, so it has to be
#     transformed before it can be used in a linear model.
pairs(~ price + crime_rate + n_hot_rooms + rainfall, data = df)

# Bar plots of the three categorical variables, in this order:
#   airport   - nothing suspicious
#   waterbody - nothing suspicious
#   bus_ter   - only one distinct value, hence not useful in the data set
invisible(lapply(
  c("airport", "waterbody", "bus_ter"),
  function(column) barplot(table(df[[column]]))
))

# Observations
#   1. n_hot_rooms has outliers.
#   2. n_hos_beds has missing values.
#   3. bus_ter carries no information.
#   4. crime_rate has some functional (non-linear) relationship with price.
# ---- Outlier treatment ----
# Upper cap for n_hot_rooms: its 99th percentile. Show the cap, then replace
# every value above it with the cap itself.
uv <- quantile(df$n_hot_rooms, probs = 0.99)
uv
df$n_hot_rooms <- replace(df$n_hot_rooms, df$n_hot_rooms > uv, uv)

# Check the change.
summary(df$n_hot_rooms)

# Lower floor for rainfall: 0.3 times its 1st percentile. Values below the
# floor are raised to the floor.
lv <- quantile(df$rainfall, probs = 0.01) * 0.3
df$rainfall <- replace(df$rainfall, df$rainfall < lv, lv)

# Check the change.
summary(df$rainfall)

# ---- Missing values ----
# n_hos_beds is imputed with the mean of its observed values.
# Show that mean and the row positions that are missing.
mean(df$n_hos_beds, na.rm = TRUE)
which(is.na(df$n_hos_beds))

df$n_hos_beds <- replace(
  df$n_hos_beds,
  is.na(df$n_hos_beds),
  mean(df$n_hos_beds, na.rm = TRUE)
)

# Check the change.
summary(df$n_hos_beds)
# ---- Feature engineering ----
# Look at price against crime rate before transforming it.
pairs(~ price + crime_rate, data = df)
plot(df$price, df$crime_rate)

# Log-transform crime_rate to straighten its relationship with price.
# log1p(x) computes log(1 + x) and stays accurate for x close to zero.
df$crime_rate <- log1p(df$crime_rate)

# Summarise the four distance columns by their row-wise average.
df$avg_dist <- (df$dist1 + df$dist2 + df$dist3 + df$dist4) / 4

# Open the data viewer only in interactive sessions, so the script can
# also be run in batch mode.
if (interactive()) {
  View(df)
}

# Drop the four distance columns (replaced by avg_dist) and the
# uninformative bus_ter column. Columns are removed by name so that the
# step does not depend on the column order of the CSV file.
df <- df[
  ,
  setdiff(names(df), c("dist1", "dist2", "dist3", "dist4", "bus_ter")),
  drop = FALSE
]

# Dummy variables for the categorical columns.
# airport: 1 when the value is "YES", otherwise 0.
df$airport <- ifelse(df$airport == "YES", 1, 0)
# waterbody is split into two indicators; "Lake and River" sets both.
df$river <- ifelse(
  df$waterbody == "River" | df$waterbody == "Lake and River", 1, 0
)
df$lake <- ifelse(
  df$waterbody == "Lake" | df$waterbody == "Lake and River", 1, 0
)

# The original waterbody column is now redundant.
df$waterbody <- NULL

# Correlation matrix of all remaining columns, rounded to 2 decimal places.
round(cor(df), 2)

# The park and air-quality columns are strongly correlated, which would
# introduce multicollinearity; the park column is removed.
# NOTE(review): the park column is removed by position (13) because its
# exact name is not used anywhere in this script - confirm the name and
# switch to name-based removal.
df <- df[, -13]
# ---- Linear regression on the full data ----
# Simple model: price explained by the number of rooms only.
simple_model <- lm(formula = price ~ room_num, data = df)
summary(simple_model)

# Scatter plot of room number against price with the fitted line on top.
plot(df$room_num, df$price)
abline(simple_model)

# Multiple regression: price explained by every other column of df.
multiple_model <- lm(formula = price ~ ., data = df)
summary(multiple_model)
# ---- Train / test split ----
# Install caTools only when it is missing, so the script does not hit the
# network and reinstall the package on every run.
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library("caTools")

# Fix the RNG seed so the split is reproducible.
set.seed(0)

# sample.split() expects a vector with one entry per observation. Passing
# the whole data frame returns one flag per COLUMN, which subset() would
# then silently recycle over the rows, so the response column is passed
# instead.
split <- sample.split(df$price, SplitRatio = 0.8)
stopifnot(length(split) == nrow(df))

training_set <- subset(df, split == TRUE)
test_set <- subset(df, split == FALSE)

# Fit the multiple regression on the training rows only.
lm_a <- lm(price ~ ., data = training_set)
summary(lm_a)

# Predicted prices for the training and the held-out test rows.
train_a <- predict(lm_a, newdata = training_set)
test_a <- predict(lm_a, newdata = test_set)

# Mean squared error on the training set and on the test set.
mean((training_set$price - train_a)^2)
mean((test_set$price - test_a)^2)
# ---- Subset selection ----
# Install leaps only when it is missing, so the script does not hit the
# network and reinstall the package on every run.
if (!requireNamespace("leaps", quietly = TRUE)) {
  install.packages("leaps")
}
library("leaps")

# Best-subset search over models with up to 15 predictors.
lm_best <- regsubsets(price ~ ., data = df, nvmax = 15)

# Summarise once and reuse the result instead of recomputing it.
lm_best_summary <- summary(lm_best)
lm_best_summary

# Adjusted R-squared of the best model of each size.
lm_best_summary$adjr2

# Model size with the highest adjusted R-squared, and the coefficients of
# that model. The size is taken from the search result rather than being
# hard-coded, so it stays correct if the data or preprocessing changes.
best_size <- which.max(lm_best_summary$adjr2)
best_size
coef(lm_best, best_size)

# Forward and backward stepwise selection for comparison.
lm_forward <- regsubsets(price ~ ., data = df, nvmax = 15, method = "forward")
summary(lm_forward)
lm_backward <- regsubsets(
  price ~ ., data = df, nvmax = 15, method = "backward"
)
summary(lm_backward)