Skip to content

Commit

Permalink
Merge branch 'master' of github.com:szilard/teach-ML-CEU-master-bizan…
Browse files Browse the repository at this point in the history
…alytics
  • Loading branch information
szilard committed Dec 8, 2017
2 parents 5bd4138 + dfd52d0 commit fb2a34f
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
File renamed without changes.
13 changes: 8 additions & 5 deletions wk02/lab/overfitting.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ set.seed(1234)
# DATA, CREATE TRAIN/TEST SPLIT

# https://www.kaggle.com/harlfoxem/housesalesprediction/data
data <- read_csv("wk02/lab/kc_house_data.csv")
data <- read_csv("wk02/lab/data/kc_house_data.csv")

# Fraction of rows sampled into data_train below.
# NOTE(review): despite its name, test_ratio is passed to sample_frac() when
# building the training set, so it acts as the training share; confirm intent
# (at 0.5 the two readings coincide).
test_ratio <- 0.5
data_train <- data %>% sample_frac(test_ratio)
# Test set: rows of data whose id does not occur in data_train.
data_test <- anti_join(data, data_train, by = "id")

#
# ESTIMATION AND EVALUATION

# Root mean squared error between predictions `x` and observed values `true_x`.
#
# x:      numeric vector of predicted values.
# true_x: numeric vector of observed values, same length as `x`.
# Returns a single number: sqrt(mean((x - true_x)^2)).
#
# Taking the mean (rather than the raw sum of squared errors) keeps the metric
# comparable between sets with different numbers of rows, and the square root
# puts it back on the scale of the target.
RMSE <- function(x, true_x) sqrt(mean((x - true_x)^2))

Expand All @@ -27,10 +27,11 @@ results <- map_df(
~ {
param <- .
model <- rpart(
formula = log(price) ~ bedrooms + bathrooms + sqft_living + sqft_lot + floors + waterfront + view + condition + grade + sqft_above + sqft_basement + yr_built + yr_renovated + zipcode + lat + long + sqft_living15 + sqft_lot15,
data = data_train,
formula = log(price) ~ .,
data = data_train %>% select(-id, - date),
control = rpart.control(xval = 0, cp = 0.00001, minbucket = param),
method = "anova")
method = "anova"
)

train_error <- RMSE(predict(model, data_train), log(data_train[["price"]]))
test_error <- RMSE(predict(model, data_test), log(data_test[["price"]]))
Expand All @@ -40,4 +41,6 @@ results <- map_df(
}
) %>% rbind()

# PLOT TRAIN AND TEST ERRORS

# One line per error_type, showing error_value against parameter.
# NOTE(review): parameter presumably holds the minbucket values tried; confirm
# against the elided part of the map_df() body.
ggplot(results) + geom_line(aes(x = parameter, y = error_value, color = error_type))

0 comments on commit fb2a34f

Please sign in to comment.