-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCreateTuningXgboostSubmission.R
43 lines (37 loc) · 1.76 KB
/
CreateTuningXgboostSubmission.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
library(data.table)
# Run configuration ----
# Date stamp embedded in the submission directory and in every output file name.
dateString <- "2019-07-07"
# Directory scanned for the kuhn2018 train/test CSV files.
inputPath <- "data/"
# Submission directory for this date; the trailing slash is kept because file
# names are appended to it with paste0().
outputPath <- sprintf("../Submissions/%s/Slytherin/", dateString)
# Build the modelling features for one benchmark table.
#
# Args:
#   dataset: a data.table holding at least the columns data_id,
#     NumberOfClasses, NumberOfInstances, NumberOfFeatures, nrounds, scimark,
#     colsample_bylevel, colsample_bytree and subsample.
#
# Returns:
#   A modified copy of `dataset` (the input itself is not changed) without
#   the data_id and NumberOfClasses columns and with two extra columns,
#   `complexity` and `tree_complexity`.
engineerFeatures <- function(dataset) {
  # := modifies by reference, so work on a private copy.
  engineered <- copy(dataset)

  engineered[, c("data_id", "NumberOfClasses") := NULL]

  # The leading 1.0 makes the whole product double arithmetic
  # (presumably to stay clear of integer overflow -- confirm).
  engineered[
    ,
    complexity := 1.0 * NumberOfInstances * NumberOfFeatures * nrounds / scimark
  ]
  # Scales `complexity` by the three sampling-fraction columns.
  engineered[
    ,
    tree_complexity := complexity * colsample_bylevel * colsample_bytree * subsample
  ]

  engineered
}
# Make sure the submission directory exists, creating parent directories as
# needed. With showWarnings = FALSE the call is silent when the directory is
# already there, so no separate existence check is required.
dir.create(outputPath, showWarnings = FALSE, recursive = TRUE)
# For every kuhn2018 test file in `inputPath`: read it together with its
# matching training file, engineer features, impute missing values, fit an
# XGBoost model on the training data and write the test predictions as a CSV
# into `outputPath`.
#
# The file pattern is anchored at the end of the name so that only files
# ending in "kuhn2018-test-<number>.csv" are processed; names such as
# "kuhn2018-test-1.csv.bak" would otherwise match and yield a malformed
# output file name.
for (inputTestFile in list.files(inputPath,
                                 pattern = "kuhn2018-test-[0-9]+\\.csv$",
                                 full.names = TRUE)) {
  # Read in
  # Only the file name is rewritten to locate the training file, so a "test"
  # substring elsewhere in the path cannot be replaced by accident.
  inputTrainFile <- file.path(
    dirname(inputTestFile),
    sub("kuhn2018-test-([0-9]+\\.csv)$", "kuhn2018-train-\\1",
        basename(inputTestFile))
  )
  trainData <- fread(inputTrainFile)
  testData <- fread(inputTestFile)

  # Pre-process
  trainData <- engineerFeatures(trainData)
  testData <- engineerFeatures(testData)
  # Median imputation is fitted on the training predictors only and then
  # applied to both tables.
  preprocModel <- caret::preProcess(trainData[, -"target"],
                                    method = "medianImpute")
  trainData <- predict(preprocModel, trainData)
  testData <- predict(preprocModel, testData)

  # Train model
  # `[, -1]` drops the first column of each model matrix, i.e. the intercept
  # term generated by `~ .`.
  xgbTrainPredictors <- Matrix::sparse.model.matrix(
    ~ .,
    data = trainData[, -"target"]
  )[, -1]
  xgbTrainData <- xgboost::xgb.DMatrix(data = xgbTrainPredictors,
                                       label = trainData$target)
  # NOTE(review): every column of testData is used as a predictor here, which
  # assumes the test files carry no `target` column -- confirm.
  xgbTestPredictors <- Matrix::sparse.model.matrix(~ ., data = testData)[, -1]
  xgbModel <- xgboost::xgb.train(
    data = xgbTrainData,
    nrounds = 50,
    params = list(objective = "reg:tweedie", nthread = 4)
  )

  # Predict
  solution <- data.table(target = predict(xgbModel, newdata = xgbTestPredictors))
  # Predictions below 1 are raised to 1.
  solution[target < 1, target := 1]

  # "<number>.csv" suffix of the test file, reused as the suffix of the output
  # file name. The dot is escaped so it matches a literal "." only.
  numberString <- regmatches(inputTestFile,
                             regexpr("[0-9]+\\.csv$", inputTestFile))
  fwrite(
    solution,
    file = paste0(outputPath, "Slytherin-", dateString, "-prediction-",
                  numberString),
    quote = FALSE
  )
}