-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEL_xgboost.R
264 lines (164 loc) · 9.94 KB
/
EL_xgboost.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
require(tidyverse)
require(parallel)
require(xgboost)
ff <- function(a,b){
idx <- which(b$opp == a)
if(length(idx) > 0){
return(b[idx,])
}else{
return(NULL)
}
}
tList <- readRDS("processed_data.RDS")
cl <- makeCluster(detectCores())
clusterEvalQ(cl, require(tidyverse))
clusterEvalQ(cl, require(parallel))
gameStats <- tList
gameStats <- lapply(gameStats, function(season) lapply(season, function(team) lapply(team, function(games) games[[1]])))
begin <- Sys.time()
gameStats <- parLapply(cl,gameStats, function(season) lapply(season, function(team) lapply(team, function(games){
isN <- colnames(games)[which(sapply(games,function(x) is.numeric(x) | is.logical(x)))]
games %>% group_by(CODETEAM) %>% summarise_at(.vars = isN,.funs = sum)
}
)))
print(Sys.time() - begin)
gameStats <-lapply(gameStats, function(season) lapply(season, function(team) lapply(team, function(games) games %>% filter(CODETEAM!="FALSE"))))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team) lapply(team, function(games) games %>% mutate(isHome= isHome > 0))))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team) team %>% bind_rows(.id = "opp")))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team) team %>% mutate(game_count=1:n())))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team) team %>% mutate(pts=3*made3p +2*made2p +madeFT,
oppPts=3*made3pOp + 2*made2pOp + madeFTOp,
oEFF = 100*pts/possession,
dEFF = 100*oppPts/possessionOp,
dReb_pct = defReb/(defReb+offRebOp),
oReb_pct = offReb/(defRebOp+offReb),
oEFF_Op = 100*oppPts/possessionOp,
dEFF_Op =100*pts/possession,
pace = 40*possession/((duration/60)),
paceOp = 40*possessionOp/((duration/60)),
win = as.numeric(pts > oppPts)
)))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team)
team %>% group_by(CODETEAM,opp) %>% mutate(enc=1:n()) %>% ungroup()
))
gameStats <-lapply(gameStats, function(season) lapply(season, function(team)
team %>% select(-c(MINUTE,POINTS_A,POINTS_B,quarter,timeNum))
))
gs_win_home <- lapply(gameStats, function(season) lapply(season, function(team)
team %>% select(CODETEAM,enc,opp,win,isHome)
))
#cl <- makeCluster(detectCores())
#clusterEvalQ(cl, require(tidyverse))
#clusterEvalQ(cl, require(parallel))
begin <- Sys.time()
gameStats2 <- parLapply(cl,gameStats, function(season) lapply(season, function(team){
tmp <- lapply(team$game_count, function(x){
isN <- colnames(team)[which(sapply(team, function(x) is.numeric(x) | is.logical(x)))]
flt <- team %>% filter(game_count <= x)
col_no <- dim(flt)[2]
col_names <- colnames(flt)
if(dim(flt)[1] < 6){
if(dim(flt)[1] != 0){
flt <- flt %>% group_by(CODETEAM) %>% summarise_at(.vars = isN,.funs = mean)
}else{
}
}else{
flt <- flt[((dim(flt)[1])-3):(dim(flt)[1]-1),]
flt <- flt %>% group_by(CODETEAM) %>% summarise_at(.vars = isN,.funs = mean)
}
})
tmp <- tmp %>% bind_rows()
tmp$opp <- team$opp
tmp$enc <- team$enc
tmp$pts <- team$pts
tmp$oppPts <- team$oppPts
tmp$game_count <- team$game_count
return(tmp)
}))
print(Sys.time() - begin)
gameStats2 <- lapply(gameStats2, function(season) lapply(season, function(team){
team <- team %>% mutate(fg_2 = made2p/attempted2p,
fg_3 = made3p/attempted3p,
fg_ft = madeFT/attemptedFT,
fg_2_Op = made2pOp/attempted2pOp,
fg_3_Op = made3pOp/attempted3pOp,
fg_ft_Op = madeFTOp/attemptedFTOp,
dReb_pct = defReb/(defReb+offRebOp),
oReb_pct = offReb/(defRebOp+offReb))
team <- team %>% select(CODETEAM,opp,game_count,win,enc,pace,assist,turnover,fg_2,fg_3,fg_ft,defReb,dReb_pct,offReb,oReb_pct,foul,foulDrawn,block,rejected,steal,assistOp,turnoverOp,fg_2_Op,fg_3_Op,fg_ft_Op,defRebOp,offRebOp,stealOp,oEFF,dEFF,oEFF_Op,dEFF_Op,pts,oppPts)
return(team)
}))
gS_names <- lapply(gameStats2,names)
gameStats2 <- Map(function(season1,season2) Map(function(team1,team2) left_join(team1,team2,by=c("CODETEAM","opp","enc"),suffix = c("","_true") ),team1=season1,team2=season2),season1=gameStats2,season2=gs_win_home)
gameStats_others <- Map(function(name,df) lapply(name, function(y) lapply(df, function(x) ff(y,x))),name=gS_names,df=gameStats2)
gameStats_others <- Map(function(x,y) setNames(x,y),x=gameStats_others,y=gS_names)
gameStats_others <- lapply(gameStats_others, function(y) lapply(y, function(x)x[lengths(x) != 0]))
gameStats_others <- lapply(gameStats_others,function(x) lapply(x, bind_rows))
gameStats_merged <- Map(function(x,y) Map(function(a,b) left_join(a %>% mutate(across(where(is.character), str_trim)),b %>% mutate(across(where(is.character), str_trim)),by=c("CODETEAM"="opp","opp"="CODETEAM","enc"),suffix=c("","_other")),a=x,b=y),x=gameStats2,y=gameStats_others)
gameStats_merged <- lapply(gameStats_merged, bind_rows)
gameStats_merged <- gameStats_merged %>% bind_rows(.id = "season")
#### table ####
pre_table <- lapply(gameStats, function(x) x %>% bind_rows(.id = "CODETEAM"))
pre_table <- pre_table[["2021"]]
pre_table <- pre_table %>% filter(!(str_detect(CODETEAM,"DYR|UNK|CSK") | str_detect(opp,"DYR|UNK|CSK")))
pre_table <- pre_table %>% select(CODETEAM,assist:oppPts,win)
pre_table <- pre_table %>% group_by(CODETEAM) %>% mutate(game_count=1) %>% summarise_if(is.numeric,sum)
#### model cont. ####
stats_other_op <- colnames(gameStats_merged)[which(colnames(gameStats_merged) %>% str_detect("_other") & (colnames(gameStats_merged) %>% str_detect("Op")))]
stats_other <- colnames(gameStats_merged)[which(colnames(gameStats_merged) %>% str_detect("_other") & !(colnames(gameStats_merged) %>% str_detect("Op")))]
stats_op <- colnames(gameStats_merged)[which(!colnames(gameStats_merged) %>% str_detect("_other") & (colnames(gameStats_merged) %>% str_detect("Op")))]
stats_ <- colnames(gameStats_merged)[which(!colnames(gameStats_merged) %>% str_detect("_other") & !(colnames(gameStats_merged) %>% str_detect("Op")))]
stats_ <- stats_[-c((1:4),6,length(stats_),(length(stats_) -1))]
stats_other <- stats_other[-c(1,length(stats_other),(length(stats_other) -1))]
stats_idx <- sapply(stats_, function(x) sum(sapply(stats_other_op, function(y) y %>% str_detect(x) )))
stats_op_idx <- sapply(stats_other %>% str_remove("_other"), function(x) sum(sapply(stats_op, function(y) y %>% str_detect(x) )))
for(x in 1:length(stats_[stats_idx == 1])){
varName <- paste0(stats_[stats_idx==1][x] %>% str_remove_all("Op|_other"),"_s")
var1 <- stats_[stats_idx==1][x]
var2 <- stats_other_op[x]
gameStats_merged <- gameStats_merged %>% mutate(!!varName := gameStats_merged[[var1]] - gameStats_merged[[var2]])
}
for(x in 1:length(stats_other[stats_op_idx == 1])){
varName <- paste0(stats_other[stats_op_idx == 1][x] %>% str_remove_all("Op|_other"),"_a")
var1 <- stats_other[stats_op_idx == 1][x]
var2 <- stats_op[x]
gameStats_merged <- gameStats_merged %>% mutate(!!varName := gameStats_merged[[var1]] - gameStats_merged[[var2]])
}
gameStats_merged <- gameStats_merged %>% arrange(season,game_count)
gameStats_merged <- gameStats_merged %>% filter(game_count > 1)
g
gs_tt <- gameStats_merged[-((dim(gameStats_merged)[1]-50):(dim(gameStats_merged)[1])),]
gs_pred <- gameStats_merged[((dim(gameStats_merged)[1]-50):(dim(gameStats_merged)[1])),]
####xgboost####
#gs_tt <- gs_tt[,c(stats_,stats_op,stats_other,stats_other_op,"isHome","win_true")]
#gs_pred <- gs_pred[,c(stats_,stats_op,stats_other,stats_other_op,"isHome","win_true")]
pts_tt <- gs_tt$pts
pts_pred <- gs_pred$pts
gs_tt <- gs_tt %>% select(-c("oppPts","pts_other","oppPts_other","pts","win_true","win_other","season","CODETEAM","opp","game_count","enc","win_true_other"))
gs_pred <- gs_pred %>% select(-c("oppPts","pts_other","oppPts_other","pts","win_true","win_other","season","CODETEAM","opp","game_count","enc","win_true_other"))
index.test <- sample(nrow(gs_tt),floor(0.8*nrow(gs_tt)))
dataMatrix <- as.matrix(gs_tt[index.test,])
testMatrix <- as.matrix(gs_tt[-index.test,])
xgbTrain <- xgb.DMatrix(data = dataMatrix,label=pts_tt[index.test])
xgbTest <- xgb.DMatrix(data = testMatrix,label=pts_tt[-index.test])
xgb_model_full <- xgb.train(params = list(
booster="gbtree",
eta=0.001,
max_depth=30,
gamma=10,
subsample=0.828,
colsample_bytree=0.837,
objective="binary:logistic",
eval_metric="auc",
num_class=1
),data=xgbTrain,
nrounds=20000,
early_stopping_rounds=50,verbose = 1,watchlist = list(train=xgbTrain,test=xgbTest))
xgb.ggplot.importance(xgb.importance(model=xgb_model_full))
xgbOutofSample <- xgb.DMatrix(as.matrix(gs_pred))
outOfSampleF = data.frame(predictions=predict(xgb_model_full,xgbOutofSample,reshape=T))
outOfSampleF$team <- gs_pred$CODETEAM
outOfSampleF$opp <- gs_pred$opp
outOfSampleF$pts <- pts_pred
outOfSampleF$ptsOpp <- gs_pred$oppPts
View(outOfSampleF)