-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsimple_train.py
140 lines (112 loc) · 5.76 KB
/
simple_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from functools import partial
import math
import numpy as np
import pandas as pd
import optuna
import lightgbm
from hpoflow import SignificanceRepeatedTrainingPruner
from sklearn.model_selection import StratifiedKFold
def fit(trial, train_x, train_y, val_x, val_y):
# Optional: add info about categorical_feature (in Dataset or train params)
# Optional: should "fnlwgt" be a categorical_feature?
# example: categorical_feature=["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
train_data = lightgbm.Dataset(train_x, label=train_y)
test_data = lightgbm.Dataset(val_x, label=val_y)
# will be filled by LightGBM
evals_result = {}
params = {
"objective":"binary",
"metric": "auc", # we might want to use F1 score here - mltb can be used to do that - see https://github.com/PhilipMay/mltb#module-lightgbm
"verbose":-1,
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5), # default 0.1 constraint > 0.0
"min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 30), # default 20 constraint >= 0
"max_bin": trial.suggest_int("max_bin", 100, 2000), # default 255 constraint > 1
"bagging_fraction": trial.suggest_float("bagging_fraction", 0.01, 1.0), # default 1.0 constraint 0.0 < bagging_fraction <= 1.0
"min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 1.0), # default 0.0 constraint >= 0.0
# this is optional
# "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 80.0),
# "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 80.0),
# "extra_trees": trial.suggest_categorical("extra_trees", [True, False])
}
# there is a special dependency between max_depth and num_leaves
# see https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html#tune-parameters-for-the-leaf-wise-best-first-tree
max_depth = trial.suggest_int("max_depth", 1, 9) # default -1
params["max_depth"] = max_depth
num_leaves = trial.suggest_int("num_leaves", 2, 2**max_depth) # default 31 constraint 1 < num_leaves <= 131072
params["num_leaves"] = num_leaves
# this is optional
# if trial.suggest_categorical("do_bagging", [True, False]):
# params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 10)
# params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.1, 1.0)
# fir the model
bst = lightgbm.train(
params,
train_data,
num_boost_round=trial.suggest_int("num_boost_round", 10, 200), # default 100 constraint >= 0 (although 0 of course makes no sense)
valid_sets=[test_data],
callbacks=[lightgbm.record_evaluation(evals_result)],
)
# get the result result of the last boosing round
result = evals_result["valid_0"]["auc"][-1]
# make sure nothing crashes
if result < 0 or math.isnan(result):
result = 0.0
return result
def objective(trial, data):
# list of results of CV
results = []
# 10 fold CV
# to ensure that the results of the different HP evaluations are comparable, we use a seed here
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
for fold_id, (train_index, val_index) in enumerate(skf.split(data, data["label"])): # make sure to have stratified splits
# create train data from train_index
train_x = data.iloc[train_index].copy()
train_x.drop("label", axis=1, inplace=True)
train_y = data.iloc[train_index]["label"].to_list()
assert len(train_x) == len(train_y)
# create train data from val_index
val_x = data.iloc[val_index].copy()
val_x.drop("label", axis=1, inplace=True)
val_y = data.iloc[val_index]["label"].to_list()
assert len(val_x) == len(val_y)
# fit the model
result = fit(trial, train_x, train_y, val_x, val_y)
# collect results of the CV
results.append(result)
# set the results to optuna as user attribute for tracking
trial.set_user_attr("results", str(results))
# optional: this uses SignificanceRepeatedTrainingPruner to check if we can prune the cross validation
# see https://telekom.github.io/HPOflow/doc/SignificanceRepeatedTrainingPruner.html#significancerepeatedtrainingpruner-doc
trial.report(result, fold_id)
if trial.should_prune():
break
# return CV result
return np.mean(results)
if __name__ == "__main__":
# load data
df = df = pd.read_csv("data/census_income_train.csv", low_memory=False)
# create optuna study
study = optuna.create_study(
sampler=optuna.samplers.TPESampler(multivariate=True),
study_name="lightgbm_simple_01",
# we use a locale DB here (a file)
# if you want to do HP search on multiple machines use an external SQL db
storage="sqlite:///optuna.db",
# storage=f"postgresql://optuna:{OPTUNA_PASS}@{OPTUNA_HOSTNAME}/optuna",
load_if_exists=True,
direction="maximize",
# optional: this can speed up HP search if fitting needs much time
pruner=SignificanceRepeatedTrainingPruner(
alpha=0.4,
n_warmup_steps=3,
),
)
# optuna expects a function with only one parameter (trial)
# this is a trick to pass the data anyway
objective_partial = partial(objective, data=df)
# we do not specify n_trials here
# this means that the optimization is executed until it is canceled
# that can be done by pressing ctrl + c
# the optimization can just restarted
# previous results are then simply loaded at the beginning from the DB
study.optimize(objective_partial)