#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings("ignore")
import optuna
import joblib
import time
import datetime as dt
import os, sys
from copy import deepcopy
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score, accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
import river
from river import compose
from river import tree
from river import ensemble
from river import forest
from river import neural_net as r_nn
from river import metrics
from river import evaluate
from river.utils import Rolling
from river import stream
from river import preprocessing
from river import optim as r_optim
from river import datasets
from Data_Processing import load_benchmark, generate_X_y, get_data_split
from SRAModels import reset_seed_


def optimize_static_params_HAT(dataset_csv, features, target, init_fit_ratio=0.2,
                               n_trial=30, time_out=6 * 3600, seed=42, name='HAT_'):
    """Tune HAT hyperparameters offline with Optuna on the initial data split."""
    def objective_HAT(trial):
        params = {
            'max_depth': trial.suggest_int("max_depth", 2, 6, step=1),
            'seed': trial.suggest_int("seed", 42, 43, step=1)
        }
        model = tree.HoeffdingAdaptiveTreeClassifier(max_depth=params["max_depth"], seed=params["seed"])
        metric = metrics.ROCAUC()
        X_train_init, y_train_init, data, stream_features = get_data_split(
            data=dataset_csv, init_fit_ratio=init_fit_ratio, features=features, target=target)
        # X_train_init, X_val_init, y_train_init, y_val_init = train_test_split(X_train_init, y_train_init, test_size=0.3, random_state=42)
        stream_train_init = stream.iter_pandas(X_train_init, y_train_init)
        _ = evaluate.progressive_val_score(
            model=model,
            dataset=stream_train_init,
            metric=metric,  # Rolling(metrics.ROCAUC(), window_size=10_000)
            print_every=0
        )
        return metric.get()

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective_HAT, timeout=time_out, n_trials=n_trial)
    best_p = study.best_trial.params
    joblib.dump(best_p, name + 'best_params')
    return best_p
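
# Example usage (a minimal sketch; the DataFrame, feature names and target
# column below are hypothetical, not part of this repository):
#
#   df = pd.read_csv('my_stream.csv')
#   best = optimize_static_params_HAT(
#       dataset_csv=df, features=['f1', 'f2', 'f3'], target='label',
#       init_fit_ratio=0.2, n_trial=30, name='HAT_my_stream_')
#   # `best` is a dict such as {'max_depth': 4, 'seed': 42}; it is also
#   # persisted to 'HAT_my_stream_best_params' via joblib.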


def evaluate_batch_HAT(params, model, dataset, n_rows, n_delay, n_windows, features, target,
                       metric, seed=42, moment=0, delay='delay', batch_index='batch_index'):
    """Run a delayed-label, batch-wise progressive evaluation of the model."""
    stream_features = features + ['delay', 'batch_index']
    X, y_target = generate_X_y(dataset=dataset, n_rows=n_rows, n_delay=n_delay, n_windows=n_windows,
                               features=stream_features, target=target, seed=seed)
    dataset_stream = stream.iter_pandas(X, y_target)  # stream the data
    batch_ids, batch_size = np.unique(X.batch_index, return_counts=True)  # batch ids and their sizes
    windows_count = np.zeros(len(batch_size), dtype=int)  # number of labels obtained so far per batch
    Y_pred_buffer = [{} for _ in range(len(batch_size))]  # buffer of predictions per batch
    Y_true_buffer = [{} for _ in range(len(batch_size))]  # buffer of labels per batch
    X_buffer = {}  # buffer of observations; an entry is removed once its label has been used for learning
    result_batch = []
    result_batch_accuracy = []
    Len_Batch = []
    for i, x, y in stream.simulate_qa(dataset_stream, moment=moment, delay=delay):
        index_window = int(x.pop('batch_index'))
        x.pop('delay')
        if y is None:
            # question step: predict and store the prediction in its window's buffer
            # x_to_pred = scaler.transform_one(x)
            y_pred = model.predict_proba_one(x)
            Y_pred_buffer[index_window][i] = y_pred.get(1, 0.0)  # positive-class probability; 0.0 if class 1 is unseen so far
            X_buffer[i] = x
            continue
        # answer step: the label of instance i has arrived
        windows_count[index_window] += 1  # one more label obtained for window index_window
        Y_true_buffer[index_window][i] = y
        model.learn_one(X_buffer[i], y)  # use the label to update the model
        del X_buffer[i]  # drop the instance once it has been used for learning
        # once the label buffer of window index_window is full, score the whole batch
        if windows_count[index_window] == batch_size[index_window]:
            y_true_window = np.array(list(Y_true_buffer[index_window].values()))  # ground truth
            y_true_index_window = list(Y_true_buffer[index_window].keys())  # instance ids of the true labels
            Y_pred_buffer_window = Y_pred_buffer[index_window]  # all predictions of the current window
            y_pred_window = np.array([Y_pred_buffer_window.get(j) for j in y_true_index_window])
            result = metric[0](y_true_window, y_pred_window)
            result_batch.append(result)
            # threshold the positive-class probabilities at 0.5 for the accuracy-style metric
            result_batch_accuracy.append(metric[1](y_true_window, (y_pred_window > 0.5).astype(int)))
            Len_Batch.append(len(y_pred_window))
            print(f"window={index_window}---Score={result}")
    return Len_Batch, result_batch, result_batch_accuracy
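
# `metric` is expected to be a pair of batch scorers: metric[0] takes
# (y_true, y_score) and metric[1] takes (y_true, y_pred_binary). A minimal
# sketch, assuming the sklearn scorers imported at the top of this file are
# the intended choice:
#
#   metric = (roc_auc_score, accuracy_score)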


def run_experiment_batch_HAT(
        model_name,
        dataset,
        dataset_name,
        n_delays,
        features,
        target,
        metric,
        n_windows=10000,
        static_optim_ntrial=25,
        seed=42,
        moment=0,
        init_fit_ratio=0.3,
        delay='delay',
        batch_index='batch_index'
):
    """Tune, warm-start and evaluate HAT under each label delay in n_delays."""
    # split the data for offline tuning and stream evaluation
    X_train_init, y_train_init, data, stream_features = get_data_split(
        data=dataset, init_fit_ratio=init_fit_ratio, features=features, target=target)
    # offline optimization (reuse cached best parameters when available)
    # X_train_init, X_val_init, y_train_init, y_val_init = train_test_split(X_train_init, y_train_init, test_size=0.3, random_state=42)
    stream_train_init = stream.iter_pandas(X_train_init, y_train_init)
    save_config_dir = model_name + '_' + dataset_name
    if os.path.exists(save_config_dir + 'best_params'):
        best_params = joblib.load(save_config_dir + 'best_params')
    else:
        best_params = optimize_static_params_HAT(
            dataset_csv=dataset, features=features, target=target, init_fit_ratio=init_fit_ratio,
            n_trial=static_optim_ntrial, time_out=6 * 3600, seed=seed, name=save_config_dir)
    model = tree.HoeffdingAdaptiveTreeClassifier(max_depth=best_params["max_depth"], seed=best_params["seed"])
    # warm-start the model on the initial split before streaming
    _ = evaluate.progressive_val_score(
        model=model,
        dataset=stream_train_init,
        metric=metrics.ROCAUC(),
        print_every=0
    )
    # one independent copy of the warm-started model per delay setting
    model_copies = [deepcopy(model) for _ in range(len(n_delays))]
    all_results = []
    all_results_accuracy = []
    run_times = []
    df_all_results = pd.DataFrame()
    df_all_results_accuracy = pd.DataFrame()
    df_all_results_runtimes = pd.DataFrame()
    df_all_results_runtimes['delays'] = n_delays
    for m, number_delay in enumerate(n_delays):
        start_time = time.time()
        Len_Batch, result_batch, result_batch_accuracy = evaluate_batch_HAT(
            params=best_params,
            model=model_copies[m],
            dataset=data,
            n_rows=len(data),
            n_delay=number_delay,
            n_windows=n_windows,
            features=features,
            target=target,
            metric=metric
        )
        if m == 0:
            df_all_results["Batch_len"] = Len_Batch
            df_all_results_accuracy["Batch_len"] = Len_Batch
        all_results.append(result_batch)
        all_results_accuracy.append(result_batch_accuracy)
        col_name = model_name + '_delay_' + str(number_delay)
        df_all_results[col_name] = result_batch
        df_all_results_accuracy[col_name] = result_batch_accuracy
        # checkpoint the result tables after every delay setting
        path = dataset_name + model_name + ".csv"
        df_all_results.to_csv('aucroc_' + path, index=False)
        df_all_results_accuracy.to_csv('accuracy_' + path, index=False)
        end_time = time.time()
        run_times.append(end_time - start_time)
    df_all_results_runtimes['runtime'] = run_times
    df_all_results_runtimes.to_csv('runtime_' + path, index=False)
    return run_times, all_results, all_results_accuracy
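

if __name__ == "__main__":
    # Minimal driver sketch. The synthetic columns below are hypothetical, and
    # it is assumed that the Data_Processing helpers derive the 'delay' and
    # 'batch_index' stream columns from n_delay/n_windows; adapt as needed.
    rng = np.random.default_rng(42)
    n = 5_000
    demo = pd.DataFrame({'f1': rng.normal(size=n), 'f2': rng.normal(size=n)})
    demo['label'] = (demo['f1'] + rng.normal(scale=0.5, size=n) > 0).astype(int)
    run_experiment_batch_HAT(
        model_name='HAT',
        dataset=demo,
        dataset_name='demo',
        n_delays=[100, 1_000],
        features=['f1', 'f2'],
        target='label',
        metric=(roc_auc_score, accuracy_score),
        n_windows=10,
        static_optim_ntrial=5,
    )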