# %%
from emutils.imports import *
from emutils.utils import (
    attrdict,
    pandas_max,
    load_pickle,
    save_pickle,
)
from cfshap.utils.preprocessing import MultiScaler
from utils import *
# Warnings configuration
import warnings
# warnings.filterwarnings(action="error", category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
# Suppress scientific notation in numpy outputs (optional)
# np.set_printoptions(suppress=True)
# Turn floating-point warnings into errors
np.seterr(all='raise')
# Widen the pandas display limits (rows, columns)
pandas_max(100, 200)
# %%
from constants import DATA_DIR, MODEL_DIR, RESULTS_DIR
parser = ArgumentParser()
# General
parser.add_argument('--dataset', type=str, default='heloc', choices=['heloc', 'lendingclub', 'wines'], required=False)
parser.add_argument('--data_version', type=str, default="v2")
parser.add_argument('--data_path', type=str, default=DATA_DIR, required=False)
parser.add_argument('--random_state', type=int, default=2021, required=False)
# Model
parser.add_argument('--model_path', type=str, default=MODEL_DIR, required=False)
parser.add_argument('--model_type', type=str, default='xgb')
parser.add_argument('--model_version', type=str, default='v5')
# Results
parser.add_argument('--results_path', type=str, default=RESULTS_DIR, required=False)
# Experiments
parser.add_argument('--monotonic', action='store_true', default=False)
parser.add_argument('--methods', nargs='+', required=False, default=None)
args, unknown = parser.parse_known_args()
args = attrdict(vars(args))
os.makedirs(args.results_path, exist_ok=True)
print(args)
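# Example invocation (paths default to DATA_DIR / MODEL_DIR / RESULTS_DIR from constants):
#   python explanations_runtime.py --dataset heloc --model_type xgb --model_version v5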
# %%
# FILE NAMES
# -> Data
DATA_RUN_NAME = f"{args.dataset}_D{args.data_version}"
FEATURES_FILENAME = f"{args.data_path}/{DATA_RUN_NAME}_features.pkl"
CLASSES_FILENAME = f"{args.data_path}/{DATA_RUN_NAME}_classes.pkl"
TRENDS_FILENAME = f"{args.data_path}/{DATA_RUN_NAME}_trends.pkl"
# -> Model
MODEL_RUN_NAME = f"{DATA_RUN_NAME}M{args.model_version}_{args.model_type}"
MODELWRAPPER_FILENAME = f"{args.model_path}/{MODEL_RUN_NAME}_model.pkl"
def profiling_filename(
    args,
    dataset=None,
    data_version=None,
    model_version=None,
    model_type=None,
    ext='pkl',
):
    """Build the profiling results file name, falling back to the parsed args for any unspecified component."""
    if dataset is None:
        dataset = args.dataset
    if data_version is None:
        data_version = args.data_version
    if model_version is None:
        model_version = args.model_version
    if model_type is None:
        model_type = args.model_type
    run_name = f"{dataset}_D{data_version}M{model_version}_{model_type}"
    return f"{args.results_path}/{run_name}_profiling.{ext}"
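# e.g., with the defaults above: profiling_filename(args)
# -> f"{RESULTS_DIR}/heloc_Dv2Mv5_xgb_profiling.pkl"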
# %% [markdown]
# # Load data and model
# Let's load all the data and the trained model.
# We also load the manifold information here.
# %%
X, _, X_train, _, y_train, _ = load_data(args)
feature_names = load_pickle(FEATURES_FILENAME)
class_names = load_pickle(CLASSES_FILENAME)
feature_trends_ = load_pickle(TRENDS_FILENAME)
feature_trends = feature_trends_ if args.monotonic else None
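# Multi-scaler fitted on the training data (passed to the explainers for feature normalization)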
multiscaler = MultiScaler(X_train)
model = load_pickle(MODELWRAPPER_FILENAME)
# Explain the instances predicted as Bads (class 1)
preds = model.predict(X.values)
X_bad = X[preds == 1]
X_explain = X_bad.values
print(f"Number of Bads: {X_bad.shape[0]}")
# %%
from emutils.parallel.utils import max_cpu_count
# Cap the number of XGBoost prediction threads (at most 15, leaving one core free)
model.get_booster().set_param({'nthread': min(15, max_cpu_count() - 1)})
# %% [markdown]
# # Explainers
# Let's set up all the explainers that we want to use.
# %%
from explainers import create_explainers
# Explainers
EXPLAINERS = create_explainers(
    X=X_train,
    y=y_train,
    model=model,
    ref_points=[],
    multiscaler=multiscaler,
    feature_names=feature_names,
    feature_trends=feature_trends,  # None unless --monotonic is passed
    random_state=args.random_state,
)
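# All explainers share the interface used below: explainer.shap_values(X),
# returning one attribution vector per row of X.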
# %%
from emutils.profiling.time import estimate_parallel_function_linear_plateau
from emutils.profiling.time import estimate_iterative_function_runtime
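# Estimate the model's effective prediction parallelism: the batch size at which
# the runtime of inplace_predict stops growing linearly (i.e., plateaus).
# The median over 10 runs is used below to tune the batching of the 'cone' explainers.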
model_parallelism = np.median([
    estimate_parallel_function_linear_plateau(model.booster.inplace_predict, X_train.values, start=100, precision=1e-3)
    for _ in range(10)
])
model_parallelism
# %% [markdown]
# # Test (Speed) Performance
# %%
N = 10  # Runtime measurements per method
T = 1e-1  # Precision target for each runtime estimate
start = time.time()
results = []
for m, (method, explainer) in enumerate(EXPLAINERS.items()):
    # Skip methods not in the (optional) list of methods to test
    if args.methods is not None and method not in args.methods:
        continue

    def __go():
        # Time a single explanation call at precision T
        return estimate_iterative_function_runtime(
            lambda X: explainer.shap_values(X),
            X_explain,
            n=1,
            precision=T,
            concatenate=True,
        )

    ts = []
    if 'cone' in method:
        # We optimize the speed of cone (as much as possible): sweep the
        # model_parallelism setting around the estimate and keep the fastest batch
        for mp in np.ceil(np.linspace(model_parallelism * .5, model_parallelism * 2, 20)).astype(int):
            tst = []
            for i in tqdm(range(N), desc=f"{method} ({m+1}/{len(EXPLAINERS)})"):
                EXPLAINERS[method].counterfactual_method.wrapping_instance.kwargs['model_parallelism'] = mp
                tst.append(__go())
            if not ts or np.min(tst) < np.min(ts):
                ts = tst
    else:
        for i in tqdm(range(N), desc=f"{method} ({m+1}/{len(EXPLAINERS)})"):
            ts.append(__go())

    results.append(
        dict(
            Method=method,
            Runtimes=ts,
            Mean=np.mean(ts),
            Minimum=np.min(ts),
            Maximum=np.max(ts),
            Variance=np.var(ts, ddof=1),
            Dataset=dataset_to_name(args.dataset),
            Model=model_version_to_name(args.model_version),
        ))
print(f'Experiment took {time.time() - start:.1f} seconds')
_ = save_pickle(results, profiling_filename(args))
# %% [markdown]
# ### Results
# Let's check the results.
# %%
pd.DataFrame(results)
# %%
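# A quick summary sorted by mean runtime (column names as constructed above)
pd.DataFrame(results)[['Method', 'Mean', 'Minimum', 'Maximum', 'Variance']].sort_values('Mean')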