19
19
logger = law .logger .get_logger (__name__ )
20
20
21
21
22
def get_proc_mask(
    events: "ak.Array",
    proc: "str | od.Process",
    config_inst: "od.Config | None" = None,
) -> "tuple[np.ndarray, list]":
    """
    Create the mask selecting events that belong to the process *proc*, together with the list of
    all process ids that belong to this process and are present in *events*.

    :param events: Event array; its ``process_id`` field is read.
    :param proc: Either the name of a process or a process instance.
    :param config_inst: Config instance used to look up *proc*. Can be None if a Process instance
        is given.
    :raises ValueError: If no *config_inst* is given and *proc* is not a Process instance.
    :return: Tuple of the process mask and the corresponding process ids.
    """
    # resolve the process instance
    if config_inst:
        proc_inst = config_inst.get_process(proc)
    elif isinstance(proc, od.Process):
        proc_inst = proc
    else:
        # without a config there is no way to turn a name into a process instance
        raise ValueError(
            f"cannot resolve process '{proc}': pass a Process instance or provide a config_inst",
        )

    proc_id = events.process_id

    # collect the ids present in the event array in one vectorized pass
    # instead of iterating over every single event in python
    present_ids = set(np.unique(np.asarray(proc_id)).tolist())

    # ids of the process itself and of all processes reached by walking it,
    # restricted to those that actually occur in the event array
    sub_id = [
        sub_proc_inst.id
        for sub_proc_inst, _, _ in proc_inst.walk_processes(include_self=True)
        if sub_proc_inst.id in present_ids
    ]

    # create the process mask
    proc_mask = np.isin(proc_id, sub_id)
    return proc_mask, sub_id
54
+
55
+
22
56
def input_features_sanity_checks (ml_model_inst : MLModel , input_features : list [str ]):
23
57
"""
24
58
Perform sanity checks on the input features.
@@ -78,8 +112,10 @@ def __init__(self, ml_model_inst: MLModel, process: "str", events: ak.Array, sta
78
112
"""
79
113
self ._ml_model_inst = ml_model_inst
80
114
self ._process = process
115
+
116
+ proc_mask , _ = get_proc_mask (events , process , ml_model_inst .config_inst )
81
117
self ._stats = stats
82
- self ._events = events
118
+ self ._events = events [ proc_mask ]
83
119
84
120
def __repr__(self):
    """
    Return a short representation made of the class name, the ``cls_name`` of the ML model
    instance and the process.
    """
    cls_name = self.__class__.__name__
    model_name = self.ml_model_inst.cls_name
    return f"{cls_name}({model_name}, {self.process})"
@@ -185,21 +221,89 @@ def shuffle_indices(self) -> np.ndarray:
185
221
self ._shuffle_indices = np .random .permutation (self .n_events )
186
222
return self ._shuffle_indices
187
223
224
def get_xsec_train_weights(self) -> np.ndarray:
    """
    Weighting such that each event has roughly the same weight,
    sub processes are weighted according to their cross section.

    The event weights are divided by the summed absolute weights and multiplied by the summed
    number of events, both taken from the stats of all process ids that belong to the process
    and are present in the events. The result is cached in ``_xsec_train_weights``.

    :raises Exception: If no stats are available.
    :return: The rescaled event weights.
    """
    if hasattr(self, "_xsec_train_weights"):
        return self._xsec_train_weights

    if not self.stats:
        raise Exception("cannot determine train weights without stats")

    proc_stats = self.stats[self.process]
    _, sub_ids = get_proc_mask(self._events, self.process, self.ml_model_inst.config_inst)

    # the stats dictionaries are keyed by the process id as a string
    sum_abs_weights = np.sum([
        proc_stats["sum_abs_weights_per_process"][str(sub_id)] for sub_id in sub_ids
    ])
    num_events = np.sum([
        proc_stats["num_events_per_process"][str(sub_id)] for sub_id in sub_ids
    ])

    # store the result so that the cache check above can actually hit on later calls
    self._xsec_train_weights = self.weights / sum_abs_weights * num_events

    return self._xsec_train_weights
242
+
243
def get_equal_train_weights(self) -> np.ndarray:
    """
    Weighting such that the events of each sub process are weighted equally.

    For every sub process listed in the ``ml_config.sub_processes`` auxiliary entry of the
    process, the event weights are rescaled such that the sum of positive weights recorded in the
    stats matches the total number of events divided by the number of sub processes. Events that
    are not selected by any sub process mask keep a weight of 1. The result is cached in
    ``_equal_train_weights``.

    :raises Exception: If no stats are available.
    :return: The rescaled event weights.
    """
    if hasattr(self, "_equal_train_weights"):
        return self._equal_train_weights

    if not self.stats:
        raise Exception("cannot determine train weights without stats")

    config_inst = self.ml_model_inst.config_inst
    proc_stats = self.stats[self.process]
    num_events_per_process = proc_stats["num_events_per_process"]
    sum_pos_weights_per_process = proc_stats["sum_pos_weights_per_process"]

    combined_proc_inst = config_inst.get_process(self.process)
    sub_processes = combined_proc_inst.x.ml_config.sub_processes

    # total number of events over all ids of the combined process present in the events
    _, sub_ids_proc = get_proc_mask(self._events, self.process, config_inst)
    num_events = np.sum([num_events_per_process[str(sub_id)] for sub_id in sub_ids_proc])

    # every sub process should end up with the same share of the total
    targeted_sum_of_weights_per_process = num_events / len(sub_processes)

    equal_train_weights = ak.full_like(self.weights, 1.)

    for proc in sub_processes:
        proc_mask, sub_ids = get_proc_mask(self._events, proc, config_inst)

        # NOTE(review): membership is checked against the event counts while the value is read
        # from the positive weight sums; this assumes both dictionaries share their keys
        sum_pos_weights_per_sub_proc = sum(
            (
                sum_pos_weights_per_process[sub_id]
                for sub_id in map(str, sub_ids)
                if sub_id in num_events_per_process
            ),
            0.,
        )

        if sum_pos_weights_per_sub_proc == 0:
            norm_const_per_proc = 1.
            logger.warning(
                f"No weight sum found in stats for sub process {proc}. "
                "Normalization constant set to 1 but results are probably not correct.",
            )
        else:
            norm_const_per_proc = targeted_sum_of_weights_per_process / sum_pos_weights_per_sub_proc
            logger.info(f"Normalizing constant for {proc} is {norm_const_per_proc}")

        # only overwrite the entries of events selected by this sub process
        equal_train_weights = np.where(proc_mask, self.weights * norm_const_per_proc, equal_train_weights)

    # store the result so that the cache check above can actually hit on later calls
    self._equal_train_weights = equal_train_weights

    return self._equal_train_weights
285
+
188
286
@property
def train_weights(self) -> np.ndarray:
    """
    Training weights, chosen according to the parameters set in the ML model config.

    If the process has an ``ml_config`` auxiliary entry whose ``weighting`` is ``"equal"``, the
    weights come from ``get_equal_train_weights``, otherwise from ``get_xsec_train_weights``.
    The result is converted to a float32 numpy array and cached in ``_train_weights``.

    :raises Exception: If no stats are available.
    """
    if not hasattr(self, "_train_weights"):
        if not self.stats:
            raise Exception("cannot determine train weights without stats")

        process_inst = self.ml_model_inst.config_inst.get_process(self.process)
        use_equal_weighting = (
            process_inst.x("ml_config", None) and
            process_inst.x.ml_config.weighting == "equal"
        )
        weight_getter = self.get_equal_train_weights if use_equal_weighting else self.get_xsec_train_weights

        # always hand out plain float32 numpy arrays
        self._train_weights = ak.to_numpy(weight_getter()).astype(np.float32)

    return self._train_weights
204
308
205
309
@property
@@ -213,11 +317,26 @@ def equal_weights(self) -> np.ndarray:
213
317
if not self .stats :
214
318
raise Exception ("cannot determine val weights without stats" )
215
319
320
+ # TODO: per process pls [done] and now please tidy up
216
321
processes = self .ml_model_inst .processes
217
- sum_abs_weights = self .stats [self .process ]["sum_abs_weights" ]
218
- num_events_per_process = {proc : self .stats [proc ]["num_events" ] for proc in processes }
219
-
220
- self ._validation_weights = self .weights / sum_abs_weights * max (num_events_per_process .values ())
322
+ num_events_per_process = {}
323
+ for proc in processes :
324
+ id_list = list (self .stats [proc ]["num_events_per_process" ].keys ())
325
+ proc_inst = self .ml_model_inst .config_inst .get_process (proc )
326
+ sub_id = [
327
+ p_inst .id
328
+ for p_inst , _ , _ in proc_inst .walk_processes (include_self = True )
329
+ if str (p_inst .id ) in id_list
330
+ ]
331
+ if proc == self .process :
332
+ sum_abs_weights = np .sum ([
333
+ self .stats [self .process ]["sum_abs_weights_per_process" ][str (id )] for id in sub_id
334
+ ])
335
+ num_events_per_proc = np .sum ([self .stats [proc ]["num_events_per_process" ][str (id )] for id in sub_id ])
336
+ num_events_per_process [proc ] = num_events_per_proc
337
+
338
+ validation_weights = self .weights / sum_abs_weights * max (num_events_per_process .values ())
339
+ self ._validation_weights = ak .to_numpy (validation_weights ).astype (np .float32 )
221
340
222
341
return self ._validation_weights
223
342
@@ -544,6 +663,7 @@ def target(self) -> np.ndarray:
544
663
if self ._ml_model_inst .negative_weights == "handle" :
545
664
target [self .m_negative_weights ] = 1 - target [self .m_negative_weights ]
546
665
666
+ # NOTE: I think here the targets are somehow 64floats... Maybe check that
547
667
self ._target = target
548
668
return self ._target
549
669
0 commit comments