@@ -6,9 +6,13 @@
 
 from collections import defaultdict
 
+from functools import cached_property
+
 import law
 import luigi
 
+from scinum import Number
+
 
 from columnflow.tasks.framework.mixins import (
     ProducersMixin, MLModelsMixin,
@@ -29,7 +33,7 @@
 logger = law.logger.get_logger(__name__)
 
 
-def create_table_from_csv(csv_file_path):
+def create_table_from_csv(csv_file_path, transpose=False, with_header=True):
     import csv
     from tabulate import tabulate
 
@@ -38,12 +42,18 @@ def create_table_from_csv(csv_file_path):
         reader = csv.reader(file)
         data = list(reader)
 
+    # Transpose the data if requested
+    if transpose:
+        data = list(zip(*data))
+
     # Optionally, if you want to use the first row as headers
-    headers = data[0]  # First row as headers
-    table_data = data[1:]  # Rest as table data
+    headers = ()  # empty sequence means "no headers" for tabulate
+    if with_header:
+        headers = data[0]  # First row as headers
+        data = data[1:]  # Rest as table data
 
     # Generate the table using tabulate
-    table = tabulate(table_data, headers=headers, tablefmt="grid")
+    table = tabulate(data, headers=headers, tablefmt="grid")
 
     # Print the table
     print(table)
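For illustration, a minimal standalone sketch of what the new transpose/with_header options do, using made-up rows in place of the CSV contents (the real function reads them from csv_file_path):

from tabulate import tabulate

# stand-in for the rows read from the CSV file (invented values)
rows = [
    ["Dataset", "selection_eff", "expected_yield"],
    ["dataset_a", "0.1234", "5678"],
    ["dataset_b", "0.0456", "910"],
]

# with_header=True, transpose=False: the first row becomes the table header
print(tabulate(rows[1:], headers=rows[0], tablefmt="grid"))

# transpose=True (as used for the steps summary below): columns become rows,
# then the first row of the transposed data serves as the header
transposed = list(zip(*rows))
print(tabulate(transposed[1:], headers=transposed[0], tablefmt="grid"))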
@@ -84,18 +94,27 @@ def requires(self):
     def keys_repr(self):
         return "_".join(sorted(self.keys_of_interest))
 
+    @cached_property
+    def stats(self):
+        inp = self.input()
+        return {
+            dataset: inp[dataset]["collection"][0]["stats"].load(formatter="json")
+            for dataset in self.datasets
+        }
+
     def output(self):
         output = {
             "selection_summary_csv": self.target("selection_summary.csv"),
             "selection_summary_table": self.target("selection_summary.txt"),
+            "selection_steps_summary_csv": self.target("selection_steps_summary.csv"),
+            "selection_steps_summary_table": self.target("selection_steps_summary.txt"),
         }
         return output
 
     def write_selection_summary(self, outp):
         import csv
         outp.touch()
         lumi = self.config_inst.x.luminosity
-        inputs = self.input()
 
         empty_datasets = []
 
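As an aside, functools.cached_property evaluates the decorated method once per task instance and caches the result, so each dataset's stats JSON is loaded a single time and shared between write_selection_summary and the new write_selection_steps_summary. A minimal standalone sketch (class and values are hypothetical):

from functools import cached_property

class Example:
    @cached_property
    def stats(self):
        print("loading stats ...")  # executed only on first access
        return {"num_events": 100}

e = Example()
e.stats  # prints "loading stats ..."
e.stats  # served from the cache, no second load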
@@ -113,30 +132,35 @@ def write_selection_summary(self, outp):
 
             writer.writerow(["Dataset"] + [header_map.get(key, key) for key in keys_of_interest])
             for dataset in self.datasets:
-                stats = inputs[dataset]["collection"][0]["stats"].load(formatter="json")
+                dataset_inst = self.config_inst.get_dataset(dataset)
+                stats = self.stats[dataset]
                 # hists = inputs[dataset]["collection"][0]["hists"].load(formatter="pickle")
 
-                xsec = self.config_inst.get_dataset(dataset).processes.get_first().xsecs.get(
+                xsec = dataset_inst.processes.get_first().xsecs.get(
                     self.config_inst.campaign.ecm, None,
                 )
 
                 def safe_div(num, den):
                     return num / den if den != 0 else 0
 
-                missing_keys = {"sum_mc_weight", "sum_mc_weight_selected"} - set(stats.keys())
+                sumw_key = "sum_mc_weight" if dataset_inst.is_mc else "num_events"
+
+                missing_keys = {f"{sumw_key}", f"{sumw_key}_selected"} - set(stats.keys())
                 if missing_keys:
                     logger.warning(f"Missing keys in stats in dataset {dataset}: {missing_keys}")
                     continue
 
-                selection_eff = safe_div(stats["sum_mc_weight_selected"], stats["sum_mc_weight"])
-                if xsec is not None:
+                selection_eff = safe_div(stats[f"{sumw_key}_selected"], stats[f"{sumw_key}"])
+                if dataset_inst.is_data:
+                    expected_yield = Number(stats["num_events_selected"])
+                elif xsec is not None:
                     expected_yield = xsec * selection_eff * lumi
 
                 if stats["num_events_selected"] == 0:
                     empty_datasets.append(dataset)
 
                 selection_summary = {
-                    "xsec": xsec.nominal,
+                    "xsec": xsec.nominal if xsec else -1,
                     "empty": True if stats["num_events_selected"] == 0 else False,
                     "selection_eff": round_sig(selection_eff, 4),
                     "expected_yield": round_sig(expected_yield.nominal, 4),
@@ -154,13 +178,40 @@ def safe_div(num, den):
 
         self.publish_message(f"Empty datasets: {empty_datasets}")
 
+    def write_selection_steps_summary(self, outp):
+        import csv
+        outp.touch()
+
+        with open(outp.path, "w") as f:
+            writer = csv.writer(f)
+
+            steps = [
+                k.replace("num_events_step_", "") for k in self.stats[self.datasets[0]].keys()
+                if "num_events_step_" in k
+            ]
+
+            writer.writerow(["Datasets"] + steps)
+
+            for dataset in self.datasets:
+                dataset_inst = self.config_inst.get_dataset(dataset)
+                stats = self.stats[dataset]
+
+                sumw_key = "num_events" if dataset_inst.is_data else "sum_mc_weight"
+
+                row = [dataset] + [stats.get(f"{sumw_key}_step_{step}", 0) / stats.get(sumw_key, 1.) for step in steps]
+                writer.writerow(row)
+
     def run(self):
         output = self.output()
         self.write_selection_summary(output["selection_summary_csv"])
 
         table = create_table_from_csv(output["selection_summary_csv"].path)
         output["selection_summary_table"].dump(table, formatter="text")
 
+        self.write_selection_steps_summary(output["selection_steps_summary_csv"])
+        table = create_table_from_csv(output["selection_steps_summary_csv"].path, transpose=True)
+        output["selection_steps_summary_table"].dump(table, formatter="text")
+
 
 class DumpAnalysisSummary(
     HBWTask,
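To make the content of the new steps summary concrete, here is the per-step fraction computation from write_selection_steps_summary applied to a hypothetical stats payload (the key pattern follows the num_events_step_* / sum_mc_weight_step_* convention used above; all numbers are invented):

# hypothetical stats payload for one MC dataset (invented numbers)
stats = {
    "sum_mc_weight": 1000.0,
    "sum_mc_weight_step_trigger": 800.0,
    "sum_mc_weight_step_lepton": 450.0,
    "num_events_step_trigger": 820,
    "num_events_step_lepton": 470,
}

# step names are derived from the "num_events_step_*" keys, as in the task above
steps = [k.replace("num_events_step_", "") for k in stats if "num_events_step_" in k]

sumw_key = "sum_mc_weight"  # MC dataset; data would use "num_events"
row = [stats.get(f"{sumw_key}_step_{step}", 0) / stats.get(sumw_key, 1.) for step in steps]
print(dict(zip(steps, row)))  # {'trigger': 0.8, 'lepton': 0.45}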