Skip to content

Commit e8f45fb

Browse files
committed
WIP
1 parent bb754e2 commit e8f45fb

File tree

2 files changed

+63
-12
lines changed

2 files changed

+63
-12
lines changed

hbw/config/datasets.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def hbw_dataset_names(config: od.Config, as_list: bool = False) -> DotDict[str:
8888
# NLO samples
8989
"dy_m50toinf_amcatnlo",
9090
"dy_m10to50_amcatnlo",
91-
"dy_m4to10_amcatnlo",
91+
# "dy_m4to10_amcatnlo", # broken files in 2023pre
9292
"dy_m50toinf_0j_amcatnlo",
9393
"dy_m50toinf_1j_amcatnlo",
9494
"dy_m50toinf_2j_amcatnlo",

hbw/tasks/inspection.py

+62-11
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@
66

77
from collections import defaultdict
88

9+
from functools import cached_property
10+
911
import law
1012
import luigi
1113

14+
from scinum import Number
15+
1216

1317
from columnflow.tasks.framework.mixins import (
1418
ProducersMixin, MLModelsMixin,
@@ -29,7 +33,7 @@
2933
logger = law.logger.get_logger(__name__)
3034

3135

32-
def create_table_from_csv(csv_file_path):
36+
def create_table_from_csv(csv_file_path, transpose=False, with_header=True):
3337
import csv
3438
from tabulate import tabulate
3539

@@ -38,12 +42,18 @@ def create_table_from_csv(csv_file_path):
3842
reader = csv.reader(file)
3943
data = list(reader)
4044

45+
# Transpose the data if requested
46+
if transpose:
47+
data = list(zip(*data))
48+
4149
# Optionally, if you want to use the first row as headers
42-
headers = data[0] # First row as headers
43-
table_data = data[1:] # Rest as table data
50+
headers = None
51+
if with_header:
52+
headers = data[0] # First row as headers
53+
data = data[1:] # Rest as table data
4454

4555
# Generate the table using tabulate
46-
table = tabulate(table_data, headers=headers, tablefmt="grid")
56+
table = tabulate(data, headers=headers, tablefmt="grid")
4757

4858
# Print the table
4959
print(table)
@@ -84,18 +94,27 @@ def requires(self):
8494
def keys_repr(self):
8595
return "_".join(sorted(self.keys_of_interest))
8696

97+
@cached_property
def stats(self):
    """
    Selection stats of all requested datasets, keyed by dataset name.

    For each entry of ``self.datasets`` the "stats" target of the first element of the
    input "collection" is loaded with the json formatter. Being a ``cached_property``,
    the loading happens on first access only and the resulting dict is reused afterwards.
    """
    inputs = self.input()

    loaded = {}
    for name in self.datasets:
        stats_target = inputs[name]["collection"][0]["stats"]
        loaded[name] = stats_target.load(formatter="json")
    return loaded
104+
87105
def output(self):
    """
    Declare the output targets of this task.

    Returns a dict with a csv and a txt target each for the selection summary and for
    the selection steps summary.
    """
    return {
        # per-dataset selection summary
        "selection_summary_csv": self.target("selection_summary.csv"),
        "selection_summary_table": self.target("selection_summary.txt"),
        # per-dataset, per-step summary
        "selection_steps_summary_csv": self.target("selection_steps_summary.csv"),
        "selection_steps_summary_table": self.target("selection_steps_summary.txt"),
    }
93113

94114
def write_selection_summary(self, outp):
95115
import csv
96116
outp.touch()
97117
lumi = self.config_inst.x.luminosity
98-
inputs = self.input()
99118

100119
empty_datasets = []
101120

@@ -113,30 +132,35 @@ def write_selection_summary(self, outp):
113132

114133
writer.writerow(["Dataset"] + [header_map.get(key, key) for key in keys_of_interest])
115134
for dataset in self.datasets:
116-
stats = inputs[dataset]["collection"][0]["stats"].load(formatter="json")
135+
dataset_inst = self.config_inst.get_dataset(dataset)
136+
stats = self.stats[dataset]
117137
# hists = inputs[dataset]["collection"][0]["hists"].load(formatter="pickle")
118138

119-
xsec = self.config_inst.get_dataset(dataset).processes.get_first().xsecs.get(
139+
xsec = dataset_inst.processes.get_first().xsecs.get(
120140
self.config_inst.campaign.ecm, None,
121141
)
122142

123143
def safe_div(num, den):
124144
return num / den if den != 0 else 0
125145

126-
missing_keys = {"sum_mc_weight", "sum_mc_weight_selected"} - set(stats.keys())
146+
sumw_key = "sum_mc_weight" if dataset_inst.is_mc else "num_events"
147+
148+
missing_keys = {f"{sumw_key}", f"{sumw_key}_selected"} - set(stats.keys())
127149
if missing_keys:
128150
logger.warning(f"Missing keys in stats in dataset {dataset}: {missing_keys}")
129151
continue
130152

131-
selection_eff = safe_div(stats["sum_mc_weight_selected"], stats["sum_mc_weight"])
132-
if xsec is not None:
153+
selection_eff = safe_div(stats[f"{sumw_key}_selected"], stats[f"{sumw_key}"])
154+
if dataset_inst.is_data:
155+
expected_yield = Number(stats["num_events_selected"])
156+
elif xsec is not None:
133157
expected_yield = xsec * selection_eff * lumi
134158

135159
if stats["num_events_selected"] == 0:
136160
empty_datasets.append(dataset)
137161

138162
selection_summary = {
139-
"xsec": xsec.nominal,
163+
"xsec": xsec.nominal if xsec else -1,
140164
"empty": True if stats["num_events_selected"] == 0 else False,
141165
"selection_eff": round_sig(selection_eff, 4),
142166
"expected_yield": round_sig(expected_yield.nominal, 4),
@@ -154,13 +178,40 @@ def safe_div(num, den):
154178

155179
self.publish_message(f"Empty datasets: {empty_datasets}")
156180

181+
def write_selection_steps_summary(self, outp):
    """
    Write a csv file with the fraction of the yield remaining after each selection step.

    The first row is ``"Datasets"`` followed by the step names. Each further row holds a
    dataset name followed by, per step, the step entry of its stats divided by the total
    entry. Data datasets use the ``num_events*`` entries, all other datasets the
    ``sum_mc_weight*`` entries. A missing step entry counts as 0, a missing total as 1,
    and a total of 0 yields a fraction of 0 instead of raising.

    :param outp: Output target; only its ``touch()`` method and ``path`` attribute are used.
    """
    import csv

    step_prefix = "num_events_step_"

    def safe_div(num, den):
        # datasets without any events have a total of 0; report 0 rather than crashing
        return num / den if den != 0 else 0

    outp.touch()

    # newline="" is what the csv module expects for files handed to csv.writer
    with open(outp.path, "w", newline="") as f:
        writer = csv.writer(f)

        # step names are taken from the stats of the first dataset only;
        # without any dataset, only the header row is written
        steps = []
        if self.datasets:
            steps = [
                key.replace(step_prefix, "")
                for key in self.stats[self.datasets[0]]
                if step_prefix in key
            ]

        writer.writerow(["Datasets"] + steps)

        for dataset in self.datasets:
            dataset_inst = self.config_inst.get_dataset(dataset)
            stats = self.stats[dataset]

            sumw_key = "num_events" if dataset_inst.is_data else "sum_mc_weight"
            total = stats.get(sumw_key, 1.)

            row = [dataset] + [
                safe_div(stats.get(f"{sumw_key}_step_{step}", 0), total)
                for step in steps
            ]
            writer.writerow(row)
203+
157204
def run(self):
    """
    Produce both summaries: write each csv file, then dump the value returned by
    ``create_table_from_csv`` for that file into the matching text target.
    """
    targets = self.output()

    # selection summary: csv first, then the table built from it
    summary_csv = targets["selection_summary_csv"]
    self.write_selection_summary(summary_csv)
    summary_table = create_table_from_csv(summary_csv.path)
    targets["selection_summary_table"].dump(summary_table, formatter="text")

    # selection steps summary: same procedure, but the table is built transposed
    steps_csv = targets["selection_steps_summary_csv"]
    self.write_selection_steps_summary(steps_csv)
    steps_table = create_table_from_csv(steps_csv.path, transpose=True)
    targets["selection_steps_summary_table"].dump(steps_table, formatter="text")
214+
164215

165216
class DumpAnalysisSummary(
166217
HBWTask,

0 commit comments

Comments
 (0)