implement PlotPostfitShapes task

mafrahm · mafrahm · commit 6933274d8b96 · 2024-01-31T13:22:43.000+01:00
diff --git a/hbw/tasks/inference.py b/hbw/tasks/inference.py
@@ -273,7 +273,6 @@ def requires(self):
         reqs = {
             "datacards": self.reqs.CreateDatacards.req(self),
         }
-
         return reqs
 
     def output(self):
@@ -360,3 +359,84 @@ def run(self):
                 problematic_bin_count = check_empty_bins(h_rebin)  # noqa
                 logger.info(f"Inserting histogram with name {key}")
                 out_file[key] = uproot.from_pyroot(h_rebin)
+
+
+class PrepareInferenceTaskCalls(
+    HBWTask,
+    InferenceModelMixin,
+    MLModelsMixin,
+    ProducersMixin,
+    SelectorStepsMixin,
+    CalibratorsMixin,
+):
+    """
+    Task that assembles the ``law run`` command lines of follow-up inference tasks for the
+    rebinned datacards, prints them and stores each one in its own text file.
+
+    Three commands are prepared: ``PlotUpperLimitsAtPoint``, ``PlotUpperLimits`` and
+    ``FitDiagnostics``. Note that the ``PlotUpperLimits`` command is stored under the output
+    key "PlotUpperLimitsPoint".
+    """
+
+    # upstream requirements
+    reqs = Requirements(
+        ModifyDatacardsFlatRebin=ModifyDatacardsFlatRebin,
+    )
+
+    def workflow_requires(self):
+        """ Adds the rebinned datacards to the requirements collected by the parent classes. """
+        reqs = super().workflow_requires()
+        reqs["rebinned_datacards"] = self.reqs.ModifyDatacardsFlatRebin.req(self)
+        return reqs
+
+    def requires(self):
+        """ Requires the rebinned datacards. """
+        return {
+            "rebinned_datacards": self.reqs.ModifyDatacardsFlatRebin.req(self),
+        }
+
+    def output(self):
+        """ One text file per prepared command. """
+        fields = ("PlotUpperLimitsAtPoint", "PlotUpperLimitsPoint", "FitDiagnostics")
+        return {field: self.target(f"{field}.txt") for field in fields}
+
+    def run(self):
+        """ Builds the command strings, prints them and writes them to the output files. """
+        task_inputs = self.input()
+        targets = self.output()
+
+        # string that represents the version of datacards
+        identifier = "__".join((self.config, self.selector, self.inference_model, self.version))
+
+        # file names of the rebinned datacards
+        rebinned = task_inputs["rebinned_datacards"]["collection"]
+        card_fns = [rebinned[key]["card"].fn for key in rebinned.keys()]
+
+        # pair each inference category name with a datacard, in the order both are provided
+        cat_names = [cat.name for cat in self.inference_model_inst.categories]
+        datacards = ",".join(f"{name}={fn}" for name, fn in zip(cat_names, card_fns))
+
+        # command strings, keyed by the name of the output they are written to
+        commands = {
+            # upper limits for kl=1
+            "PlotUpperLimitsAtPoint": (
+                f"law run PlotUpperLimitsAtPoint --version {identifier} --multi-datacards {datacards} "
+                f"--datacard-names {identifier}"
+            ),
+            # kl scan
+            "PlotUpperLimitsPoint": (
+                f"law run PlotUpperLimits --version {identifier} --datacards {datacards} "
+                f"--xsec fb --y-log"
+            ),
+            # FitDiagnostics for Pre+Postfit plots
+            "FitDiagnostics": (
+                f"law run FitDiagnostics --version {identifier} --datacards {datacards} "
+                f"--skip-b-only"
+            ),
+        }
+
+        # print a separator first, then show and store the commands one by one
+        print("\n\n")
+        for field, cmd in commands.items():
+            print(cmd, "\n\n")
+            targets[field].dump(cmd, formatter="text")
diff --git a/hbw/tasks/plotting.py b/hbw/tasks/plotting.py
@@ -5,22 +5,32 @@
 e.g. default sets of plots or datacards
 """
 
+from __future__ import annotations
+
+from collections import OrderedDict
+
 import law
 import luigi
+import order as od
 
 from columnflow.tasks.framework.base import Requirements
 from columnflow.tasks.framework.mixins import (
     InferenceModelMixin, MLModelsMixin, ProducersMixin, SelectorStepsMixin,
     CalibratorsMixin,
 )
 from columnflow.tasks.framework.plotting import (
-    PlotBase1D, VariablePlotSettingMixin, ProcessPlotSettingMixin,
+    PlotBase, PlotBase1D, VariablePlotSettingMixin, ProcessPlotSettingMixin,
 )
 from columnflow.tasks.plotting import PlotVariables1D
 # from columnflow.tasks.framework.remote import RemoteWorkflow
 from hbw.tasks.base import HBWTask
 
-from columnflow.util import dev_sandbox
+from columnflow.util import dev_sandbox, DotDict, maybe_import
+
+uproot = maybe_import("uproot")
+
+
+logger = law.logger.get_logger(__name__)
 
 
 class InferencePlots(
@@ -106,3 +116,185 @@ def output(self):
 
     def run(self):
         pass
+
+
+def load_hists_uproot(fit_diagnostics_path):
+    """
+    Collect all objects whose key in the file at *fit_diagnostics_path* has exactly three
+    "/"-separated parts and return them as a nested DotDict ``hists[fit][channel][process]``.
+
+    Everything after the first ";" of the last key part is dropped to obtain the process name.
+    Objects whose process name does not contain "data" are converted with ``to_hist()``.
+    """
+    loaded = DotDict()
+    with uproot.open(fit_diagnostics_path) as root_file:
+        for path in root_file.keys():
+            parts = path.split("/")
+            # only consider objects that are nested exactly as fit/channel/process
+            if len(parts) != 3:
+                continue
+
+            fit, channel, raw_process = parts
+            process = raw_process.partition(";")[0]
+
+            # entries with "data" in their name are kept as read, all others are converted
+            obj = root_file[path]
+            entry = obj if "data" in process else obj.to_hist()
+
+            loaded = law.util.merge_dicts(loaded, DotDict.wrap({fit: {channel: {process: entry}}}), deep=True)
+    return loaded
+
+
+# imports regarding plot function
+mpl = maybe_import("matplotlib")
+plt = maybe_import("matplotlib.pyplot")
+mplhep = maybe_import("mplhep")
+
+from columnflow.plotting.plot_all import plot_all
+from columnflow.plotting.plot_util import (
+    prepare_plot_config,
+    prepare_style_config,
+)
+
+
+def plot_postfit_shapes(
+    hists: OrderedDict,
+    config_inst: od.Config,
+    category_inst: od.Category,
+    variable_insts: list[od.Variable],
+    style_config: dict | None = None,
+    density: bool | None = False,
+    shape_norm: bool | None = False,
+    yscale: str | None = "",
+    hide_errors: bool | None = None,
+    process_settings: dict | None = None,
+    variable_settings: dict | None = None,
+    **kwargs,
+) -> tuple[plt.Figure, tuple[plt.Axes]]:
+    """
+    Plot function for the shapes of one fit_diagnostics channel: builds the plot and style
+    configs from *hists* and forwards them, together with *kwargs*, to ``plot_all``.
+
+    Only the first entry of *variable_insts* is used. *process_settings* and *variable_settings*
+    are accepted but not used here.
+    """
+    variable_inst = law.util.make_tuple(variable_insts)[0]
+    plot_config = prepare_plot_config(hists, shape_norm=shape_norm, hide_errors=hide_errors)
+
+    style_defaults = prepare_style_config(config_inst, category_inst, variable_inst, density, shape_norm, yscale)
+    # drop the default x-axis limits; tolerate a style config that does not define them
+    style_defaults["ax_cfg"].pop("xlim", None)
+
+    style_config = law.util.merge_dicts(style_defaults, style_config, deep=True)
+    if shape_norm:
+        style_config["ax_cfg"]["ylabel"] = r"$\Delta N/N$"
+    return plot_all(plot_config, style_config, **kwargs)
+
+
+class PlotPostfitShapes(
+    HBWTask,
+    PlotBase1D,
+    # to correctly setup our InferenceModel, we need all these mixins, but hopefully, all these
+    # parameters are automatically resolved correctly
+    InferenceModelMixin,
+    MLModelsMixin,
+    ProducersMixin,
+    SelectorStepsMixin,
+    CalibratorsMixin,
+):
+    """
+    Task that creates prefit or postfit shape plots per channel based on a fit_diagnostics file.
+
+    Work in Progress!
+    TODO:
+    - include data
+    - include correct uncertainty bands
+    - pass correct binning information
+    """
+
+    sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox"))
+
+    plot_function = PlotBase.plot_function.copy(
+        default="hbw.tasks.plotting.plot_postfit_shapes",
+        add_default_to_description=True,
+    )
+
+    fit_diagnostics_file = luigi.Parameter(
+        default=law.NO_STR,
+        description="fit_diagnostics file that is used to load histograms",
+    )
+
+    prefit = luigi.BoolParameter(
+        default=False,
+        description="Whether to do prefit or postfit plots; defaults to False",
+    )
+
+    def requires(self):
+        return {}
+
+    def output(self):
+        return {"plots": self.target("plots", dir=True)}
+
+    def run(self):
+        """
+        Loads the shapes of the requested fit from the fit_diagnostics file and creates one plot
+        per channel, stored as "<channel>_<fit_type>.pdf" in the "plots" output directory.
+        """
+        # fail early with a clear message when no input file was given
+        if self.fit_diagnostics_file in (None, "", law.NO_STR):
+            raise ValueError(f"{self.__class__.__name__} requires the --fit-diagnostics-file parameter")
+
+        logger.warning(
+            f"Note! It is important that the requested inference_model {self.inference_model} "
+            "is identical to the one that has been used to create the datacards",
+        )
+        fit_type = "prefit" if self.prefit else "fit_s"
+        all_hists = load_hists_uproot(self.fit_diagnostics_file)[f"shapes_{fit_type}"]
+        outp = self.output()
+
+        for channel, hists in all_hists.items():
+            has_category = self.inference_model_inst.has_category(channel)
+            if not has_category:
+                logger.warning(f"Category {channel} is not part of the inference model {self.inference_model}")
+
+            for proc_key in list(hists.keys()):
+                # remove unnecessary histograms
+                if "data" in proc_key or "total" in proc_key:
+                    hists.pop(proc_key)
+                    continue
+
+                if has_category:
+                    # get the config process via the InferenceModel
+                    # TODO: process customization based on inference process? e.g. scale
+                    inference_process = self.inference_model_inst.get_process(proc_key, channel)
+                    proc_inst = self.config_inst.get_process(inference_process.config_process)
+                else:
+                    # try getting proc inst directly via config
+                    proc_inst = self.config_inst.get_process(proc_key, default=None)
+
+                # replace string keys with process instances
+                if proc_inst:
+                    hists[proc_inst] = hists.pop(proc_key)
+
+            if has_category:
+                # get the config category and variable via the InferenceModel
+                # TODO: category/variable customization based on inference model?
+                inference_category = self.inference_model_inst.get_category(channel)
+                config_category = self.config_inst.get_category(inference_category.config_category)
+                variable_inst = self.config_inst.get_variable(inference_category.config_variable)
+            else:
+                # default to dummy Category and Variable
+                config_category = od.Category(channel, id=1)
+                variable_inst = od.Variable("dummy")
+
+            # call the plot function
+            fig, _ = self.call_plot_func(
+                self.plot_function,
+                hists=hists,
+                config_inst=self.config_inst,
+                category_inst=config_category,
+                variable_insts=variable_inst,
+                **self.get_plot_parameters(),
+            )
+
+            outp["plots"].child(f"{channel}_{fit_type}.pdf", type="f").dump(fig, formatter="mpl")