Developed a task to calculate fake factors for WJ and QCD

hephysicist · hephysicist · commit d7ff94466ec7 · 2025-01-28T10:45:28.000+01:00
diff --git a/columnflow/tasks/data_driven_methods.py b/columnflow/tasks/data_driven_methods.py
@@ -23,7 +23,7 @@
 from columnflow.util import dev_sandbox, DotDict
 
 
-class CreateFakeFactorHistograms(
+class PrepareFakeFactorHistograms(
     VariablesMixin,
     WeightProducerMixin,
     ProducersMixin,
@@ -177,8 +177,7 @@ def run(self):
                 
                 h = (hist.Hist.new
                     .IntCat([], name="category", growth=True)
-                    .IntCat([], name="process", growth=True)
-                    .IntCat([], name="shift", growth=True))
+                    .IntCat([], name="process", growth=True))
                 for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): 
                     h = eval(f'h.{var_axis.ax_str}') 
                 
@@ -189,11 +188,11 @@ def run(self):
                         axis=-1,
                     )
                 # broadcast arrays so that each event can be filled for all its categories
+                
                 fill_data = {
                     "category"          : category_ids,
                     "process"           : events.process_id,
-                    "shift"             : np.ones(len(events), dtype=np.int32) * self.global_shift_inst.id,
-                    "weight": weight,
+                    "weight"            : weight,
                 }
                 for (var_name, var_axis) in self.config_inst.x.fake_factor_method.axes.items(): 
                     route = Route(var_axis.var_route)
@@ -214,19 +213,19 @@ def run(self):
 
 # overwrite class defaults
 check_overlap_tasks = law.config.get_expanded("analysis", "check_overlapping_inputs", [], split_csv=True)
-CreateFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy(
-    default=CreateFakeFactorHistograms.task_family in check_overlap_tasks,
+PrepareFakeFactorHistograms.check_overlapping_inputs = ChunkedIOMixin.check_overlapping_inputs.copy(
+    default=PrepareFakeFactorHistograms.task_family in check_overlap_tasks,
     add_default_to_description=True,
 )
 
 
-CreateFakeFactorHistogramsWrapper = wrapper_factory(
+PrepareFakeFactorHistogramsWrapper = wrapper_factory(
     base_cls=AnalysisTask,
-    require_cls=CreateFakeFactorHistograms,
+    require_cls=PrepareFakeFactorHistograms,
     enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"],
 )
 
-class MergeFakeFactors(
+class ComputeFakeFactors(
     VariablesMixin,
     DatasetsProcessesMixin,
     CategoriesMixin,
@@ -253,12 +252,12 @@ class MergeFakeFactors(
     # upstream requirements
     reqs = Requirements(
         RemoteWorkflow.reqs,
-        CreateFakeFactorHistograms=CreateFakeFactorHistograms,
+        PrepareFakeFactorHistograms=PrepareFakeFactorHistograms,
     )
     
     def store_parts(self):
         parts = super().store_parts()
-        parts.insert_before("version", "datasets" )#, f"datasets_{self.datasets_repr}")
+        parts.insert_before("version", "datasets", f"datasets_{self.datasets_repr}")
         return parts
     
     @classmethod
@@ -291,7 +290,7 @@ def workflow_requires(self):
         if not self.pilot:
             variables = self._get_variables()
             if variables:
-                reqs["ff_method"] = self.reqs.CreateFakeFactorHistograms.req_different_branching(
+                reqs["ff_method"] = self.reqs.PrepareFakeFactorHistograms.req_different_branching(
                     self,
                     branch=-1,
                     variables=tuple(variables),
@@ -301,74 +300,141 @@ def workflow_requires(self):
 
     def requires(self):
         return {
-            d: self.reqs.CreateFakeFactorHistograms.req(
+            d: self.reqs.PrepareFakeFactorHistograms.req(
                 self,
                 dataset=d,
                 branch=-1,
             )
             for d in self.datasets
         }
     def output(self):
-        return {"hists": self.target(f"fake_factors.pickle")}
+        return {"ff_json": {ff_type: self.target(f"fake_factors_{ff_type}.json")for ff_type in ['qcd','wj']},
+                "plots": {syst: self.target(f"fake_factor_syst_{syst}.png") for syst in ['nominal', 'up', 'down']},}
 
     @law.decorator.log
     def run(self):
         import hist
         import numpy as np
         import matplotlib.pyplot as plt
+        import correctionlib.convert as cl_convert 
         # preare inputs and outputs
         inputs = self.input()
         outputs = self.output()
         merged_per_dataset = {}
         projected_hists = []
+        hists_by_dataset = []
         for (dataset_name, dataset) in inputs.items():
             files = dataset['collection']
             # load input histograms per dataset
-            hists = [
+            hists_per_ds = [
                 inp['hists'].load(formatter="pickle")['fake_factors']
                 for inp in self.iter_progress(files.targets.values(), len(files), reach=(0, 50))
             ]
             self.publish_message(f"merging Fake factor histograms for {dataset_name}")
-            the_hist = sum(hists[1:], hists[0].copy())
-            merged_per_dataset[dataset_name] = the_hist
-            #Get axes names excluding 'process'. This is needed to merge hists for different processes
-            ax_names = [ax_name for ax_name in the_hist.axes.name if ax_name != 'process']
-            #Remove 'process' axis by projecting hist on the remaining axes
-            projected_hists.append(the_hist.project(*ax_names))
-        merged_hist = sum(projected_hists[1:], projected_hists[0].copy())
+            ds_single_hist = sum(hists_per_ds[1:], hists_per_ds[0].copy())
+            hists_by_dataset.append(ds_single_hist)
+        
+        hists_by_proc = {}
+        for proc_name in self.config_inst.processes.names():
+            proc = self.config_inst.processes.get(proc_name)
+            self.publish_message(f"merging Fake factor histograms for process: {proc.name}")
+            for the_hist in hists_by_dataset:
+                
+                if proc.id in the_hist.axes["process"]: 
+                    h = the_hist.copy()
+                    h = h[{"process": hist.loc(proc.id)}]
+                    # add the histogram
+                    if proc in hists_by_proc:
+                        hists_by_proc[proc] += h
+                    else:
+                        hists_by_proc[proc] = h
         
-        cat_SR = self.config_inst.get_category(self.branch_data.category)
-        cat_DR_den = self.config_inst.get_category(cat_SR.x.DR_den)
-        cat_DR_num = self.config_inst.get_category(cat_SR.x.DR_num)
+        mc_hists    = [h for p, h in hists_by_proc.items() if p.is_mc and not p.has_tag("signal")]
+        data_hists  = [h for p, h in hists_by_proc.items() if p.is_data]
         
-        def get_hist (h, category): 
-            return h[{"category": hist.loc(category.id)}]
+        mc_hists    = sum(mc_hists[1:], mc_hists[0].copy())
+        data_hists  = sum(data_hists[1:], data_hists[0].copy())
         
-        h_DR_num = get_hist(merged_hist,cat_DR_num).values()
-        h_DR_den = get_hist(merged_hist,cat_DR_den).values()
+        dr_names = ['dr_num_wj','dr_den_wj','dr_num_qcd','dr_den_qcd']
+        
+        def get_hist(h, category): 
+             return h[{"category": hist.loc(category.id)}]
         
-        ff_values = np.where((h_DR_num > 0) & (h_DR_den > 0),
-                             h_DR_num / np.maximum(h_DR_den, 1),
-                             0.0,
-        )
         
-        #For the control: make 2d hists and plot them:
-        hist2d = merged_hist.project('tau_pt','tau_dm_pnet')
-        ff_hist = hist.Hist(*hist2d.axes, data=ff_values[0])
-        fig, ax = plt.subplots(figsize=(12, 8))
-        ff_hist.plot2d(ax=ax)
-        plt.savefig('fake_factors.pdf')
-        from IPython import embed; embed()
-        #outputs["hists"][variable_name].dump(merged, formatter="pickle")F
-
-        # optionally remove inputs
-        if self.remove_previous:
-            inputs.remove()
-
-
-# MergeFakeFactorsWrapper = wrapper_factory(
-#     base_cls=AnalysisTask,
-#     require_cls=MergeFakeFactors,
-#     enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"],
-# )
+        # Create two dictionaries (one for data, one for MC) that map each determination region name to its histogram
+        data_h_cat ={}
+        mc_h_cat = {}
+        for dr_name in dr_names:
+            cat = self.config_inst.get_category(self.branch_data.category.replace('sr',dr_name))
+            data_h_cat[dr_name]  = get_hist(data_hists, cat)
+            mc_h_cat[dr_name]    = get_hist(mc_hists, cat)
+            
+        
+        def get_ff_corr(self, h_data, h_mc, num_cat, den_cat, name='ff_hist', label='ff_hist'):
+            """
+            Compute the per-bin fake factor (data - MC)[num_cat] / (data - MC)[den_cat].
+
+            :param h_data: mapping from region name to the data histogram of that region.
+            :param h_mc: mapping from region name to the MC histogram of that region.
+            :param num_cat: key of the numerator region in *h_data* and *h_mc*.
+            :param den_cat: key of the denominator region in *h_data* and *h_mc*.
+            :param name: name assigned to the resulting histogram.
+            :param label: label assigned to the resulting histogram.
+            :return: tuple ``(ff_corr, ff_hist)``: the correctionlib correction converted from
+                the histogram, and the histogram itself. The histogram has the axes configured
+                in ``config_inst.x.fake_factor_method.axes`` plus a trailing ``syst`` axis with
+                the entries ``nominal``, ``up`` and ``down``.
+            """
+            num = h_data[num_cat].values() - h_mc[num_cat].values()
+            den = h_data[den_cat].values() - h_mc[den_cat].values()
+            # the variance of a difference of independent yields is the sum of the variances
+            num_var = h_data[num_cat].variances() + h_mc[num_cat].variances()
+            den_var = h_data[den_cat].variances() + h_mc[den_cat].variances()
+
+            # the fake factor is only defined where both subtracted yields are positive
+            valid = (num > 0) & (den > 0)
+            # substitute 1 in invalid bins only: the divisions below never hit zero, while
+            # valid bins (also those with yields below 1) keep their actual values
+            safe_num = np.where(valid, num, 1.0)
+            safe_den = np.where(valid, den, 1.0)
+
+            # invalid bins fall back to a fake factor of 1 ...
+            ff_val = np.where(valid, num / safe_den, 1)
+            # ... and to a squared uncertainty of 0.5; valid bins use error propagation:
+            # var(ff) = ff^2 * (var(num) / num^2 + var(den) / den^2)
+            ff_err2 = np.where(valid,
+                               ff_val**2 * (num_var / safe_num**2 + den_var / safe_den**2),
+                               0.5 * np.ones_like(ff_val))
+            ff_err = np.sqrt(ff_err2)
+
+            # build the output histogram from the axis definitions stored in the config
+            # NOTE(review): eval() executes the configured axis strings, so this is only
+            # safe as long as the config is trusted analysis code
+            h = hist.Hist.new
+            for var_axis in self.config_inst.x.fake_factor_method.axes.values():
+                h = eval(f'h.{var_axis.ax_str}')
+            h = h.StrCategory(['nominal', 'up', 'down'], name='syst', label='Statistical uncertainty of the fake factor')
+            ff_hist = h.Weight()
+
+            # fill the three entries of the syst axis in the order nominal, up, down;
+            # the down variation is bounded from below by 0
+            ff_hist.view().value[..., 0] = ff_val
+            ff_hist.view().value[..., 1] = ff_val + ff_err
+            ff_hist.view().value[..., 2] = np.maximum(ff_val - ff_err, 0)
+            ff_hist.name = name
+            ff_hist.label = label
+
+            # convert to a correctionlib correction and set its flow behaviour to "clamp"
+            ff_corr = cl_convert.from_histogram(ff_hist)
+            ff_corr.data.flow = "clamp"
+            return ff_corr, ff_hist
+        
+        # settings of the two fake factor types, keyed like the 'ff_json' outputs
+        ff_specs = {
+            'wj': dict(num_cat='dr_num_wj', den_cat='dr_den_wj', name='ff_wjets', label='Fake factor W+jets'),
+            'qcd': dict(num_cat='dr_num_qcd', den_cat='dr_den_qcd', name='ff_qcd', label='Fake factor QCD'),
+        }
+
+        # compute the correction and the histogram of each fake factor type
+        ff_corrs = {}
+        ff_hists = {}
+        for ff_type, spec in ff_specs.items():
+            ff_corrs[ff_type], ff_hists[ff_type] = get_ff_corr(self, data_h_cat, mc_h_cat, **spec)
+
+        # the plot targets are keyed by systematic only, so both fake factor types are drawn
+        # side by side into one figure per systematic; dumping one figure per type would
+        # write to the same target twice and keep only the last one
+        for syst in ['nominal', 'up', 'down']:
+            fig, axes = plt.subplots(1, len(ff_hists), figsize=(12 * len(ff_hists), 8), squeeze=False)
+            for ax, (ff_type, the_hist) in zip(axes[0], ff_hists.items()):
+                the_hist[..., syst].plot2d(ax=ax)
+                ax.set_title(f"{ff_specs[ff_type]['label']} ({syst})")
+            outputs['plots'][syst].dump(fig, formatter="mpl")
+            # release the figure, matplotlib otherwise keeps it alive until the process ends
+            plt.close(fig)
+
+        # store the corrections
+        # NOTE(review): .json() returns a string that is passed to the json formatter as is;
+        # confirm that consumers of these files expect this encoding
+        for ff_type, ff_corr in ff_corrs.items():
+            outputs['ff_json'][ff_type].dump(ff_corr.json(exclude_unset=True), formatter="json")
+            
+            
+            
+            
+            
+            
+            
+            
+            
+