CMS TAUPOG skimmed datasets from the 2022 data-taking campaign
"""
44import cmsdb .processes as procs
5- from cmsdb .campaigns .run3_2022_preEE_nano_tau_skim_2025_v1 import campaign_run3_2022_preEE_nano_tau_skim_2025_v1 as cpn # TODO: adjust if needed
6-
# NOTE(review): in the captured text every line of this section carries a "-"
# diff marker while the table-driven registration further down carries "+", so
# this looks like the removed side of a change. Both sections register the same
# dataset names - confirm whether this section should still exist.

# One entry per skimmed sample:
#   (name template, dataset key, n_files, n_events, ((htt tag, id), ...))
# "{}" in the name template is replaced by the htt tag. All datasets of one
# entry share the key and the file/event counts. Ids are spelled out rather
# than computed because the tag order is not the same for every sample.
# Event counts are plain ints throughout (some were written as floats before).
_FILTERED_SIGNAL_SAMPLES = (
    # prod CP-even datasets
    (
        "h_ggf_htt_{}_prod_sm_filtered",
        "/GluGluHTo2Tau_UncorrelatedDecay_SM_Filtered_ProdAndDecay",
        18,
        6703604,
        (("sm", 22000000), ("mm", 22000001), ("cpo", 22000002), ("flat", 22000003)),
    ),
    # prod CP-odd datasets
    (
        "h_ggf_htt_{}_prod_cpo_filtered",
        "/GluGluHTo2Tau_UncorrelatedDecay_CPodd_Filtered_ProdAndDecay",
        19,
        7185840,
        (("sm", 22000010), ("mm", 22000011), ("cpo", 22000012), ("flat", 22000013)),
    ),
    # prod max. mixing datasets
    (
        "h_ggf_htt_{}_prod_mm_filtered",
        "/GluGluHTo2Tau_UncorrelatedDecay_MM_Filtered_ProdAndDecay",
        17,
        6424278,
        (("sm", 22000020), ("mm", 22000021), ("cpo", 22000022), ("flat", 22000023)),
    ),
    # VBF signal samples (note: cpo comes before mm here)
    (
        "h_vbf_htt_{}_filtered",
        "/VBFHToTauTau_UncorrelatedDecay_Filtered",
        13,
        5082505,
        (("sm", 22000030), ("cpo", 22000031), ("mm", 22000032), ("flat", 22000033)),
    ),
    # VH signal samples: ZH
    (
        "zh_htt_{}_filtered",
        "/ZHToTauTau_UncorrelatedDecay_Filtered",
        2,
        613598,
        (("sm", 22000040), ("mm", 22000041), ("cpo", 22000042), ("flat", 22000043)),
    ),
    # VH signal samples: W^+H
    (
        "wph_htt_{}_filtered",
        "/WplusHToTauTau_UncorrelatedDecay_Filtered",
        2,
        716466,
        (("sm", 22000050), ("mm", 22000051), ("cpo", 22000052), ("flat", 22000053)),
    ),
    # VH signal samples: W^-H
    (
        "wmh_htt_{}_filtered",
        "/WminusHToTauTau_UncorrelatedDecay_Filtered",
        1,
        431839,
        (("sm", 22000054), ("mm", 22000055), ("cpo", 22000056), ("flat", 22000057)),
    ),
)

for _template, _key, _n_files, _n_events, _variants in _FILTERED_SIGNAL_SAMPLES:
    for _tag, _dataset_id in _variants:
        _name = _template.format(_tag)
        cpn.add_dataset(
            name=_name,
            id=_dataset_id,
            # the process attribute is the dataset name without "_filtered"
            processes=[getattr(procs, _name[: -len("_filtered")])],
            keys=[_key],
            n_files=_n_files,
            n_events=_n_events,
        )
5+ from cmsdb .campaigns .run3_2022_preEE_nano_tau_skim_2025_v1 import campaign_run3_2022_preEE_nano_tau_skim_2025_v1 as cpn
6+
7+
8+
9+ import re
10+ from collections import OrderedDict
11+
12+ def _base_name (name : str ) -> str :
13+ m = re .match (r'^(.*)_ext\d+$' , name )
14+ return m .group (1 ) if m else name
15+
16+ def _ext_number (s : str ) -> int :
17+ m = re .search (r'_ext(\d+)$' , s )
18+ return int (m .group (1 )) if m else 0
19+
def _key_sort_key(key: str):
    """Return a sort key that orders dataset keys by their ``_ext`` number.

    Keys without an ``_ext<digits>`` suffix (or with number 0) get ``(0, 0)``
    and therefore sort first; a key ending in ``_ext<n>`` gets ``(1, n)``.
    """
    n = _ext_number(key)
    # base (no ext) first, then _ext1, _ext2, ...
    return (0, 0) if n == 0 else (1, n)
24+
def add_merged_datasets(dataset_rows, cpn, procs):
    """
    Register one dataset per base sample on *cpn*, merging ``*_extX`` rows into it.

    dataset_rows: iterable of (name, key_or_keys, n_evt, n_files, pid, proc), where
        key_or_keys is one key string or a list/tuple of key strings and proc is
        the name of an attribute of *procs*.
    cpn: object providing ``add_dataset(...)``; called once per base sample.
    procs: object whose attributes are looked up by the ``proc`` names.

    Rows are grouped by ``_base_name(name)``. Within a group the keys are collected
    without duplicates, n_evt and n_files are summed (after ``int()`` conversion),
    and the id of the row without an ``_extX`` suffix is used; when a group only
    has ``_extX`` rows, the first id seen is used.

    Raises:
        ValueError: rows of one group carry different ``proc`` values.
        TypeError: a key is not a string.
        AttributeError: *procs* has no attribute named like a row's ``proc``.
    """
    groups = {}  # base name -> accumulator, in first-seen order
    for name, key, n_evt, n_files, pid, proc in dataset_rows:
        base = _base_name(name)
        g = groups.get(base)
        if g is None:
            g = groups[base] = {
                "name": base,
                "proc": proc,
                "id": None,  # prefer non-ext id; fallback to first seen
                "keys": OrderedDict(),  # insertion-ordered set of keys
                "n_events": 0,
                "n_files": 0,
            }
        elif g["proc"] != proc:
            raise ValueError(f"Process mismatch for {base}: {g['proc']} vs {proc}")

        # base == name means the row has no _extX suffix: its id always wins.
        # An _extX row only fills the id while none has been seen yet.
        if base == name or g["id"] is None:
            g["id"] = pid

        # accept a single key string or a list/tuple of key strings
        keys_in = key if isinstance(key, (list, tuple)) else [key]
        for k in keys_in:
            if not isinstance(k, str):
                raise TypeError(f"key must be a string, got {type(k).__name__}: {k}")
            g["keys"][k] = True

        g["n_events"] += int(n_evt)
        g["n_files"] += int(n_files)

    # Look up every process before the first add_dataset call, so that an unknown
    # process name cannot leave the campaign partially filled.
    resolved = [(g, getattr(procs, g["proc"])) for g in groups.values()]

    # emit one add per base sample
    for g, process in resolved:
        cpn.add_dataset(
            name=g["name"],
            id=g["id"],
            is_data=False,
            processes=[process],
            # base key first, then _ext1, _ext2, ...; ties keep insertion order
            keys=sorted(g["keys"], key=_key_sort_key),
            n_files=g["n_files"],
            n_events=g["n_events"],
        )
76+
# ---- datasets: (name, key, n_evt, n_files, pid, proc) ----
# Only the "htt_sm" rows are spelled out here; the rows for the other htt tags
# are derived from them below.
# NOTE(review): compared with the "-" side of this change, the keys are spelled
# "...Hto2Tau..." instead of "...HTo2Tau..." / "...HToTauTau...", and the
# WminusH sample moved from ids 22000054-57 with n_files = 1 to ids
# 22000060-63 with n_files = 2 - confirm these are intended.
dataset_rows = [
    ("h_ggf_htt_sm_prod_sm_filtered", ["/GluGluHto2Tau_UncorrelatedDecay_SM_Filtered_ProdAndDecay"], 6703604, 18, 22000000, "h_ggf_htt_sm_prod_sm"),
    ("h_ggf_htt_sm_prod_cpo_filtered", ["/GluGluHto2Tau_UncorrelatedDecay_CPodd_Filtered_ProdAndDecay"], 7185840, 19, 22000010, "h_ggf_htt_sm_prod_cpo"),
    ("h_ggf_htt_sm_prod_mm_filtered", ["/GluGluHto2Tau_UncorrelatedDecay_MM_Filtered_ProdAndDecay"], 6424278, 17, 22000020, "h_ggf_htt_sm_prod_mm"),

    ("h_vbf_htt_sm_filtered", ["/VBFHto2Tau_UncorrelatedDecay_Filtered"], 5082505, 13, 22000030, "h_vbf_htt_sm"),

    ("zh_htt_sm_filtered", ["/ZHto2Tau_UncorrelatedDecay_Filtered"], 613598, 2, 22000040, "zh_htt_sm"),
    ("wph_htt_sm_filtered", ["/WplusHto2Tau_UncorrelatedDecay_Filtered"], 716466, 2, 22000050, "wph_htt_sm"),
    ("wmh_htt_sm_filtered", ["/WminusHto2Tau_UncorrelatedDecay_Filtered"], 431839, 2, 22000060, "wmh_htt_sm"),
]

# Tags substituted for "htt_sm"; the n-th tag (counting from 1) gets id pid + n.
# NOTE(review): for the VBF sample this yields mm = 22000031 and cpo = 22000032,
# whereas the "-" side of this change had cpo = 22000031 and mm = 22000032 -
# confirm that swapping these two ids is intended.
_CP_VARIANTS = ("htt_mm", "htt_cpo", "htt_flat")

# every "htt_sm" row is followed by its variants, which reuse the row's key and
# file/event counts
dataset_rows_cp = []
for name, key, n_evt, n_files, pid, proc in dataset_rows:
    dataset_rows_cp.append((name, key, n_evt, n_files, pid, proc))
    for offset, the_cp_var in enumerate(_CP_VARIANTS, start=1):
        dataset_rows_cp.append((
            name.replace("htt_sm", the_cp_var),
            key,
            n_evt,
            n_files,
            pid + offset,
            proc.replace("htt_sm", the_cp_var),
        ))

add_merged_datasets(dataset_rows_cp, cpn, procs)
0 commit comments