From 6b63284745dbea6fc8eb299345f863899168604a Mon Sep 17 00:00:00 2001 From: Joosep Pata Date: Thu, 28 Apr 2022 12:19:04 +0300 Subject: [PATCH] gen/sim 12_3_0_pre6 generation (#1) * 1.2 format, ztt dataset * version 1.3.0 with new gensim truth * new dataset * add qcd * add some asserts * add new features * keep PS * add tau as pf target --- heptfds/__init__.py | 11 +++++- heptfds/cms_pf/__init__.py | 1 + heptfds/cms_pf/qcd.py | 61 ++++++++++++++++++++++++++++++++ heptfds/cms_pf/singleele.py | 3 +- heptfds/cms_pf/singlegamma.py | 3 +- heptfds/cms_pf/singlemu.py | 3 +- heptfds/cms_pf/singlepi.py | 3 +- heptfds/cms_pf/singlepi0.py | 5 +-- heptfds/cms_pf/singletau.py | 3 +- heptfds/cms_pf/ttbar.py | 6 ++-- heptfds/cms_pf/ztt.py | 65 +++++++++++++++++++++++++++++++++++ heptfds/cms_utils.py | 56 +++++++++++++++--------------- 12 files changed, 181 insertions(+), 39 deletions(-) create mode 100644 heptfds/cms_pf/qcd.py create mode 100644 heptfds/cms_pf/ztt.py diff --git a/heptfds/__init__.py b/heptfds/__init__.py index 4e71e0c..f9f6163 100644 --- a/heptfds/__init__.py +++ b/heptfds/__init__.py @@ -1,4 +1,13 @@ """HEP datasets.""" from .delphes_pf import DelphesPf -from .cms_pf import CmsPfTtbar, CmsPfSinglePi, CmsPfSingleElectron, CmsPfSingleMu, CmsPfSingleTau, CmsPfSinglePi0, CmsPfSingleGamma +from .cms_pf import ( + CmsPfTtbar, + CmsPfSinglePi, + CmsPfSingleElectron, + CmsPfSingleMu, + CmsPfSingleTau, + CmsPfSinglePi0, + CmsPfSingleGamma, + CmsPfZtt +) diff --git a/heptfds/cms_pf/__init__.py b/heptfds/cms_pf/__init__.py index c395658..e038766 100644 --- a/heptfds/cms_pf/__init__.py +++ b/heptfds/cms_pf/__init__.py @@ -7,3 +7,4 @@ from .singleele import CmsPfSingleElectron from .singletau import CmsPfSingleTau from .singlegamma import CmsPfSingleGamma +from .ztt import CmsPfZtt diff --git a/heptfds/cms_pf/qcd.py b/heptfds/cms_pf/qcd.py new file mode 100644 index 0000000..7371f59 --- /dev/null +++ b/heptfds/cms_pf/qcd.py @@ -0,0 +1,61 @@ +"""CMS PF QCD dataset.""" + 
+from pathlib import Path +import tensorflow as tf +import tensorflow_datasets as tfds + +from heptfds import cms_utils + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +QCD events with PU~55 in a Run3 setup. +""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 12000 + +class CmsPfQcd(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.3.0") + RELEASE_NOTES = { + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + FIXME + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "QCDForPF_13TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path/sample_dir/"raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/heptfds/cms_pf/singleele.py b/heptfds/cms_pf/singleele.py index d29462d..9dcf620 100644 --- a/heptfds/cms_pf/singleele.py +++ b/heptfds/cms_pf/singleele.py @@ -24,10 +24,11 @@ class 
CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Initial release.", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleElectronFlatPt1To100_pythia8_cfi data/ diff --git a/heptfds/cms_pf/singlegamma.py b/heptfds/cms_pf/singlegamma.py index ac58743..5f68d3c 100644 --- a/heptfds/cms_pf/singlegamma.py +++ b/heptfds/cms_pf/singlegamma.py @@ -24,9 +24,10 @@ class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlegamma dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.1.0": "Initial release", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleGammaFlatPt10To100_pythia8_cfi data/ diff --git a/heptfds/cms_pf/singlemu.py b/heptfds/cms_pf/singlemu.py index 02c89b2..687c9cf 100644 --- a/heptfds/cms_pf/singlemu.py +++ b/heptfds/cms_pf/singlemu.py @@ -24,10 +24,11 @@ class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleMuFlatPt0p7To10_cfi data/ diff --git a/heptfds/cms_pf/singlepi.py b/heptfds/cms_pf/singlepi.py index 422679c..9711af0 100644 --- a/heptfds/cms_pf/singlepi.py +++ b/heptfds/cms_pf/singlepi.py 
@@ -24,10 +24,11 @@ class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePiFlatPt0p7To10_cfi data/ diff --git a/heptfds/cms_pf/singlepi0.py b/heptfds/cms_pf/singlepi0.py index ace712f..6d3d10a 100644 --- a/heptfds/cms_pf/singlepi0.py +++ b/heptfds/cms_pf/singlepi0.py @@ -24,9 +24,10 @@ class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singlepi0 dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { - "1.1.0": "Initial release" + "1.1.0": "Initial release", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePi0E10_pythia8_cfi data/ diff --git a/heptfds/cms_pf/singletau.py b/heptfds/cms_pf/singletau.py index d9cd543..d3dee36 100644 --- a/heptfds/cms_pf/singletau.py +++ b/heptfds/cms_pf/singletau.py @@ -24,9 +24,10 @@ class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf_singletau dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.2.0") RELEASE_NOTES = { "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleTauFlatPt2To150_cfi data/ diff --git a/heptfds/cms_pf/ttbar.py b/heptfds/cms_pf/ttbar.py index cf2e833..7225370 100644 --- a/heptfds/cms_pf/ttbar.py +++ 
b/heptfds/cms_pf/ttbar.py @@ -19,15 +19,17 @@ _CITATION = """ """ -PADDED_NUM_ELEM_SIZE = 6400 +PADDED_NUM_ELEM_SIZE = 12000 class CmsPfTtbar(tfds.core.GeneratorBasedBuilder): """DatasetBuilder for cms_pf dataset.""" - VERSION = tfds.core.Version("1.1.0") + VERSION = tfds.core.Version("1.3.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ mkdir -p data diff --git a/heptfds/cms_pf/ztt.py b/heptfds/cms_pf/ztt.py new file mode 100644 index 0000000..9605257 --- /dev/null +++ b/heptfds/cms_pf/ztt.py @@ -0,0 +1,65 @@ +"""CMS PF ZTT dataset.""" + +from pathlib import Path +import tensorflow as tf +import tensorflow_datasets as tfds + +from heptfds import cms_utils + +X_FEATURES = cms_utils.X_FEATURES +Y_FEATURES = cms_utils.Y_FEATURES + +_DESCRIPTION = """ +Dataset generated with CMSSW and full detector sim. + +ZTT events with PU~55 in a Run3 setup. 
+""" + +# TODO(cms_pf): BibTeX citation +_CITATION = """ +""" + +PADDED_NUM_ELEM_SIZE = 12000 + +class CmsPfZtt(tfds.core.GeneratorBasedBuilder): + """DatasetBuilder for cms_pf dataset.""" + + VERSION = tfds.core.Version("1.3.0") + RELEASE_NOTES = { + "1.0.0": "Initial release.", + "1.1.0": "Add muon type, fix electron GSF association", + "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events", + "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle", + } + MANUAL_DOWNLOAD_INSTRUCTIONS = """ + mkdir -p data + rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi data/ + """ + + def _info(self) -> tfds.core.DatasetInfo: + """Returns the dataset metadata.""" + # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict( + { + "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32), + "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32), + } + ), + supervised_keys=("X", "ycand"), + homepage="", + citation=_CITATION, + metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES), + ) + + def _split_generators(self, dl_manager: tfds.download.DownloadManager): + """Returns SplitGenerators.""" + path = dl_manager.manual_dir + sample_dir = "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" + return cms_utils.split_sample(path/sample_dir/"raw", PADDED_NUM_ELEM_SIZE) + + def _generate_examples(self, files): + return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE) diff --git a/heptfds/cms_utils.py b/heptfds/cms_utils.py index ae64b3e..35990da 100644 --- a/heptfds/cms_utils.py +++ b/heptfds/cms_utils.py @@ -9,30 +9,24 @@ ELEM_LABELS_CMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] 
ELEM_NAMES_CMS = ["NONE", "TRACK", "PS1", "PS2", "ECAL", "HCAL", "GSF", "BREM", "HFEM", "HFHAD", "SC", "HO"] -CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13] +CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13, 15] CLASS_NAMES_CMS = ["none", "ch.had", "n.had", "HFEM", "HFHAD", "gamma", "ele", "mu"] X_FEATURES = [ - "typ_idx", - "pt", - "eta", - "phi", - "e", - "layer", - "depth", - "charge", - "trajpoint", - "eta_ecal", - "phi_ecal", - "eta_hcal", - "phi_hcal", - "muon_dt_hits", - "muon_csc_hits", - "muon_type", - "gsf_brem_sc_energy", - "num_hits" + "typ_idx", "pt", "eta", "phi", "e", + "layer", "depth", "charge", "trajpoint", + "eta_ecal", "phi_ecal", "eta_hcal", "phi_hcal", "muon_dt_hits", "muon_csc_hits", "muon_type", + "px", "py", "pz", "deltap", "sigmadeltap", + "gsf_electronseed_trkorecal", + "gsf_electronseed_dnn1", + "gsf_electronseed_dnn2", + "gsf_electronseed_dnn3", + "gsf_electronseed_dnn4", + "gsf_electronseed_dnn5", + "num_hits", "cluster_flags", "corr_energy", + "corr_energy_err", "vx", "vy", "vz", "pterror", "etaerror", "phierror", "lambd", "lambdaerror", "theta", "thetaerror" ] - + Y_FEATURES = [ "typ_idx", "charge", @@ -59,11 +53,11 @@ def prepare_data_cms(fn, padded_num_elem_size): ycand = event["ycand"] # remove PS from inputs, they don't seem to be very useful - msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3) + #msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3) - Xelem = Xelem[~msk_ps] - ygen = ygen[~msk_ps] - ycand = ycand[~msk_ps] + Xelem = Xelem + ygen = ygen + ycand = ycand Xelem = append_fields( Xelem, "typ_idx", np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32) @@ -131,11 +125,15 @@ def prepare_data_cms(fn, padded_num_elem_size): return X, ygen, ycand def split_sample(path, pad_size, test_frac=0.8): - files = sorted(list(path.glob("*.pkl*"))) - idx_split = int(test_frac*len(files)) - files_train = files[:idx_split] - files_test = files[idx_split:] - return {"train": generate_examples(files_train, 
pad_size), "test": generate_examples(files_test, pad_size)} + files = sorted(list(path.glob("*.pkl*"))) + print("Found {} files in {}".format(files, path)) + assert(len(files)>0) + idx_split = int(test_frac*len(files)) + files_train = files[:idx_split] + files_test = files[idx_split:] + assert(len(files_train)>0) + assert(len(files_test)>0) + return {"train": generate_examples(files_train, pad_size), "test": generate_examples(files_test, pad_size)} def generate_examples(files, pad_size): """Yields examples."""