Skip to content

Commit

Permalink
gen/sim 12_3_0_pre6 generation (#1)
Browse files Browse the repository at this point in the history
* 1.2 format, ztt dataset

* version 1.3.0 with new gensim truth

* new dataset

* add qcd

* add some asserts

* add new features

* keep PS

* add tau as pf target
  • Loading branch information
jpata authored Apr 28, 2022
1 parent 88738ac commit 6b63284
Show file tree
Hide file tree
Showing 12 changed files with 181 additions and 39 deletions.
11 changes: 10 additions & 1 deletion heptfds/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
"""HEP datasets."""

from .delphes_pf import DelphesPf
from .cms_pf import CmsPfTtbar, CmsPfSinglePi, CmsPfSingleElectron, CmsPfSingleMu, CmsPfSingleTau, CmsPfSinglePi0, CmsPfSingleGamma
from .cms_pf import (
CmsPfTtbar,
CmsPfSinglePi,
CmsPfSingleElectron,
CmsPfSingleMu,
CmsPfSingleTau,
CmsPfSinglePi0,
CmsPfSingleGamma,
CmsPfZtt
)
1 change: 1 addition & 0 deletions heptfds/cms_pf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
from .singleele import CmsPfSingleElectron
from .singletau import CmsPfSingleTau
from .singlegamma import CmsPfSingleGamma
from .ztt import CmsPfZtt
61 changes: 61 additions & 0 deletions heptfds/cms_pf/qcd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""CMS PF QCD dataset."""

from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds

from heptfds import cms_utils

X_FEATURES = cms_utils.X_FEATURES
Y_FEATURES = cms_utils.Y_FEATURES

_DESCRIPTION = """
Dataset generated with CMSSW and full detector sim.
QCD events with PU~55 in a Run3 setup.
"""

# TODO(cms_pf): BibTeX citation
_CITATION = """
"""

PADDED_NUM_ELEM_SIZE = 12000

class CmsPfQcd(tfds.core.GeneratorBasedBuilder):
    """DatasetBuilder for the cms_pf QCD dataset.

    The input files are not downloaded automatically: they are read from the
    ``QCDForPF_13TeV_TuneCUETP8M1_cfi/raw`` directory below the tfds manual
    directory (see ``_split_generators``).
    """

    VERSION = tfds.core.Version("1.3.0")
    RELEASE_NOTES = {
        "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
    }
    # NOTE(review): this replaces a "FIXME" placeholder. The remote path is
    # inferred from the pattern used by the sibling builders
    # (.../mlpf/cms/<sample_dir>) -- confirm the sample exists at this location.
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
mkdir -p data
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/QCDForPF_13TeV_TuneCUETP8M1_cfi data/
"""

    def _info(self) -> tfds.core.DatasetInfo:
        """Returns the dataset metadata.

        Every example holds three float32 tensors ("X", "ygen", "ycand") whose
        first dimension is ``PADDED_NUM_ELEM_SIZE`` and whose second dimension
        is the length of the matching feature-name list.
        """
        # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(
                {
                    "X": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(X_FEATURES)), dtype=tf.float32),
                    "ygen": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32),
                    "ycand": tfds.features.Tensor(shape=(PADDED_NUM_ELEM_SIZE, len(Y_FEATURES)), dtype=tf.float32),
                }
            ),
            supervised_keys=("X", "ycand"),
            homepage="",
            citation=_CITATION,
            # Feature-name lists are stored as metadata next to the dataset.
            metadata=tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES),
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Returns SplitGenerators built from the manually downloaded sample."""
        path = dl_manager.manual_dir
        sample_dir = "QCDForPF_13TeV_TuneCUETP8M1_cfi"
        return cms_utils.split_sample(path / sample_dir / "raw", PADDED_NUM_ELEM_SIZE)

    def _generate_examples(self, files):
        """Delegates example generation for ``files`` to ``cms_utils``."""
        return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE)
3 changes: 2 additions & 1 deletion heptfds/cms_pf/singleele.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
class CmsPfSingleElectron(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singleele dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "Initial release.",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleElectronFlatPt1To100_pythia8_cfi data/
Expand Down
3 changes: 2 additions & 1 deletion heptfds/cms_pf/singlegamma.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
class CmsPfSingleGamma(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singlegamma dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.1.0": "Initial release",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleGammaFlatPt10To100_pythia8_cfi data/
Expand Down
3 changes: 2 additions & 1 deletion heptfds/cms_pf/singlemu.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
class CmsPfSingleMu(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singlemu dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "Add muon type, fix electron GSF association",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleMuFlatPt0p7To10_cfi data/
Expand Down
3 changes: 2 additions & 1 deletion heptfds/cms_pf/singlepi.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
class CmsPfSinglePi(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singlepi dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "Add muon type, fix electron GSF association",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePiFlatPt0p7To10_cfi data/
Expand Down
5 changes: 3 additions & 2 deletions heptfds/cms_pf/singlepi0.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
class CmsPfSinglePi0(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singlepi0 dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.1.0": "Initial release"
"1.1.0": "Initial release",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SinglePi0E10_pythia8_cfi data/
Expand Down
3 changes: 2 additions & 1 deletion heptfds/cms_pf/singletau.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
class CmsPfSingleTau(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf_singletau dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.2.0")
RELEASE_NOTES = {
"1.1.0": "Add muon type, fix electron GSF association",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/SingleTauFlatPt2To150_cfi data/
Expand Down
6 changes: 4 additions & 2 deletions heptfds/cms_pf/ttbar.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,17 @@
_CITATION = """
"""

PADDED_NUM_ELEM_SIZE = 6400
PADDED_NUM_ELEM_SIZE = 12000

class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
"""DatasetBuilder for cms_pf dataset."""

VERSION = tfds.core.Version("1.1.0")
VERSION = tfds.core.Version("1.3.0")
RELEASE_NOTES = {
"1.0.0": "Initial release.",
"1.1.0": "Add muon type, fix electron GSF association",
"1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
"1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
}
MANUAL_DOWNLOAD_INSTRUCTIONS = """
mkdir -p data
Expand Down
65 changes: 65 additions & 0 deletions heptfds/cms_pf/ztt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""CMS PF ZTT dataset."""

from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds

from heptfds import cms_utils

X_FEATURES = cms_utils.X_FEATURES
Y_FEATURES = cms_utils.Y_FEATURES

_DESCRIPTION = """
Dataset generated with CMSSW and full detector sim.
ZTT events with PU~55 in a Run3 setup.
"""

# TODO(cms_pf): BibTeX citation
_CITATION = """
"""

PADDED_NUM_ELEM_SIZE = 12000

class CmsPfZtt(tfds.core.GeneratorBasedBuilder):
    """Dataset builder for the CMS particle-flow ZTT sample.

    Input files are read from ``ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi/raw``
    below the tfds manual directory.
    """

    VERSION = tfds.core.Version("1.3.0")
    # NOTE(review): the entries before 1.3.0 match the ttbar builder's notes,
    # although this builder is introduced at 1.3.0 -- confirm they are intended.
    RELEASE_NOTES = {
        "1.0.0": "Initial release.",
        "1.1.0": "Add muon type, fix electron GSF association",
        "1.2.0": "12_1_0_pre3 generation, add corrected energy, cluster flags, 20k events",
        "1.3.0": "12_2_0_pre2 generation with updated caloparticle/trackingparticle",
    }
    MANUAL_DOWNLOAD_INSTRUCTIONS = """
mkdir -p data
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi data/
"""

    def _info(self) -> tfds.core.DatasetInfo:
        """Describe the dataset: feature tensors, supervised keys and metadata."""
        # TODO(cms_pf): Specifies the tfds.core.DatasetInfo object
        input_shape = (PADDED_NUM_ELEM_SIZE, len(X_FEATURES))
        target_shape = (PADDED_NUM_ELEM_SIZE, len(Y_FEATURES))
        feature_spec = {
            "X": tfds.features.Tensor(shape=input_shape, dtype=tf.float32),
            "ygen": tfds.features.Tensor(shape=target_shape, dtype=tf.float32),
            "ycand": tfds.features.Tensor(shape=target_shape, dtype=tf.float32),
        }
        # The feature-name lists travel with the dataset as metadata.
        feature_names = tfds.core.MetadataDict(x_features=X_FEATURES, y_features=Y_FEATURES)
        return tfds.core.DatasetInfo(
            builder=self,
            description=_DESCRIPTION,
            features=tfds.features.FeaturesDict(feature_spec),
            supervised_keys=("X", "ycand"),
            homepage="",
            citation=_CITATION,
            metadata=feature_names,
        )

    def _split_generators(self, dl_manager: tfds.download.DownloadManager):
        """Build the split generators from the manually downloaded sample."""
        raw_dir = dl_manager.manual_dir / "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi" / "raw"
        return cms_utils.split_sample(raw_dir, PADDED_NUM_ELEM_SIZE)

    def _generate_examples(self, files):
        """Hand ``files`` to the shared cms_utils example generator."""
        return cms_utils.generate_examples(files, PADDED_NUM_ELEM_SIZE)
56 changes: 27 additions & 29 deletions heptfds/cms_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,24 @@
ELEM_LABELS_CMS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
ELEM_NAMES_CMS = ["NONE", "TRACK", "PS1", "PS2", "ECAL", "HCAL", "GSF", "BREM", "HFEM", "HFHAD", "SC", "HO"]

CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13]
# Target class labels and their display names; the two lists are parallel,
# so they must have the same length (label 15 is the tau target).
CLASS_LABELS_CMS = [0, 211, 130, 1, 2, 22, 11, 13, 15]
CLASS_NAMES_CMS = ["none", "ch.had", "n.had", "HFEM", "HFHAD", "gamma", "ele", "mu", "tau"]

X_FEATURES = [
"typ_idx",
"pt",
"eta",
"phi",
"e",
"layer",
"depth",
"charge",
"trajpoint",
"eta_ecal",
"phi_ecal",
"eta_hcal",
"phi_hcal",
"muon_dt_hits",
"muon_csc_hits",
"muon_type",
"gsf_brem_sc_energy",
"num_hits"
"typ_idx", "pt", "eta", "phi", "e",
"layer", "depth", "charge", "trajpoint",
"eta_ecal", "phi_ecal", "eta_hcal", "phi_hcal", "muon_dt_hits", "muon_csc_hits", "muon_type",
"px", "py", "pz", "deltap", "sigmadeltap",
"gsf_electronseed_trkorecal",
"gsf_electronseed_dnn1",
"gsf_electronseed_dnn2",
"gsf_electronseed_dnn3",
"gsf_electronseed_dnn4",
"gsf_electronseed_dnn5",
"num_hits", "cluster_flags", "corr_energy",
"corr_energy_err", "vx", "vy", "vz", "pterror", "etaerror", "phierror", "lambd", "lambdaerror", "theta", "thetaerror"
]

Y_FEATURES = [
"typ_idx",
"charge",
Expand All @@ -59,11 +53,11 @@ def prepare_data_cms(fn, padded_num_elem_size):
ycand = event["ycand"]

# remove PS from inputs, they don't seem to be very useful
msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3)
#msk_ps = (Xelem["typ"] == 2) | (Xelem["typ"] == 3)

Xelem = Xelem[~msk_ps]
ygen = ygen[~msk_ps]
ycand = ycand[~msk_ps]
Xelem = Xelem
ygen = ygen
ycand = ycand

Xelem = append_fields(
Xelem, "typ_idx", np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32)
Expand Down Expand Up @@ -131,11 +125,15 @@ def prepare_data_cms(fn, padded_num_elem_size):
return X, ygen, ycand

def split_sample(path, pad_size, test_frac=0.8):
    """Split the ``*.pkl*`` files found in ``path`` into train and test sets.

    Args:
        path: directory object providing ``glob`` (e.g. a ``pathlib.Path``)
            that is searched for files matching ``*.pkl*``.
        pad_size: forwarded unchanged to ``generate_examples``.
        test_frac: fraction of the sorted file list that goes to the "train"
            split; the remaining files form the "test" split. The parameter
            name is kept as-is for backward compatibility.

    Returns:
        A dict with the keys "train" and "test", each holding the result of
        ``generate_examples`` for the corresponding files.

    Raises:
        ValueError: if no files are found, or if either split would be empty.
    """
    files = sorted(path.glob("*.pkl*"))
    # Report the number of files, not the full list of paths.
    print("Found {} files in {}".format(len(files), path))
    if not files:
        raise ValueError("No *.pkl* files found in {}".format(path))

    idx_split = int(test_frac * len(files))
    files_train = files[:idx_split]
    files_test = files[idx_split:]
    # Explicit checks instead of assert, so they still run under `python -O`.
    if not files_train:
        raise ValueError(
            "Train split is empty: {} files with test_frac={}".format(len(files), test_frac)
        )
    if not files_test:
        raise ValueError(
            "Test split is empty: {} files with test_frac={}".format(len(files), test_frac)
        )
    return {"train": generate_examples(files_train, pad_size), "test": generate_examples(files_test, pad_size)}

def generate_examples(files, pad_size):
"""Yields examples."""
Expand Down

0 comments on commit 6b63284

Please sign in to comment.